Example #1
def scrap(job_title, job_location, radius, result_nb):

    base_URL = 'http://www.indeed.fr/emplois?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&rq={}'.format(radius), '&sort=date', '&fromage=last'
    ]

    query_URL = format_query(base_URL, query_parameters)
    print(query_URL)

    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }

    # Cycle through as many of the job postings as we can and grab the URL
    # pointing to each one so it can be queried in turn. Every posting should be
    # reachable via the .turnstileLink class, whose href attribute points to the
    # job's URL (a sketch of this extraction follows this example).
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    db_path = "".join([
        "db_", job_title, "_", job_location, "_",
        str(radius), "_",
        str(result_nb)
    ])

    jobs = []
    for i in range(0, result_nb, 10):
        try:
            jobs.extend(
                multiprocess_pages(query_URL, job_title, job_location, i))
        except RuntimeError:
            # Skip a failed batch of pages; a retry could be attempted here.
            pass

    # Persist the scraped postings with pickle (binary mode for portability).
    with open("".join([db_path, ".pkl"]), 'wb') as f:
        cPickle.dump(jobs, f)
    return jobs
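The per-page extraction described in the comment above is delegated to multiprocess_pages, which is not shown on this page. Below is a minimal sketch of what that step might look like, assuming requests and BeautifulSoup; the helper name scrape_page_urls and the exact markup handling are guesses, not the repository's actual code.

import requests
from bs4 import BeautifulSoup

def scrape_page_urls(query_URL, start):
    """Return the posting URLs linked from one page of Indeed results."""
    # Indeed pages its results with a &start=N offset, ten postings per page.
    response = requests.get('{}&start={}'.format(query_URL, start))
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each posting is reachable through an <a class="turnstileLink"> element
    # whose href attribute holds a relative link to the job description.
    return ['http://www.indeed.fr' + link.get('href')
            for link in soup.select('a.turnstileLink') if link.get('href')]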
Example #2
    try:
        job_title = sys.argv[1]
        job_location = sys.argv[2]
        radius = sys.argv[3]
    except IndexError:
        raise Exception(
            'Program needs a job title, job location, and radius inputted!')

    base_URL = 'https://www.ziprecruiter.com/candidate/search?'
    query_parameters = [
        'search={}'.format('+'.join(job_title.split())),
        '&location={}'.format('+'.join(job_location.split())),
        '&radius={}'.format(radius), '&days=5', '&include_near_duplicates=1'
    ]

    query_URL = format_query(base_URL, query_parameters)
    #print (query_URL)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
        print(num_jobs)
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'ziprecruiter',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
Example #3
    # I expect that at the very least a job title, job location, and radius
    # will be passed in, so I'll attempt to grab all three of those within
    # a try/except and raise an error otherwise.
    try: 
        job_title = sys.argv[1]
        job_location = sys.argv[2]
        radius = sys.argv[3]
    except IndexError: 
        raise Exception('Program needs a job title, job location, and radius inputted!')

    base_URL = 'http://jobs.monster.com/search/?'
    query_parameters = ['q={}'.format('-'.join(job_title.split())), 
            '&where={}'.format('-'.join(job_location.split())), '&sort=dt.rv.di', 
            '&rad={}'.format(radius)]

    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL)
    
    try: 
        num_jobs_txt = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except: 
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'monster', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'monster')
    
    # This loop will be used to keep clicking the next button after
    # each page of results has been scraped.
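The example cuts off before the paging loop itself, but the comment suggests the Selenium driver returned by issue_driver_query keeps clicking a "next" control to load further result pages. Here is a minimal sketch of such a loop, assuming an older Selenium API; the function name click_through_pages, the CSS selector, and the fixed page budget are assumptions rather than the repository's code.

import time
from selenium.common.exceptions import NoSuchElementException

def click_through_pages(driver, max_pages=20):
    """Advance through result pages by repeatedly clicking the 'next' link."""
    for _ in range(max_pages):
        try:
            # The selector is a guess; Monster's markup changes over time.
            next_button = driver.find_element_by_css_selector('a[title="Next"]')
        except NoSuchElementException:
            break  # No further pages to load.
        next_button.click()
        time.sleep(2)  # Give the next page a moment to render.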