Example #1
import cPickle
import datetime
import sys

import pytz


def scrap(job_title, job_location, radius, result_nb):

    base_URL = 'http://www.indeed.fr/emplois?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&rq={}'.format(radius), '&sort=date', '&fromage=last'
    ]

    query_URL = format_query(base_URL, query_parameters)
    print(query_URL)

    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }

    # Cycle through all of the job postings that we can and grab the url pointing to
    # it, to then query it. All of the jobs should be available via the
    # .turnstileLink class, and then the href attribute will point to the URL.
    # Cap the start positions at 1000 results. Note that start_positions is
    # computed here but unused; the loop below paginates over result_nb.
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    db_path = "".join([
        "db_", job_title, "_", job_location, "_",
        str(radius), "_",
        str(result_nb)
    ])

    jobs = []
    for i in range(0, result_nb, 10):
        try:
            jobs.extend(
                multiprocess_pages(query_URL, job_title, job_location, i))
        except RuntimeError:
            pass  # TODO: retry this page?

    # Pickle the scraped jobs to disk; cPickle needs a binary-mode handle.
    with open(db_path + '.pkl', 'wb') as f:
        cPickle.dump(jobs, f)
    return jobs
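
This example leans on helpers defined elsewhere in the original repo and not shown here: format_query, get_html, parse_num, and multiprocess_pages (which presumably collects the .turnstileLink hrefs described in the comment above). Judging only from how format_query is called, with '&' already baked into every parameter except the first, it probably just concatenates its inputs; and the html.select() call suggests get_html returns a BeautifulSoup tree. Minimal sketches under those assumptions:

import requests
from bs4 import BeautifulSoup


def format_query(base_URL, query_parameters):
    # Hypothetical reconstruction: append each raw parameter string to the
    # base URL (the '&' separators are already in the list).
    return base_URL + ''.join(query_parameters)


def get_html(url):
    # Hypothetical reconstruction: fetch the page and hand back a parsed
    # BeautifulSoup tree, consistent with the .select() call above.
    return BeautifulSoup(requests.get(url).content, 'html.parser')

With illustrative inputs ('data scientist', 'Paris', radius 25), the query URL would come out as something like http://www.indeed.fr/emplois?q=data+scientist&l=Paris&rq=25&sort=date&fromage=last.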
Example #2
from selenium.common.exceptions import NoSuchElementException


def query_for_data(driver, json_dct, job, idx):
    """Grab all info. from the job posting
    
    This will include the job title, the job location, the 
    posting company, the date posted, and then any stars assigned. 
    After grabbing this information, click and get the job posting's
    actual text. 

    Args: 
        driver: Selenium webdriver
        json_dct: dict 
            Dictionary holding the current information that is being stored
            for that job posting. 
        job: Selenium WebElement
        idx: int
            Holds the # of the job posting the program is on (0 indexed here). 

    Returns: dict
    """

    posting_title = job.find_element_by_class_name('title').text
    split_posting_company = job.find_element_by_class_name(
        'companyInfo').text.split()
    posting_location = job.find_element_by_xpath(
        "//div//span[@itemprop='jobLocation']").text
    try:
        posting_date = job.find_element_by_class_name('minor').text
    except NoSuchElementException:  # posting has no date element
        posting_date = ''

    # If the posting has a star rating, it arrives at the front of the
    # posting company's text. Rather than build a separate function to find
    # the number of stars, store it, and strip it back out, it's simpler to
    # reuse parse_num to check whether the text starts with a number.
    if parse_num(' '.join(split_posting_company), 0):
        num_stars = split_posting_company[0]
        posting_company = ' '.join(split_posting_company[1:])
        out_json_dct = gen_output(json_dct.copy(), posting_title,
                                  posting_location, posting_date,
                                  posting_company, num_stars)
    else:
        posting_company = ' '.join(split_posting_company)
        out_json_dct = gen_output(json_dct.copy(), posting_title,
                                  posting_location, posting_date,
                                  posting_company)

    out_json_dct['posting_txt'] = grab_posting_txt(driver, job, idx)
    return out_json_dct
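
Several of these examples call parse_num(text, index), which isn't shown. From its call sites it appears to pull the index-th number out of a string (and to return something falsy when there is none, which is how the star-rating check above works). A minimal sketch under those assumptions:

import re


def parse_num(text, index):
    # Hypothetical reconstruction of the repo's helper: find the index-th
    # number in `text` (e.g. '1,000' or '3.9') and strip thousands commas.
    numbers = re.findall(r'\d+(?:[.,]\d+)*', text)
    if index >= len(numbers):
        return None
    return numbers[index].replace(',', '')

For example, parse_num('3.9 Acme Corp', 0) would return '3.9', while parse_num('Acme Corp', 0) would return None.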
Example #3
from selenium.common.exceptions import NoSuchElementException


def query_for_data(driver, json_dct, job, idx):
    """Grab all info. from the job posting
    
    This will include the job title, the job location, the 
    posting company, the date posted, and then any stars assigned. 
    After grabbing this information, click and get the job posting's
    actual text. 

    Args: 
        driver: Selenium webdriver
        json_dct: dict 
            Dictionary holding the current information that is being stored
            for that job posting. 
        job: Selenium WebElement
        idx: int
            Holds the # of the job posting the program is on (0 indexed here). 

    Returns: dict
    """

    posting_title = job.find_element_by_class_name('title').text
    split_posting_company = job.find_element_by_class_name(
            'companyInfo').text.split()
    posting_location = job.find_element_by_xpath(
            "//div//span[@itemprop='jobLocation']").text
    try: 
        posting_date = job.find_element_by_class_name('minor').text
    except NoSuchElementException:  # posting has no date element
        posting_date = ''

    # If the posting has a star rating, it arrives at the front of the
    # posting company's text. Rather than build a separate function to find
    # the number of stars, store it, and strip it back out, it's simpler to
    # reuse parse_num to check whether the text starts with a number.
    if parse_num(' '.join(split_posting_company), 0):
        num_stars = split_posting_company[0]
        posting_company = ' '.join(split_posting_company[1:])
        out_json_dct = gen_output(json_dct.copy(), posting_title, 
                posting_location, posting_date, posting_company, num_stars)
    else: 
        posting_company = ' '.join(split_posting_company)
        out_json_dct = gen_output(json_dct.copy(), posting_title, 
                posting_location, posting_date, posting_company)
    
    out_json_dct['posting_txt'] = grab_posting_txt(driver, job, idx)
    return out_json_dct
Example #4
    try:
        job_title = sys.argv[1]
        job_location = sys.argv[2]
        radius = sys.argv[3]
    except IndexError:
        raise Exception(
            'Program needs a job title, job location, and radius inputted!')

    base_URL = 'https://www.ziprecruiter.com/candidate/search?'
    query_parameters = [
        'search={}'.format('+'.join(job_title.split())),
        '&location={}'.format('+'.join(job_location.split())),
        '&radius={}'.format(radius), '&days=5', '&include_near_duplicates=1'
    ]

    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
        print(num_jobs)
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'ziprecruiter',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')
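
store_in_mongo is another helper from the original repo. Judging from the call above, it takes a list of documents plus what look like database and collection names (the order here is a guess); a minimal sketch assuming that signature and a locally running MongoDB instance:

from pymongo import MongoClient


def store_in_mongo(docs, db_name, collection_name):
    # Hypothetical reconstruction: insert the list of dicts into the named
    # database/collection on a local MongoDB server.
    client = MongoClient()
    client[db_name][collection_name].insert_many(docs)
    client.close()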
Example #5
    try:
        job_title = sys.argv[1]
        job_location = sys.argv[2]
        radius = sys.argv[3]
    except IndexError: 
        raise Exception('Program needs a job title, job location, and radius inputted!')

    base_URL = 'http://jobs.monster.com/search/?'
    query_parameters = [
        'q={}'.format('-'.join(job_title.split())),
        '&where={}'.format('-'.join(job_location.split())),
        '&sort=dt.rv.di',
        '&rad={}'.format(radius)
    ]

    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL)
    
    try: 
        num_jobs_txt = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'monster',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'monster')
    
    # Keep scraping pages and clicking the next button until there are no
    # more pages of results.
    is_next = True
    while is_next: 
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
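
scrape_job_page and check_if_next also live elsewhere in the repo. A plausible minimal sketch of check_if_next, assuming the next-page control can be found by its link text (the locator here is a guess):

from selenium.common.exceptions import NoSuchElementException


def check_if_next(driver):
    # Hypothetical reconstruction: click the next-page link if it exists and
    # tell the caller whether we advanced, so the while loop above can stop.
    try:
        next_link = driver.find_element_by_link_text('Next')  # guessed locator
    except NoSuchElementException:
        return False
    next_link.click()
    return True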
Example #6
    try:
        job_title = sys.argv[1]
        job_location = sys.argv[2]
    except IndexError:
        raise Exception('Program needs a job title and job location inputted!')

    # Issue the job query.
    base_URL = 'https://www.glassdoor.com/index.htm'
    query_params = (('KeywordSearch', job_title),
                    ('LocationSearch', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it.
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0))

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'glassdoor',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search.
    time.sleep(random.randint(2, 6))
    try:
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)
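
issue_driver_query is also defined elsewhere in the repo. For Glassdoor it appears to open the page, type each (field, value) pair into the search form, and submit; a minimal sketch under that assumption, treating the keys as element ids (a guess) and defaulting query_params so the single-argument call in Example #5 also works:

from selenium import webdriver


def issue_driver_query(url, query_params=None):
    # Hypothetical reconstruction: load the page, fill in the search form
    # fields, and submit the last one to launch the search.
    driver = webdriver.Firefox()
    driver.get(url)
    if query_params:
        for field_id, value in query_params:
            elem = driver.find_element_by_id(field_id)  # guessed locator
            elem.send_keys(value)
        elem.submit()
    return driver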
Example #7
    try: 
        job_title = sys.argv[1]
        job_location = sys.argv[2]
    except IndexError: 
        raise Exception('Program needs a job title and job location inputted!')
    
    # Issue the job query. 
    base_URL = 'https://www.glassdoor.com/index.htm'
    query_params = (('KeywordSearch', job_title), 
            ('LocationSearch', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it. 
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0)) 

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'glassdoor',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search. 
    time.sleep(random.randint(2, 6))
    try: 
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)