Example #1
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, `start`, that 
    allows you to start the job search at jobs 10-20, 20-30, etc. Use this to grab
    job results from multiple pages at once, passing the result from a page on to
    a thread to grab the details from each job posting. 
    
    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_start: int 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
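Every example on this page starts a RequestInfoThread per job row, joins it, and then reads its json_dct attribute. The thread class itself is not shown here; the sketch below is only a guess at its shape, meant to illustrate why that pattern works (the fields collected in run() are assumptions).

import threading

class RequestInfoThread(threading.Thread):
    """Hypothetical sketch only; the real implementation is not on this page."""

    def __init__(self, row, job_title, job_location):
        super().__init__()
        self.row = row
        self.job_title = job_title
        self.job_location = job_location
        self.json_dct = {}

    def run(self):
        # Parse whatever is available from the row's HTML and keep it on the
        # instance so the caller can collect it after join().
        self.json_dct = {'search_title': self.job_title,
                         'search_location': self.job_location,
                         'posting_txt': self.row.text}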
Example #2
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, `pn`, that
    allows you to start the job search at jobs 11-20, 21-30, etc. Use this to grab
    job results from multiple pages at once, and then feed the jobs from each page
    to threads for further parsing. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_number: int 
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
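The get_html helper used above is not shown on this page. A minimal sketch, assuming it wraps requests and BeautifulSoup (the callers run .select() on its return value):

import requests
from bs4 import BeautifulSoup

def get_html(url):
    """Fetch a page and return it parsed (assumed behavior, not the original code)."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')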
Example #3
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Glassdoor. 

    Grab everything that is possible or relevant for each of the 
    jobs posted on a given page. This will typically include the job title, 
    job location, posting company, date posted, and any stars assigned.
    Parse the relevant information, and then store it.

    Args: 
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title,
                'search_location': job_location,
                'search_date': current_date,
                'job_site': 'glassdoor'}

    jobs = driver.find_elements_by_class_name('jobListing')

    mongo_update_lst = [
        query_for_data(driver, json_dct, job, idx)
        for idx, job in enumerate(jobs[:-1])
    ]

    store_in_mongo(mongo_update_lst, 'job_postings', 'glassdoor')
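All of these examples funnel their results into store_in_mongo, whose implementation is not included here. A minimal sketch of the signature implied by the calls on this page, assuming pymongo and a local MongoDB instance:

from pymongo import MongoClient

def store_in_mongo(json_lst, db_name, coll_name, key=None):
    # Assumed behavior: bulk insert by default, or an upsert keyed on `key`
    # when the caller passes one (e.g. key='web_url' or key="Album Title").
    client = MongoClient()
    collection = client[db_name][coll_name]
    if key is None:
        if json_lst:
            collection.insert_many(json_lst)
    else:
        for dct in json_lst:
            collection.update_one({key: dct[key]}, {'$set': dct}, upsert=True)
    client.close()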
Example #4
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional parameter,   
    `page`, that allows you to start the job search at page 0-20 (20 is the max). 
    Use this to grab job results from multiple pages at once, and then pass jobs
    on to threads to grab relevant info. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_num: int
    """

    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
Example #5
    def __exit__(self, *args):
        """Ensure that any URLs scraped for get their text attributes updated."""

        store_in_mongo(self.articles_to_scrape,
                       self.db_name,
                       self.coll_name,
                       key='web_url')
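The class that owns this __exit__ hook is not visible in the fragment. The sketch below is hypothetical (the name ArticleScraper and its constructor are made up) and only fills in the pieces the hook relies on:

class ArticleScraper(object):
    """Hypothetical owner of the __exit__ shown above."""

    def __init__(self, db_name, coll_name):
        self.db_name = db_name
        self.coll_name = coll_name
        self.articles_to_scrape = []

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Same hook as above: upsert each scraped article on its 'web_url' key.
        store_in_mongo(self.articles_to_scrape,
                       self.db_name,
                       self.coll_name,
                       key='web_url')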
Example #6
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Monster.

    Grab everything that is possible (or relevant) for each of the jobs posted 
    for a given page. This will typically include the job title, job location,
    posting company, the date posted, and the posting text. 

    Args: 
    ----
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """

    titles, locations, companies, dates, hrefs = query_for_data(driver)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title,
                'search_location': job_location,
                'search_date': current_date,
                'job_site': 'monster'}

    thread_lst = []
    for href in hrefs:
        try:
            thread = HrefQueryThread(href.get_attribute('href'))
        except:
            print('Exception in href thread builder')
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread in \
            zip(titles, locations, companies, dates, thread_lst):
        try:
            mongo_dct = gen_output(json_dct.copy(), title, location, company,
                                   date, thread)
            mongo_update_lst.append(mongo_dct)
        except:
            print('Missed element in Monster!')

    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
Example #7
    while attribute.find('Other') == -1:
        values[attribute] = value
        points_misc_idx += 1
        # The value is always the last item present, surrounded by (), and the 
        # 1+ items before that are the attributes to which those points belong. 
        split_text = sum_points_misc_lst[points_misc_idx].split()
        attribute = ' '.join(split_text[:-1])
        value = split_text[-1].replace('(', '').replace(')', '')
    values[attribute] = value
    points_misc_idx += 1

    return values, points_misc_idx 
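# Not part of the original snippet: a worked illustration of the split logic in
# the loop above, using a made-up points entry of the form "Name (points)".
#     split_text = 'Album of the Year (25)'.split()
#     attribute = ' '.join(split_text[:-1])                     # 'Album of the Year'
#     value = split_text[-1].replace('(', '').replace(')', '')  # '25'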

if __name__ == '__main__':
    try: 
        year = sys.argv[1]
    except Exception as e: 
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', 
                     '.summaryPointsMisc']
    desired_contents = select_soup(soup, css_selectors)
    desired_contents_text = grab_contents_key(desired_contents, "text")
    desired_contents_renamed = rename_keys(desired_contents_text)
    final_lst = parse_contents(desired_contents_renamed)
    store_in_mongo(final_lst, 'music', 'music_lists')
Example #8
    content_txt = content.text
    score_idx = content_txt.find(score_str)
    score_str_len = len(score_str)
    beg_idx = score_idx + score_str_len
    end_idx = beg_idx + 2
    score = content_txt[beg_idx:end_idx]

    return score
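# Not part of the original snippet: an illustration of the slicing above with
# made-up values, assuming `score_str` is a label such as 'Score: ' and the
# score itself is two characters long.
#     content_txt = 'Critic Score: 87 based on 30 reviews'
#     score_str = 'Score: '
#     beg_idx = content_txt.find(score_str) + len(score_str)
#     content_txt[beg_idx:beg_idx + 2]    # -> '87'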


if __name__ == '__main__':
    try:
        year = sys.argv[1]
    except Exception as e:
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL)

    css_selectors = ['.albumTitle']
    album_titles_contents = select_soup(soup, css_selectors)
    album_titles_lst = list(
        grab_contents_key(album_titles_contents, 'text').values())
    album_titles = album_titles_lst[0]
    album_title_links = grab_contents_key(album_titles_contents, 'a')
    album_title_hrefs = grab_contents_key(album_title_links, 'href')

    final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles)
    store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
Example #9
        '&mi={}'.format(radius), '&fdb=5', '&clst=CTL'
    ]

    query_URL = format_query(base_URL, query_parameters)

    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('.result-headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'simplyhired',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'simplyhired')

    # All of the jobs should be available through the '.js-job-link' CSS class.
    max_pages = num_jobs // 10 + 1
    page_numbers = range(1, max_pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_numbers)
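The functools.partial / multiprocessing.Pool pattern used above, reduced to a self-contained sketch (the URL and page range here are made up):

from functools import partial
import multiprocessing

def scrape_page(base_url, page_number):
    # Stand-in for multiprocess_pages: every worker receives the same base URL
    # plus its own page number.
    return '{}&pn={}'.format(base_url, page_number)

if __name__ == '__main__':
    worker = partial(scrape_page, 'http://example.com/search?q=data+scientist')
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    print(pool.map(worker, range(1, 4)))
    pool.close()
    pool.join()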
Example #10
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it.
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0))

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'glassdoor',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search.
    time.sleep(random.randint(2, 6))
    try:
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    # Give it a little time before starting to click and parse
    time.sleep(random.randint(6, 12))

    is_next = True
    while is_next:
Example #11
    ----
        rating_txt: str
            Text that potentially holds the rating. 
        idx: int
            Holds the rating if the text does not. 

    Return: int
    """

    if len(rating_txt) >= 1:
        rating = int(rating_txt[0].replace('.', ''))
    else:
        rating = idx

    return rating


if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'

    soup = get_html(lists_url)
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    formatted_output = [{"Album Title": k, "Critics Scores": v}
                        for k, v in raw_output.items()]
    store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
Example #12
    base_URL = 'https://www.indeed.com/jobs?'
    query_parameters = ['q={}'.format('+'.join(job_title.split())),
            '&l={}'.format('+'.join(job_location.split())), 
            '&radius={}'.format(radius), '&sort=date', '&fromage=5']

    query_URL = format_query(base_URL, query_parameters)

    html = get_html(query_URL)
    try: 
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except: 
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'indeed', 'num_jobs': num_jobs, 
            'date': current_date, 'title': job_title, 'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'indeed')

    # Cycle through all of the job postings that we can, grabbing the URL for each
    # one so that it can be queried later. All of the jobs should be available via
    # the .turnstileLink class, whose href attribute points to the posting's URL.
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, start_positions)
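format_query is not shown on this page; given how the parameter lists above are built (only the first entry lacks a leading '&'), a plausible minimal sketch is plain concatenation:

def format_query(base_URL, query_parameters):
    # Assumed behavior: the parameters already carry their '&' separators,
    # so appending them to the base URL is enough.
    return base_URL + ''.join(query_parameters)

# format_query('https://www.indeed.com/jobs?', ['q=data+scientist', '&l=Denver'])
# -> 'https://www.indeed.com/jobs?q=data+scientist&l=Denver'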
Example #13
        raise Exception('Program needs a job title and a job location as input!')

    # Navigate to the base URL and issue the original search query.
    base_URL = 'http://www.careerbuilder.com/'
    query_params = (('keywords', job_title), ('location', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Grab num. jobs
    try:
        num_jobs_txt = driver.find_element_by_css_selector('div .count').text
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'careerbuilder',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'careerbuilder')

    is_next = True
    while is_next:
        jobs = scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
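check_if_next drives the pagination loop above but is not included on this page. A hypothetical sketch, using the older Selenium API seen in these examples and a made-up selector for the "next page" control:

from selenium.common.exceptions import WebDriverException

def check_if_next(driver):
    # Assumed behavior: click the "next page" control if it exists and report
    # whether there was one to click. The selector below is a guess.
    try:
        next_link = driver.find_element_by_css_selector('a.next')
        next_link.click()
        return True
    except WebDriverException:
        return False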
Example #14
    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'ziprecruiter',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')

    # Cycle through the pages of jobs to grab all of the info. that we want. Each
    # page holds 20 jobs, so the number of pages we'll cycle through will be
    # num_jobs / 20. The caveat, though, is that they only give 20 pages to look
    # through at maximum (hence the min below).
    pages = min(20, num_jobs // 20 + 1)
    page_positions = range(1, pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_positions)
Example #15
    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL, driver_path=driver_path)

    if verbose: print('<v> Successfully connected selenium')

    try:
        num_jobs = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs, 0))
        if verbose: print('<v> {} jobs found'.format(num_jobs))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    assert 0, 'halt'  # NOTE: this assert stops execution here; the code below never runs as written.

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'monster',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'monster')

    is_next = True
    while is_next:
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()