def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLs and other relevant info. from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. Use
    this to grab job results from multiple pages at once, passing the result
    from a page on to a thread to grab the details from each job posting.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_start: int
    """
    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
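# A minimal sketch of the RequestInfoThread interface that multiprocess_pages
# relies on here (and in the SimplyHired and ZipRecruiter scripts below): a
# threading.Thread subclass that parses one job row in run() and exposes the
# result as `json_dct`. The body of run() is illustrative only; the real
# class requests each posting and extracts many more fields.
import threading

class RequestInfoThread(threading.Thread):
    def __init__(self, row, job_title, job_location):
        super().__init__()
        self.row = row
        self.job_title = job_title
        self.job_location = job_location
        self.json_dct = {}

    def run(self):
        # Hypothetical parse: record the search context alongside whatever
        # text the row holds.
        self.json_dct = {'search_title': self.job_title,
                         'search_location': self.job_location,
                         'posting_txt': self.row.text}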
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLs and other relevant info. from job postings on the page.

    The Simply Hired URL used for job searching takes another parameter,
    `pn`, that allows you to start the job search at jobs 11-20, 21-30, etc.
    Use this to grab job results from multiple pages at once, and then feed
    the jobs from each page to threads for further parsing.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_number: int
    """
    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Glassdoor.

    Grab everything that is possible or relevant for each of the jobs posted
    on a given page. This will typically include the job title, job location,
    posting company, date posted, and the stars assigned (if any). Parse the
    relevant information, and then store it.

    Args:
    ----
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """
    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, 'search_location': job_location,
                'search_date': current_date, 'job_site': 'glassdoor'}

    jobs = driver.find_elements_by_class_name('jobListing')
    mongo_update_lst = [query_for_data(driver, json_dct, job, idx)
                        for idx, job in enumerate(jobs[:-1])]

    store_in_mongo(mongo_update_lst, 'job_postings', 'glassdoor')
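# A hedged sketch of the query_for_data helper assumed by the Glassdoor
# scraper above (note that the Monster script below uses a different helper
# of the same name). Assumption: it clicks into the idx-th listing, parses
# its detail pane, and returns a filled-in copy of json_dct. The field parse
# here is hypothetical, not Glassdoor's actual markup.
def query_for_data(driver, json_dct, job, idx):
    dct = json_dct.copy()
    job.click()
    dct['job_title'] = job.text.split('\n')[0]  # hypothetical field parse
    return dct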
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at a given
    page of results (20 pages is the max). Use this to grab job results from
    multiple pages at once, and then pass jobs on to threads to grab relevant
    info.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
def __exit__(self, *args):
    """Ensure that any URLs scraped get their text attributes updated."""
    store_in_mongo(self.articles_to_scrape, self.db_name, self.coll_name,
                   key='web_url')
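# Usage sketch for the context manager that __exit__ above belongs to. The
# class name and constructor arguments here are hypothetical; the point is
# that leaving the `with` block is what flushes `articles_to_scrape` to
# Mongo, upserting each article by its 'web_url' key.
with ArticleScraper(db_name='news', coll_name='articles') as scraper:
    scraper.articles_to_scrape.append({'web_url': 'http://example.com',
                                       'text': '...'})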
def scrape_job_page(driver, job_title, job_location):
    """Scrape a page of jobs from Monster.

    Grab everything that is possible (or relevant) for each of the jobs
    posted on a given page. This will typically include the job title, job
    location, posting company, the date posted, and the posting text.

    Args:
    ----
        driver: Selenium webdriver
        job_title: str
        job_location: str
    """
    titles, locations, companies, dates, hrefs = query_for_data(driver)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    json_dct = {'search_title': job_title, 'search_location': job_location,
                'search_date': current_date, 'job_site': 'monster'}

    thread_lst = []
    for href in hrefs:
        try:
            thread = HrefQueryThread(href.get_attribute('href'))
        except Exception:
            print('Exception in href thread builder')
            thread = HrefQueryThread('')
        thread_lst.append(thread)
        thread.start()
    mongo_update_lst = []
    for title, location, company, date, thread in zip(titles, locations,
                                                      companies, dates,
                                                      thread_lst):
        try:
            mongo_dct = gen_output(json_dct.copy(), title, location, company,
                                   date, thread)
            mongo_update_lst.append(mongo_dct)
        except Exception:
            print('Missed element in Monster!')

    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
    while attribute.find('Other') == -1:
        values[attribute] = value
        points_misc_idx += 1
        # The value is always the last item present, surrounded by (), and the
        # 1+ items before that are the attributes to which those points belong.
        split_text = sum_points_misc_lst[points_misc_idx].split()
        attribute = ' '.join(split_text[:-1])
        value = split_text[-1].replace('(', '').replace(')', '')
    values[attribute] = value
    points_misc_idx += 1

    return values, points_misc_idx

if __name__ == '__main__':
    try:
        year = sys.argv[1]
    except Exception as e:
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL)
    css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints',
                     '.summaryPointsMisc']
    desired_contents = select_soup(soup, css_selectors)
    desired_contents_text = grab_contents_key(desired_contents, "text")
    desired_contents_renamed = rename_keys(desired_contents_text)
    final_lst = parse_contents(desired_contents_renamed)
    store_in_mongo(final_lst, 'music', 'music_lists')
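# A worked example of the parsing inside the loop above (the input string is
# made up): each entry in sum_points_misc_lst ends with a parenthesized point
# value, and everything before it names the attribute those points belong to.
split_text = 'Rolling Stone (30)'.split()
attribute = ' '.join(split_text[:-1])                     # 'Rolling Stone'
value = split_text[-1].replace('(', '').replace(')', '')  # '30'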
    content_txt = content.text
    score_idx = content_txt.find(score_str)
    score_str_len = len(score_str)
    beg_idx = score_idx + score_str_len
    end_idx = beg_idx + 2
    score = content_txt[beg_idx:end_idx]

    return score

if __name__ == '__main__':
    try:
        year = sys.argv[1]
    except Exception as e:
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL)
    css_selectors = ['.albumTitle']
    album_titles_contents = select_soup(soup, css_selectors)
    album_titles_lst = list(
        grab_contents_key(album_titles_contents, 'text').values())
    album_titles = album_titles_lst[0]
    album_title_links = grab_contents_key(album_titles_contents, 'a')
    album_title_hrefs = grab_contents_key(album_title_links, 'href')

    final_json_lst = process_album_title_hrefs(album_title_hrefs,
                                               album_titles)
    store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
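# A worked example of the slicing at the top of this script (the sample text
# and score_str are made up): find the marker string, then take the two
# characters immediately after it.
content_txt = 'Critic Score 85 / 100'
score_str = 'Critic Score '
beg_idx = content_txt.find(score_str) + len(score_str)
print(content_txt[beg_idx:beg_idx + 2])  # '85'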
                        '&mi={}'.format(radius), '&fdb=5', '&clst=CTL']
    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('.result-headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'simplyhired', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'simplyhired')

    # All of the jobs should be available through the '.js-job-link' CSS
    # class.
    max_pages = num_jobs // 10 + 1
    page_numbers = range(1, max_pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_numbers)
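# A hedged sketch of the format_query helper used throughout these scripts
# (assumption: it simply concatenates the parameter fragments onto the base
# URL, since each fragment already carries its own '&' separator).
def format_query(base_URL, query_parameters):
    return base_URL + ''.join(query_parameters)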
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it.
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0))

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'glassdoor', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search.
    time.sleep(random.randint(2, 6))
    try:
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text
        num_pages = int(parse_num(num_pages_txt, 1))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    # Give it a little time before starting to click and parse.
    time.sleep(random.randint(6, 12))
    is_next = True
    while is_next:
    ----
    rating_txt: str
        Text that potentially holds the rating.
    idx: int
        Holds the rating if the text does not.

    Return: int
    """
    if len(rating_txt) >= 1:
        rating = int(rating_txt[0].replace('.', ''))
    else:
        rating = idx

    return rating

if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'

    soup = get_html(lists_url)
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    formatted_output = [{"Album Title": k, "Critics Scores": v}
                        for k, v in raw_output.items()]
    store_in_mongo(formatted_output, 'music', 'music_lists',
                   key="Album Title")
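# A hedged sketch of the grab_contents_key helper chained above and in the
# other albumoftheyear.org scripts (assumption: given a dict mapping
# selectors to tag lists, it pulls text, child anchors, or an attribute like
# 'href' off each tag). The real helper may differ in shape.
def grab_contents_key(contents_dct, key):
    grabbed = {}
    for selector, tags in contents_dct.items():
        if key == 'text':
            grabbed[selector] = [tag.text for tag in tags]
        elif key == 'a':
            grabbed[selector] = [tag.find('a') for tag in tags]
        else:
            grabbed[selector] = [tag.get(key) for tag in tags]
    return grabbed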
    base_URL = 'https://www.indeed.com/jobs?'
    query_parameters = ['q={}'.format('+'.join(job_title.split())),
                        '&l={}'.format('+'.join(job_location.split())),
                        '&radius={}'.format(radius), '&sort=date',
                        '&fromage=5']
    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'indeed', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'indeed')

    # Cycle through all of the job postings that we can, grabbing the URL
    # that points to each one so it can then be queried. All of the jobs
    # should be available via the .turnstileLink class, whose href attribute
    # points to the URL.
    max_start_position = 1000 if num_jobs >= 1000 else num_jobs
    start_positions = range(0, max_start_position, 10)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, start_positions)
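# A hedged sketch of the parse_num helper used in all of these scripts
# (assumption: it grabs the idx-th number in the text, dropping any commas,
# e.g. parse_num('Jobs 1 to 10 of 1,234', 2) -> '1234').
import re

def parse_num(text, idx):
    nums = re.findall(r'\d[\d,]*', text)
    return nums[idx].replace(',', '')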
        raise Exception('Program needs a job title and job location '
                        'inputted!')

    # Navigate to the base URL and issue the original search query.
    base_URL = 'http://www.careerbuilder.com/'
    query_params = (('keywords', job_title), ('location', job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Grab the number of jobs.
    try:
        num_jobs_txt = driver.find_element_by_css_selector('div .count').text
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'careerbuilder', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'careerbuilder')

    is_next = True
    while is_next:
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()
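# A hedged sketch of the check_if_next helper driving the loop above
# (assumption: click through to the next page of results if a "next" control
# exists, and report whether there was one). The selector is hypothetical;
# the old-style Selenium call matches the API used elsewhere in these
# scripts.
def check_if_next(driver):
    try:
        next_link = driver.find_element_by_css_selector('a.next')
        next_link.click()
        return True
    except Exception:
        return False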
    query_URL = format_query(base_URL, query_parameters)
    html = get_html(query_URL)

    try:
        num_jobs_txt = str(html.select('#job_results_headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 0))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'ziprecruiter', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'ziprecruiter')

    # Cycle through the pages of jobs to grab all of the info. that we want.
    # Each page holds 20 jobs, so the number of pages we'll cycle through
    # will be num_jobs / 20. The caveat, though, is that they only give 20
    # pages to look through at maximum (hence the min below).
    pages = min(20, num_jobs // 20 + 1)
    page_positions = range(1, pages + 1)
    execute_queries = partial(multiprocess_pages, query_URL, job_title,
                              job_location)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(execute_queries, page_positions)
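# The partial/pool.map pattern used above, in isolation: partial freezes the
# query context (URL, title, location) so that pool.map only has to vary the
# page number across worker processes. Names here are illustrative.
from functools import partial
import multiprocessing

def fetch(base, title, location, page):
    return '{}&page={}'.format(base, page)

if __name__ == '__main__':
    bound = partial(fetch, 'http://example.com/jobs?q=x', 'x', 'Denver')
    pool = multiprocessing.Pool(2)
    print(pool.map(bound, range(1, 4)))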
    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL, driver_path=driver_path)
    if verbose:
        print('<v> Successfully connected selenium')

    try:
        num_jobs = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs, 0))
        if verbose:
            print('<v> {} jobs found'.format(num_jobs))
    except Exception:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {'job_site': 'monster', 'num_jobs': num_jobs,
                   'date': current_date, 'title': job_title,
                   'location': job_location}
    store_in_mongo([storage_dct], 'job_numbers', 'monster')

    is_next = True
    while is_next:
        scrape_job_page(driver, job_title, job_location)
        is_next = check_if_next(driver)
    driver.close()