def multiprocess_pages(query_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). Use this to grab job results from multiple pages at
    once, and then pass jobs on to threads to grab relevant info.

    Args:
    ----
        query_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    url = query_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_content')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter_final')
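# All of the `multiprocess_pages` variants in this file depend on a
# `RequestInfoThread` class that is not shown here. The sketch below is only
# an illustration of the pattern they assume: a threading.Thread subclass
# that parses one job row in `run()` and exposes the result as `json_dct`
# once the caller has called `join()`. The class name, field names, and CSS
# selectors below are hypothetical, not taken from the original code.
import threading


class RequestInfoThreadSketch(threading.Thread):
    """Parse one job-posting row in its own thread (illustrative only)."""

    def __init__(self, row, job_title, job_location):
        super().__init__()
        self.row = row
        self.job_title = job_title
        self.job_location = job_location
        self.json_dct = {}

    def run(self):
        # Pull whatever fields are available from the bs4 row; the `a` tag
        # lookup here is a placeholder, not the site's real markup.
        title_tag = self.row.select_one('a')
        self.json_dct = {
            'search_title': self.job_title,
            'search_location': self.job_location,
            'posting_title': title_tag.text.strip() if title_tag else None,
            'href': title_tag.get('href') if title_tag else None,
        }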
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLs and other relevant info from job postings on the page.

    The Simply Hired URL used for job searching takes another parameter,
    `pn`, that allows you to start the job search at jobs 11-20, 21-30,
    etc. Use this to grab job results from multiple pages at once, and then
    feed the jobs from each page to threads for further parsing.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_number: int
    """
    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). Use this to grab job results from multiple pages at
    once, and then pass jobs on to threads to grab relevant info.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    url = base_URL + "&page=" + str(page_num)
    html = get_html(url)
    rows = html.select(".job_result")
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, "job_postings", "ziprecruiter")
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). I can use this to grab job results from multiple pages
    at once. This function takes the base_URL, adds the page={page_num}
    parameter to it, and then queries the resulting URL. It passes the
    results on to a thread to grab the details from each job posting.

    Args:
        base_URL: String that holds the base URL to add the page_num
            parameter to.
        job_title: String holding the job title used for the search.
        job_location: String holding the job location used for the search.
        page_num: Integer of what the `page` parameter in the URL should be
            set to.
    """
    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
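# One way to drive the ZipRecruiter variant above across all of its pages
# (0-20, per the docstring) is to map it over the page numbers with a
# multiprocessing pool. This is only a sketch: the URL, search terms, and
# pool size are placeholders, and the real project may wire up its entry
# point differently.
from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    query_URL = 'https://www.ziprecruiter.com/candidate/search?search=data+scientist&location=Denver'  # placeholder
    execute_search = partial(multiprocess_pages, query_URL,
                             'data scientist', 'Denver')
    with Pool(processes=4) as pool:
        # Each worker process fetches and parses one results page.
        pool.map(execute_search, range(0, 21))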
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLs and other relevant info from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. Use
    this to grab job results from multiple pages at once, passing the
    result from a page on to a thread to grab the details from each job
    posting.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_start: int
    """
    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLs and other relevant info from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. I
    can use this to grab job results from multiple pages at once. This
    function takes the base_URL, adds the start={page_start} parameter to
    it, and then queries the resulting URL. It passes the results on to a
    thread to grab the details from each job posting.

    Args:
        base_URL: String that holds the base URL to add the page_start
            parameter to.
        job_title: String holding the job title used for the search.
        job_location: String holding the job location used for the search.
        page_start: Integer of what the `start` parameter in the URL should
            be set to.
    """
    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
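# Unlike the ZipRecruiter variant, the Indeed variant pages by job offset:
# `start` advances in steps of 10 (0, 10, 20, ...). A hedged sketch of
# driving it in parallel follows; the base URL, search terms, and number of
# pages are placeholders, not values from the original project.
from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    base_URL = 'https://www.indeed.com/jobs?q=data+scientist&l=Denver'  # placeholder
    execute_search = partial(multiprocess_pages, base_URL,
                             'data scientist', 'Denver')
    with Pool(processes=4) as pool:
        # Grab the first 10 pages' worth of postings (offsets 0-90).
        pool.map(execute_search, range(0, 100, 10))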
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLs and other relevant info from job postings on the page.

    The Simply Hired URL used for job searching takes another parameter,
    `pn`, that allows you to start the job search at jobs 11-20, 21-30,
    etc. Use this to grab job results from multiple pages at once. This
    function takes the base_URL, adds the pn={page_number} parameter to it,
    and then queries the resulting URL. It passes the results on to a
    thread to grab the details from each job posting.

    Args:
    ----
        base_URL: str
            Holds the base URL to add the page_number parameter to.
        job_title: str
            Holds the job title used for the search.
        job_location: str
            Holds the job location used for the search.
        page_number: int
            Holds what the `pn` parameter in the URL should be set to.
    """
    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each row corresponds to a job.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')