Example #1
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, 
    `start`, that allows you to start the job search at jobs 10-20, 
    20-30, etc. I can use this to grab job results from multiple pages at
    once. This function takes in the base_URL and then adds that
    start={page_start} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each
    job posting.

    Args: 
        base_URL: String that holds the base URL to add the page_start 
            parameter to. 
        job_title: String holding the job title used for the search.
        job_location: String holding the job location used for the search 
        page_start: Integer of what the `start` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
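
None of the examples show how `multiprocess_pages` is driven. Below is a minimal, hypothetical driver: it assumes `base_URL` already carries the query terms (the URL shown is illustrative, not from the source) and fans the scrape out over several `start` offsets with a `multiprocessing.Pool`, which the function name suggests is the intent.

from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    # Hypothetical query URL; the real one is built elsewhere in the project.
    base_URL = 'https://www.indeed.com/jobs?q=data+scientist&l=Denver'
    # Freeze everything except the page offset.
    scrape_page = partial(multiprocess_pages, base_URL,
                          'data scientist', 'Denver')
    # Indeed pages its results 10 at a time, so step the `start` offsets by 10.
    with Pool(processes=4) as pool:
        pool.map(scrape_page, range(0, 100, 10))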
Example #2
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional parameter,   
    `page`, that allows you to start the job search at pages 0-20 (20 is the max).
    Use this to grab job results from multiple pages at once, and then pass jobs
    on to threads to grab relevant info. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_num: int
    """

    url = base_URL + "&page=" + str(page_num)
    html = get_html(url)
    rows = html.select(".job_result")
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, "job_postings", "ziprecruiter")
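
Every example leans on a `RequestInfoThread` class that exposes a `json_dct` attribute once `join()` returns, but the class itself is never shown. The sketch below is a plausible reconstruction, assuming it subclasses `threading.Thread`; the fields it stores are illustrative, not the project's actual schema.

import threading

class RequestInfoThread(threading.Thread):
    """Parse one job posting in its own thread (a reconstruction, not the original)."""

    def __init__(self, row, job_title, job_location):
        super().__init__()
        self.row = row  # A BeautifulSoup tag for one posting.
        self.job_title = job_title
        self.job_location = job_location
        self.json_dct = {}

    def run(self):
        # The callers only require that `json_dct` be populated by the time
        # `join()` returns; the fields parsed here are assumptions.
        self.json_dct = {
            'search_title': self.job_title,
            'search_location': self.job_location,
            'posting_txt': self.row.text,
        }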
Example #3
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, `start`, that 
    allows you to start the job search at jobs 10-20, 20-30, etc. Use this to grab
    job results from multiple pages at once, passing the result from a page on to
    a thread to grab the details from each job posting. 
    
    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_start: int 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
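
`get_html` is also left undefined. Since every example calls `.select(...)` on its return value, it presumably hands back a parsed BeautifulSoup document; here is a minimal sketch under that assumption, using requests and bs4.

import requests
from bs4 import BeautifulSoup

def get_html(url):
    # Fetch the page and fail loudly on HTTP errors.
    response = requests.get(url)
    response.raise_for_status()
    # `.select()` in the examples implies a BeautifulSoup object.
    return BeautifulSoup(response.text, 'html.parser')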
Example #4
def multiprocess_pages(base_URL, job_title, job_location, page_number): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, `pn`, that
    allows you to start the job search at jobs 11-20, 21-30, etc. Use this to grab
    job results from multiple pages at once, and then feed the jobs from each page
    to threads for further parsing. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_number: int 
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each '.js-job' element corresponds to a job posting.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs: 
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
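
The last undefined helper is `store_in_mongo`. Judging by the call sites, the second argument looks like a database name and the third a collection name; the sketch below works under those assumptions against a local, unauthenticated mongod. The real helper may well upsert or deduplicate instead of inserting blindly.

from pymongo import MongoClient

def store_in_mongo(lst_of_dcts, db_name, collection_name):
    # Nothing to write if the page had no postings.
    if not lst_of_dcts:
        return
    client = MongoClient('localhost', 27017)
    try:
        # Assumes the args are (documents, database, collection) in that order.
        client[db_name][collection_name].insert_many(lst_of_dcts)
    finally:
        client.close()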
Example #5
def multiprocess_pages(base_URL, job_title, job_location, page_number): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, 
    `pn`, that allows you to start the job search at jobs 11-20, 
    21-30, etc. Use this to grab job results from multiple pages at
    once. 
    
    This function takes in the base_URL, adds the pn={page_number}
    parameter to the URL, and then queries it. It passes the results on
    to a thread to grab the details from each job posting.

    Args:
    ----
        base_URL: str 
            Holds the base URL to add the page_number parameter to.
        job_title: str 
            Holds the job title used for the search. 
        job_location: str 
            Holds the job location used for the search. 
        page_number: int 
            Holds what the `pn` parameter in the URL should be set to.
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each '.js-job' element corresponds to a job posting.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs: 
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    
    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
Example #6
def multiprocess_pages(query_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional parameter,   
    `page`, that allows you to start the job search at pages 0-20 (20 is the max).
    Use this to grab job results from multiple pages at once, and then pass jobs
    on to threads to grab relevant info. 

    Args: 
    ----
        query_URL: str
        job_title: str 
        job_location: str 
        page_num: int
    """

    url = query_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_content')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter_final')
Example #7
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Simply Hired URL used for job searching takes another parameter, `pn`, that
    allows you to start the job search at jobs 11-20, 21-30, etc. Use this to grab
    job results from multiple pages at once, and then feed the jobs from each page
    to threads for further parsing. 

    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_number: int 
    """

    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each '.js-job' element corresponds to a job posting.
    jobs = html.select('.js-job')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'simplyhired')
Example #8
def multiprocess_pages(base_URL, job_title, job_location, page_num): 
    """Grab the URLs and other relevant info. from job postings on the page. 

    The ZipRecruiter URL used for job searching takes an additional 
    parameter, `page`, that allows you to start the job search at pages
    0-20 (20 is the max). I can use this to grab job results from multiple
    pages at once. This function takes in the base_URL, adds the
    page={page_num} parameter to the URL, and then queries it.
    It passes the results on to a thread to grab the details from each 
    job posting. 

    Args: 
        base_URL: String that holds the base URL to add the page_num 
            parameter to. 
        job_title: String holding the job title used for the search.
        job_location: String holding the job location used for the search
        page_num: Integer of what the `page` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
Example #9
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, 
    `start`, that allows you to start the job search at jobs 10-20, 
    20-30, etc. I can use this to grab job results from multiple pages at
    once. This function takes in the base_URL and then adds that
    start={page_start} parameter to the URL, and then queries it. 
    It passes the results on to a thread to grab the details from each
    job posting.

    Args: 
        base_URL: String that holds the base URL to add the page_start 
            parameter to. 
        job_title: String holding the job title used for the search.
        job_location: String holding the job location used for the search 
        page_start: Integer of what the `start` parameter in the URL should
            be set to. 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
Example #10
def multiprocess_pages(base_URL, job_title, job_location, page_start): 
    """Grab the URLS and other relevant info. from job postings on the page. 

    The Indeed URL used for job searching takes another parameter, `start`, that 
    allows you to start the job search at jobs 10-20, 20-30, etc. Use this to grab
    job results from multiple pages at once, passing the result from a page on to
    a thread to grab the details from each job posting. 
    
    Args: 
    ----
        base_URL: str 
        job_title: str 
        job_location: str 
        page_start: int 
    """

    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each row corresponds to a job. 
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows: 
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads: 
        thread.join()
        mongo_update_lst.append(thread.json_dct)

    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')
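
All ten examples spawn one thread per posting, which is unbounded on a busy results page. As a design note, the same scrape can be written with a capped pool via `concurrent.futures`; the sketch below swaps the per-row threads for a `ThreadPoolExecutor`, with `parse_row` standing in, hypothetically, for the work `RequestInfoThread.run()` does.

from concurrent.futures import ThreadPoolExecutor

def multiprocess_pages_pooled(base_URL, job_title, job_location, page_start):
    html = get_html(base_URL + '&start=' + str(page_start))
    rows = html.select('.row')
    # Cap concurrency at 10 threads instead of one thread per row.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # `parse_row` is a hypothetical helper returning the dict that
        # RequestInfoThread would otherwise build in its run() method.
        mongo_update_lst = list(executor.map(
            lambda row: parse_row(row, job_title, job_location), rows))
    store_in_mongo(mongo_update_lst, 'job_postings', 'indeed')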