Exemplos de parse_num em Python, exemplos de utils.parsing_utilities.parse_num em Python

Exemplo n.º 1

0

Exibir arquivo

def query_for_data(driver, json_dct, job, idx):
    """Collect the summary fields and the full text of one job posting.

    Reads the title, company info, location and (when present) the posting
    date from the posting element. If the company text is detected as
    carrying a star rating, the rating is split off from the company name.
    Finally the posting's full text is fetched via ``grab_posting_txt`` and
    stored under the ``'posting_txt'`` key.

    Args:
        driver: Selenium webdriver
        json_dct: dict
            Dictionary holding the current information that is being stored
            for that job posting. A copy is handed to ``gen_output``; the
            caller's dictionary is not passed on directly.
        job: Selenium WebElement
            Element of the job posting to read from.
        idx: int
            Holds the # of the job posting the program is on (0 indexed
            here). Forwarded unchanged to ``grab_posting_txt``.

    Returns:
        The object returned by ``gen_output`` (presumably a dict -- confirm
        against ``gen_output``) with ``'posting_txt'`` added.
    """

    posting_title = job.find_element_by_class_name('title').text
    split_posting_company = job.find_element_by_class_name(
        'companyInfo').text.split()
    # The leading '.' keeps the search inside this posting's element. An
    # XPath starting with '//' is evaluated against the whole document, so
    # every posting would otherwise report the first location on the page.
    posting_location = job.find_element_by_xpath(
        ".//span[@itemprop='jobLocation']").text
    try:
        posting_date = job.find_element_by_class_name('minor').text
    except Exception:
        # Best effort: a posting without a date is stored with an empty one.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        posting_date = ''

    # If the company has a number of stars, it comes as the first token of
    # the company text. parse_num is reused to detect that case instead of
    # building a dedicated search-and-replace helper.
    company_txt = ' '.join(split_posting_company)
    if parse_num(company_txt, 0):
        posting_company = ' '.join(split_posting_company[1:])
        # Only pass the stars argument when there is one, so gen_output's
        # own default applies otherwise.
        extra_args = (split_posting_company[0],)
    else:
        posting_company = company_txt
        extra_args = ()

    out_json_dct = gen_output(json_dct.copy(), posting_title,
                              posting_location, posting_date,
                              posting_company, *extra_args)

    out_json_dct['posting_txt'] = grab_posting_txt(driver, job, idx)
    return out_json_dct

Exemplo n.º 2

0

Exibir arquivo

        raise Exception(
            'Program needs a job title, job location, and radius inputted!')

    base_URL = 'http://www.simplyhired.com/search?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&mi={}'.format(radius), '&fdb=5', '&clst=CTL'
    ]

    query_URL = format_query(base_URL, query_parameters)

    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('.result-headline')[0].text)
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'simplyhired',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'simplyhired')

    # All of the jobs should be available through the '.js-job-link' CSS class.

Exemplo n.º 3

0

Exibir arquivo

    try:
        job_title = sys.argv[1]
        job_location = sys.argv[2]
    except IndexError:
        raise Exception('Program needs a job title and job location inputted!')

    # Issue the job query.
    base_URL = 'https://www.glassdoor.com/index.htm'
    query_params = (('KeywordSearch', job_title), ('LocationSearch',
                                                   job_location))
    driver = issue_driver_query(base_URL, query_params)

    # Find the text holding the number of jobs, and parse it.
    time.sleep(random.randint(7, 15))
    num_jobs_txt = driver.find_elements_by_xpath('//header')[1].text
    num_jobs = int(parse_num(num_jobs_txt, 0))

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'glassdoor',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    store_in_mongo([storage_dct], 'job_numbers', 'glassdoor')

    # Find the text holding the number of pages in the job search.
    time.sleep(random.randint(2, 6))
    try:
        num_pages_txt = driver.find_element_by_id('ResultsFooter').text

Exemplo n.º 4

0

Exibir arquivo

Arquivo: job_scraper.py Projeto: xkortex/web-scrapers

    base_URL = 'http://jobs.monster.com/search/?'
    query_parameters = [
        'q={}'.format('-'.join(job_title.split())),
        '&where={}'.format('-'.join(job_location.split())), '&sort=dt.rv.di',
        '&rad={}'.format(radius)
    ]

    query_URL = format_query(base_URL, query_parameters)
    driver = issue_driver_query(query_URL, driver_path=driver_path)

    if verbose: print('<v> Successfully connected selenium')

    try:
        num_jobs = get_num_jobs_txt(driver)
        num_jobs = int(parse_num(num_jobs, 0))
        if verbose: print('<v> {} jobs found'.format(num_jobs))
    except:
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)

    assert 0, 'halt'

    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    storage_dct = {
        'job_site': 'monster',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }