def multiprocess_pages(query_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). Use this to grab job results from multiple pages at
    once, and then pass jobs on to threads to grab relevant info.

    Args:
    ----
        query_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    url = query_URL + '&page=' + str(page_num)
    html = get_html(url)
    # Each '.job_content' element corresponds to one job posting.
    rows = html.select('.job_content')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter_final')
def get_critic_lst_content(critics_hrefs_values, critic_lst_idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values,
    grab all of the items with the class '.listLargeTitle'. This will then
    be used to cycle through each one of them and grab information from
    them.

    Args:
    ----
        critics_hrefs_values: list of strings
        critic_lst_idx: int

    Return:
    ------
        critic_lst_content_vals: list
        soup: bs4.BeautifulSoup object
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']
    critic_url = base_individual_list_url + critics_hrefs_values[critic_lst_idx]
    # Reuse the URL built above instead of re-concatenating the same pieces.
    soup = get_html(critic_url)
    critic_content_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = critic_content_lst[0]
    # We reverse them because they are posted from the highest ranked (worst
    # album) to the lowest rank (i.e. Post-1 is the highest ranked album on
    # the critic list).
    critic_lst_content_vals.reverse()
    return critic_lst_content_vals, soup
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). I can use this to grab job results from multiple pages
    at once. This function here takes in the base_URL, and then adds that
    page={page_num} parameter to the URL, and then queries it. It passes
    the results on to a thread to grab the details from each job posting.

    Args:
        base_URL: String that holds the base URL to add the page_num
            parameter to.
        job_title: String holding the job title used for the search
        job_location: String holding the job location used for the search
        page_num: Integer of what the `page` parameter in the URL should be
            set to.
    """
    # Bug fix: the body referenced the undefined name `query_URL`; the
    # parameter is `base_URL`.
    url = base_URL + '&page=' + str(page_num)
    html = get_html(url)
    # Each '.job_result' element corresponds to one job posting.
    rows = html.select('.job_result')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'ziprecruiter')
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page.

    The Simply Hired URL used for job searching takes another parameter,
    `pn`, that allows you to start the job search at jobs 11-20, 21-30,
    etc. Use this to grab job results from multiple pages at once, and
    then feed the jobs from each page to threads for further parsing.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_number: int
    """
    page_url = '{}&pn={}'.format(base_URL, page_number)
    soup = get_html(page_url)
    # Each '.js-job' element corresponds to a single job posting.
    workers = []
    for job in soup.select('.js-job'):
        worker = RequestInfoThread(job, job_title, job_location)
        worker.start()
        workers.append(worker)
    results = []
    for worker in workers:
        worker.join()
        results.append(worker.json_dct)
    store_in_mongo(results, 'job_postings', 'simplyhired')
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. This
    function adds start={page_start} to the base_URL, queries the resulting
    page, and hands every job row off to a thread that grabs the details
    from the individual posting.

    Args:
        base_URL: String that holds the base URL to add the page_start
            parameter to.
        job_title: String holding the job title used for the search
        job_location: String holding the job location used for the search
        page_start: Integer of what the `start` parameter in the URL should
            be set to.
    """
    search_url = '{}&start={}'.format(base_URL, page_start)
    page_html = get_html(search_url)
    # Each '.row' element corresponds to a job.
    scraper_threads = []
    for job_row in page_html.select('.row'):
        t = RequestInfoThread(job_row, job_title, job_location)
        t.start()
        scraper_threads.append(t)
    postings = []
    for t in scraper_threads:
        t.join()
        postings.append(t.json_dct)
    store_in_mongo(postings, 'job_postings', 'indeed')
def get_critic_lst_content(critics_hrefs_values, idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values,
    grab all of the items with the class '.listLargeTitle'. This will then
    be used to cycle through each one of them and grab information from
    them.

    Args:
    ----
        critics_hrefs_values: list of strings
            Holds the href attribute for each critic list, and is used to
            issue a get request against that href.
        idx: int
            Holds the index of the current critics list that is being
            looked at.

    Return: list, bs4.BeautifulSoup object
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']
    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    # Reuse the URL built above instead of re-concatenating the same pieces.
    soup = get_html(critic_url)
    # dict.values() is a non-indexable view on Python 3; wrap in list() so
    # the [0] lookup works on both Python 2 and 3.
    critic_lst_content_vals = list(select_soup(soup, css_selectors).values())[0]
    # We reverse them because they are posted from the highest ranked
    # (worst album) to the lowest rank (i.e. Post-1 is the highest ranked
    # album on the critic list).
    critic_lst_content_vals.reverse()
    return critic_lst_content_vals, soup
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page.

    Query the search results page selected by the `pn` URL parameter, hand
    every job element found to a thread for detailed parsing, and store the
    collected results in Mongo.

    NOTE(review): the results are stored in the 'monster' collection, but
    the `pn` parameter and '.js*'-style selectors mirror the Simply Hired
    scraper — confirm which site this actually targets.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_number: int
    """
    url = base_URL + '&pn=' + str(page_number)
    html = get_html(url)
    # Each '.jobs' element corresponds to a job posting.
    jobs = html.select('.jobs')
    threads = []
    mongo_update_lst = []
    for job in jobs:
        thread = RequestInfoThread(job, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, 'job_postings', 'monster')
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. Use
    this to grab job results from multiple pages at once, passing the
    result from a page on to a thread to grab the details from each job
    posting.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_start: int

    Return: list of dicts, one per job posting
    """
    url = base_URL + '&start=' + str(page_start)
    html = get_html(url)
    # Each '.row' element corresponds to a job.
    rows = html.select('.row')
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    # NOTE(review): unlike the sibling scrapers this variant returns the
    # results to the caller instead of writing them to Mongo.
    return mongo_update_lst
def multiprocess_pages(base_URL, job_title, job_location, page_num):
    """Grab the URLs and other relevant info. from job postings on the page.

    The ZipRecruiter URL used for job searching takes an additional
    parameter, `page`, that allows you to start the job search at page 0-20
    (20 is the max). Use this to grab job results from multiple pages at
    once, and then pass jobs on to threads to grab relevant info.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_num: int
    """
    # Bug fix: the body referenced the undefined name `query_URL`; the
    # parameter is `base_URL`.
    url = base_URL + "&page=" + str(page_num)
    html = get_html(url)
    # Each '.job_result' element corresponds to one job posting.
    rows = html.select(".job_result")
    threads = []
    mongo_update_lst = []
    for row in rows:
        thread = RequestInfoThread(row, job_title, job_location)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
        mongo_update_lst.append(thread.json_dct)
    store_in_mongo(mongo_update_lst, "job_postings", "ziprecruiter")
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Grab the critic and user scores for each inputted href.

    For every href in the first value of `album_title_hrefs`, issue a get
    request against its URL and parse out the User and Critic scores for
    that album. Pair each pair of scores with its album title in a
    dictionary and collect one dictionary per href.

    Args:
    ----
        album_title_hrefs: list of strings
        album_titles: list of strings

    Return:
    ------
        results: list
    """
    base_url = 'http://www.albumoftheyear.org'
    hrefs = list(album_title_hrefs.values())[0]
    results = []
    for pos, href in enumerate(hrefs):
        page = get_html(base_url + href)
        content = list(select_soup(page, '#centerContent').values())[0][0]
        score_dct = {'Album Title': album_titles[pos],
                     "User Score": int(find_score(content, 'USER SCORE')),
                     "Critic Score": int(find_score(content, 'CRITIC SCORE'))}
        results.append(score_dct)
    return results
def multiprocess_pages(base_URL, job_title, job_location, page_start):
    """Grab the URLS and other relevant info. from job postings on the page.

    The Indeed URL used for job searching takes another parameter, `start`,
    that allows you to start the job search at jobs 10-20, 20-30, etc. Use
    this to grab job results from multiple pages at once, passing the
    result from a page on to a thread to grab the details from each job
    posting.

    Args:
    ----
        base_URL: str
        job_title: str
        job_location: str
        page_start: int
    """
    target_url = '%s&start=%s' % (base_URL, page_start)
    parsed_page = get_html(target_url)
    # Each '.row' element corresponds to a job.
    spawned = []
    for job_row in parsed_page.select('.row'):
        info_thread = RequestInfoThread(job_row, job_title, job_location)
        info_thread.start()
        spawned.append(info_thread)
    collected = []
    for info_thread in spawned:
        info_thread.join()
        collected.append(info_thread.json_dct)
    store_in_mongo(collected, 'job_postings', 'indeed')
def scrap(job_title, job_location, radius, result_nb):
    """Scrape up to `result_nb` Indeed.fr postings and pickle them to disk.

    Builds the search URL from the inputs, reads the total job count off the
    results page, then walks the result pages 10 jobs at a time, collecting
    parsed postings via `multiprocess_pages`.

    Args:
    ----
        job_title: str
        job_location: str
        radius: search radius passed through to the `rq` URL parameter
        result_nb: int, number of results to attempt to scrape

    Return: list of job postings
    """
    base_URL = 'http://www.indeed.fr/emplois?'
    query_parameters = [
        'q={}'.format('+'.join(job_title.split())),
        '&l={}'.format('+'.join(job_location.split())),
        '&rq={}'.format(radius),
        '&sort=date',
        '&fromage=last'
    ]
    query_URL = format_query(base_URL, query_parameters)
    print(query_URL)
    html = get_html(query_URL)
    try:
        num_jobs_txt = str(html.select('#searchCount'))
        num_jobs = int(parse_num(num_jobs_txt, 2))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        print('No jobs for search {} in {}'.format(job_title, job_location))
        sys.exit(0)
    current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain')))
    # NOTE(review): storage_dct is built but never persisted or used here —
    # dead unless a later revision stores it.
    storage_dct = {
        'job_site': 'indeed',
        'num_jobs': num_jobs,
        'date': current_date,
        'title': job_title,
        'location': job_location
    }
    db_path = "".join([
        "db_", job_title, "_", job_location, "_",
        str(radius), "_", str(result_nb)
    ])
    jobs = []
    for i in range(0, result_nb, 10):
        try:
            jobs.extend(
                multiprocess_pages(query_URL, job_title, job_location, i))
        except RuntimeError:
            pass  # TODO(review): consider retrying the page instead of dropping it
    # Pickle files must be opened in binary mode ('wb', not 'w'); the
    # context manager closes the file, so no explicit close() is needed.
    with open("".join([db_path, ".pkl"]), 'wb') as f:
        cPickle.dump(jobs, f)
    return jobs
def scrape_pages(self):
    """Scrape all pages stored in `self.web_urls`."""
    base = 'http://www.nytimes.com'
    for article in self.articles_to_scrape:
        web_url = article['web_url']
        # Relative hrefs need the NYT host prepended.
        full_url = base + web_url if web_url.startswith('/') else web_url
        sleep(1 / 20)  # throttle: stay under ~20 requests per second
        page_soup = get_html(full_url)
        parsed_txt = self._parse_soup(page_soup)
        if parsed_txt:
            article['text'] = parsed_txt
def scrape_pages(self):
    """Scrape all pages stored in `self.web_urls`."""
    for entry in self.articles_to_scrape:
        link = entry['web_url']
        if link.startswith('/'):
            # Relative href — prepend the NYT host.
            link = 'http://www.nytimes.com' + link
        sleep(1/20)  # brief pause between requests
        page = get_html(link)
        body_text = self._parse_soup(page)
        if body_text:
            entry['text'] = body_text
def _query_href(self): """Grab the text from the href. Returns: str of visible text from the href. """ try: soup = get_html(self.href) texts = soup.findAll(text=True) visible_texts = filter(find_visible_texts, texts) except Exception as e: print(e) visible_texts = ['SSLError', 'happened'] return ' '.join(visible_texts)
def _query_href(self): """Grab the text from the href. Now we want to actually follow the href that is given in the job posting, and grab the posting text from there. Args: href: String of the href to the job posting. """ try: soup = get_html(self.href) texts = soup.findAll(text=True) visible_texts = filter(find_visible_texts, texts) except Exception as e: print e visible_texts = ['SSLError', 'happened'] return ' '.join(visible_texts)
def multiprocess_pages(base_URL, job_title, job_location, page_number):
    """Grab the URLS and other relevant info. from job postings on the page.

    The Simply Hired URL used for job searching takes another parameter,
    `pn`, that allows you to start the job search at jobs 11-20, 21-30,
    etc. This function appends pn={page_number} to the base_URL, queries
    the resulting page, and hands each job found to a thread that grabs
    the posting's details before everything is stored in Mongo.

    Args:
    ----
        base_URL: str
            Holds the base URL to add the page_start parameter to.
        job_title: str
            Holds the job title used for the search.
        job_location: str
            Holds the job location used for the search.
        page_number: int
            Holds what the `start` parameter in the URL should be set to.
    """
    results_url = '{}&pn={}'.format(base_URL, page_number)
    results_html = get_html(results_url)
    # Each '.js-job' element corresponds to a job.
    active_threads = []
    for posting in results_html.select('.js-job'):
        worker = RequestInfoThread(posting, job_title, job_location)
        worker.start()
        active_threads.append(worker)
    parsed_postings = []
    for worker in active_threads:
        worker.join()
        parsed_postings.append(worker.json_dct)
    store_in_mongo(parsed_postings, 'job_postings', 'simplyhired')
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Grab the overall critic and user scores for each inputted href.

    For each of the inputted hrefs, go to the href and grab the overall
    critic and user scores, pairing them with the matching album title.

    Args:
    ----
        album_title_hrefs: dict whose first value holds the album hrefs
        album_titles: list of strings

    Return: list of dictionaries
    """
    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []
    # dict.values() is a non-indexable view on Python 3; wrap in list() so
    # the [0] lookup works on both Python 2 and 3.
    for idx, href in enumerate(list(album_title_hrefs.values())[0]):
        soup = get_html(base_url + href)
        center_content = list(select_soup(soup, '#centerContent').values())[0][0]
        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))
        json_dct = {'Album Title': album_titles[idx], "User Score": user_score,
                    "Critic Score": critic_score}
        final_json_lst.append(json_dct)
    return final_json_lst
def get_critic_lst_content(critics_hrefs_values, idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values,
    grab all of the items with the class '.listLargeTitle'. This will then
    be used to cycle through each one of them and grab information from
    them.

    Args:
    ----
        critics_hrefs_values: list of strings
        idx: int

    Return: list, bs4.BeautifulSoup object
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']
    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    # Reuse the URL built above instead of re-concatenating the same pieces.
    soup = get_html(critic_url)
    # dict.values() is a non-indexable view on Python 3; wrap in list() so
    # the [0] lookup works on both Python 2 and 3.
    critic_lst_content_vals = list(select_soup(soup, css_selectors).values())[0]
    # We reverse them because they are posted from the highest ranked
    # (worst album) to the lowest rank (i.e. Post-1 is the highest ranked
    # album on the critic list).
    critic_lst_content_vals.reverse()
    return critic_lst_content_vals, soup
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Grab the critic and user scores for each inputted href.

    Loop over the hrefs in `album_title_hrefs`, issue a get request on the
    URL associated with that href, and then parse the content to grab the
    User and Critic scores for that album. Store the User and Critic
    scores in a dictionary along with the Album title, and then append
    that to a list to output for easy storage.

    Args:
    ----
        album_title_hrefs: list of strings
            Holds the hrefs of each album title to issue a get request on.
        album_titles: list of strings
            Holds the album titles to store with the User and Critic
            scores that we're grabbing. This will allow identification of
            a User/Critic score with a particular album.

    Return: list of dictionaries
    """
    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []
    # dict.values() is a non-indexable view on Python 3; wrap in list() so
    # the [0] lookups work on both Python 2 and 3.
    for idx, href in enumerate(list(album_title_hrefs.values())[0]):
        soup = get_html(base_url + href)
        center_content = list(select_soup(soup, '#centerContent').values())[0][0]
        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))
        json_dct = {'Album Title': album_titles[idx], "User Score": user_score,
                    "Critic Score": critic_score}
        final_json_lst.append(json_dct)
    return final_json_lst
rating_txt: str Text that potentially holds the rating. idx: int Holds the rating if the text does not. Return: int """ if len(rating_txt) >= 1: rating = int(rating_txt[0].replace('.', '')) else: rating = idx return rating if __name__ == '__main__': lists_url = 'http://www.albumoftheyear.org/lists.php' soup = get_html(lists_url) critics_content = select_soup(soup, '.criticListBlockTitle') critics_names = grab_contents_key(critics_content, "text") critics_links = grab_contents_key(critics_content, 'a') critics_hrefs = grab_contents_key(critics_links, 'href') raw_output = grab_critics_info(critics_names, critics_hrefs) formatted_output = [{"Album Title": k, "Critics Scores": v} for \ k, v in raw_output.items()] store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
job_location = sys.argv[2] radius = sys.argv[3] except IndexError: raise Exception( 'Program needs a job title, job location, and radius inputted!') base_URL = 'https://www.ziprecruiter.com/candidate/search?' query_parameters = [ 'search={}'.format('+'.join(job_title.split())), '&location={}'.format('+'.join(job_location.split())), '&radius={}'.format(radius), '&days=5', '&include_near_duplicates=1' ] query_URL = format_query(base_URL, query_parameters) #print (query_URL) html = get_html(query_URL) try: num_jobs_txt = str(html.select('#job_results_headline')[0].text) num_jobs = int(parse_num(num_jobs_txt, 0)) print(num_jobs) except: print('No jobs for search {} in {}'.format(job_title, job_location)) sys.exit(0) current_date = str(datetime.datetime.now(pytz.timezone('US/Mountain'))) storage_dct = { 'job_site': 'ziprecruiter', 'num_jobs': num_jobs, 'date': current_date, 'title': job_title,
job_location = sys.argv[2] radius = sys.argv[3] except IndexError: raise Exception("Program needs a job title, job location, and radius inputted!") base_URL = "https://www.ziprecruiter.com/candidate/search?" query_parameters = [ "search={}".format("+".join(job_title.split())), "&location={}".format("+".join(job_location.split())), "&radius={}".format(radius), "&days=5", "&include_near_duplicates=1", ] query_URL = format_query(base_URL, query_parameters) html = get_html(query_URL) try: num_jobs_txt = str(html.select("#job_results_headline")[0].text) num_jobs = int(parse_num(num_jobs_txt, 0)) except: print("No jobs for search {} in {}".format(job_title, job_location)) sys.exit(0) current_date = str(datetime.datetime.now(pytz.timezone("US/Mountain"))) storage_dct = { "job_site": "ziprecruiter", "num_jobs": num_jobs, "date": current_date, "title": job_title, "location": job_location,
def format_output(raw_output):
    """Reformat the scraped ratings dict for insertion into Mongo.

    The input maps album titles to lists of their ratings on critics
    lists; Mongo wants a list of documents in the format:
    {'Album Title': album_title, 'Critics Scores': critics_scores_lst}

    Args:
    ----
        raw_output: dict

    Return: list of dictionaries
    """
    # Bug fix: `iteritems()` is Python-2-only; `items()` works on both
    # Python 2 and 3.
    output_lst = [{"Album Title": k, "Critics Scores": v}
                  for k, v in raw_output.items()]
    return output_lst

if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'
    soup = get_html(lists_url)
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')
    raw_output = grab_critics_info(critics_names, critics_hrefs)
    formatted_output = format_output(raw_output)
    store_in_mongo(formatted_output, 'music', 'music_lists',
                   key="Album Title")
while attribute.find('Other') == -1: values[attribute] = value points_misc_idx += 1 # The value is always the last item present, surrounded by (), and the # 1+ items before that are the attributes to which those points belong. split_text = sum_points_misc_lst[points_misc_idx].split() attribute = ' '.join(split_text[:-1]) value = split_text[-1].replace('(', '').replace(')', '') values[attribute] = value points_misc_idx += 1 return values, points_misc_idx if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', '.summaryPointsMisc'] desired_contents = select_soup(soup, css_selectors) desired_contents_text = grab_contents_key(desired_contents, "text") desired_contents_renamed = rename_keys(desired_contents_text) final_lst = parse_contents(desired_contents_renamed) store_in_mongo(final_lst, 'music', 'music_lists')