Python select_soup示例，special_utilities.select_soup Python示例

示例#1

0

显示文件

文件： end_year_critic_lists.py 项目： rf24/web-scrapers

def get_critic_lst_content(critics_hrefs_values, idx):
    """Fetch one critic's list page and return its '.listLargeTitle' elements.

    The href at position `idx` of `critics_hrefs_values` is appended to the
    site's base URL, the page is fetched with `get_html`, and every element
    matching the '.listLargeTitle' selector is collected via `select_soup`.
    The collected elements are reversed in place before being returned so
    callers can cycle through them and pull information from each one.

    Args:
    ----
        critics_hrefs_values: list of strings
            Holds the href attribute for each critic list; the entry at
            `idx` is appended to the base URL to form the page to request.
        idx: int
            Index into `critics_hrefs_values` of the critic list that is
            being looked at.

    Return:
    ------
        critic_lst_content_vals: list
            The selected elements, in reversed page order.
        soup: bs4.BeautifulSoup object
            The parsed page, as returned by `get_html`.
    """

    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    soup = get_html(critic_url)

    # `.values()` is a non-subscriptable view on Python 3, so materialize it
    # before indexing. Only one selector is passed, so only the first value
    # is used (assumes select_soup returns one entry per selector -- TODO
    # confirm against its definition).
    selected = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = selected[0]
    # Per the original author's note, the page posts entries from the
    # worst-placed album down to the number-one album, so flip the list in
    # place.
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup

示例#2

0

显示文件

def process_album_title_hrefs(album_title_hrefs, album_titles): 
    """Collect the user and critic score for every album href.

    Each href is appended to the site's base URL and fetched with
    `get_html`. The first element selected by '#centerContent' on that page
    is handed to `find_score` twice, once per score label, and each result
    is converted to an int. One dictionary per href is produced, pairing
    the scores with the album title found at the same position in
    `album_titles`.

    Args:
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value is iterated as the
            sequence of href strings to request.
        album_titles: list of strings
            Album titles, indexed in step with the hrefs.

    Return:
    ------
        final_json_lst: list
            One dict per href with the keys 'Album Title', 'User Score'
            and 'Critic Score'.
    """

    site_root = 'http://www.albumoftheyear.org'
    hrefs = list(album_title_hrefs.values())[0]

    records = []
    for position, href in enumerate(hrefs):
        page = get_html(site_root + href)

        # Only the first element of the first selector's matches is used.
        matches = list(select_soup(page, '#centerContent').values())[0]
        score_block = matches[0]
        user = int(find_score(score_block, 'USER SCORE'))
        critic = int(find_score(score_block, 'CRITIC SCORE'))

        records.append({'Album Title': album_titles[position],
                        "User Score": user,
                        "Critic Score": critic})

    return records

示例#3

0

显示文件

文件： end_year_critic_lists.py 项目： sallamander/web-scrapers

def get_critic_lst_content(critics_hrefs_values, critic_lst_idx):
    """Return the '.listLargeTitle' elements of one critic's list page.

    The href stored at `critic_lst_idx` is appended to the site's base URL
    and requested through `get_html`. Everything on the page matching
    '.listLargeTitle' is gathered with `select_soup`, reversed in place,
    and handed back together with the parsed page.

    Args:
    ----
        critics_hrefs_values: list of strings
            Hrefs of the critic lists.
        critic_lst_idx: int
            Position in `critics_hrefs_values` of the list to fetch.

    Return:
    ------
        critic_lst_content_vals: list
            Selected elements, in reversed page order.
        soup: bs4.BeautifulSoup object
            The parsed page returned by `get_html`.
    """

    site_root = 'http://www.albumoftheyear.org'
    page_url = site_root + critics_hrefs_values[critic_lst_idx]
    soup = get_html(page_url)

    # A single selector is requested, so only the first value is used.
    selected_by_selector = select_soup(soup, ['.listLargeTitle'])
    critic_lst_content_vals = list(selected_by_selector.values())[0]

    # Per the original author's note, entries are posted from the
    # worst-placed album to the number-one album; flip them in place.
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup

示例#4

0

显示文件

文件： end_year_critic_lists.py 项目： xkortex/web-scrapers

def get_critic_lst_content(critics_hrefs_values, critic_lst_idx):
    """Grab every '.listLargeTitle' element from a single critic list page.

    Looks up the href at `critic_lst_idx`, joins it onto the site's base
    URL, and fetches that page with `get_html`. The elements picked out by
    `select_soup` for the '.listLargeTitle' selector are reversed in place
    and returned alongside the parsed page.

    Args:
    ----
        critics_hrefs_values: list of strings
            One href per critic list.
        critic_lst_idx: int
            Which entry of `critics_hrefs_values` to request.

    Return:
    ------
        critic_lst_content_vals: list
            The matched elements after reversal.
        soup: bs4.BeautifulSoup object
            The page as returned by `get_html`.
    """

    selectors = ['.listLargeTitle']
    href = critics_hrefs_values[critic_lst_idx]
    soup = get_html('http://www.albumoftheyear.org' + href)

    # Materialize the mapping's values and keep the first (only one
    # selector was passed in).
    elements = list(select_soup(soup, selectors).values())[0]
    # The original author notes the page runs from the worst-placed album
    # to the number-one album, hence the in-place flip.
    elements.reverse()

    return elements, soup

示例#5

0

显示文件

def process_album_title_hrefs(album_title_hrefs, album_titles): 
    """Grab the overall critic and user scores for each inputted href.

    For each href, request the page at the base URL plus that href, take
    the first element selected by '#centerContent', and read the user and
    critic scores out of it with `find_score`. Each pair of scores is
    stored in a dictionary together with the album title found at the same
    index in `album_titles`.

    Args:
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value is iterated as the
            sequence of href strings.
        album_titles: list of strings
            Album titles, indexed in step with the hrefs.

    Return:
    ------
        final_json_lst: list
            One dict per href with the keys 'Album Title', 'User Score'
            and 'Critic Score'.
    """
    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []
    # `.values()` is a non-subscriptable view on Python 3, so it has to be
    # turned into a list before the first entry can be indexed.
    hrefs = list(album_title_hrefs.values())[0]
    for idx, href in enumerate(hrefs):
        soup = get_html(base_url + href)
        center_content_lst = list(select_soup(soup, '#centerContent').values())
        center_content = center_content_lst[0][0]
        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))
        json_dct = {'Album Title': album_titles[idx], "User Score": user_score, 
                    "Critic Score": critic_score}

        final_json_lst.append(json_dct)

    return final_json_lst

示例#6

0

显示文件

文件： end_year_critic_lists.py 项目： rdt88/web-scrapers

def get_critic_lst_content(critics_hrefs_values, idx):
    """Grab all '.listLargeTitle' items from the critic list page at `idx`.

    For the critic href at the inputted idx in critics_hrefs_values, fetch
    the page (base URL plus href) with `get_html` and select all of the
    items with the class '.listLargeTitle' through `select_soup`. The items
    are reversed in place and returned so callers can cycle through each
    one and grab information from it.

    Args:
    ----
        critics_hrefs_values: list of strings
            Hrefs of the critic lists.
        idx: int
            Index of the href to request.

    Return:
    ------
        critic_lst_content_vals: list
            The selected items, in reversed page order.
        soup: BeautifulSoup object
            The parsed page returned by `get_html`.
    """

    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    soup = get_html(critic_url)

    # On Python 3 `.values()` returns a view that cannot be subscripted;
    # convert it to a list first. Only one selector is passed, so only the
    # first value is needed.
    critic_content_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = critic_content_lst[0]
    # Per the original author's note, the page posts entries from the
    # worst-placed album to the number-one album, so flip them in place.
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup

示例#7

0

显示文件

文件： end_year_critic_lists.py 项目： cdingding/web-scrapers

def get_critic_lst_content(critics_hrefs_values, idx):
    """Return the '.listLargeTitle' items of one critic list plus its page.

    The href found at `idx` in `critics_hrefs_values` is appended to the
    site's base URL and fetched with `get_html`. All items with the class
    '.listLargeTitle' are selected with `select_soup`, reversed in place,
    and returned together with the parsed page.

    Args:
    ----
        critics_hrefs_values: list of strings
            Hrefs of the critic lists.
        idx: int
            Index of the href to request.

    Return:
    ------
        critic_lst_content_vals: list
            The selected items, in reversed page order.
        soup: BeautifulSoup object
            The parsed page returned by `get_html`.
    """

    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    # Build the URL once and reuse it for the request.
    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    soup = get_html(critic_url)

    # Indexing `.values()` directly fails on Python 3 because it is a view,
    # not a list; materialize it before taking the first entry.
    values_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = values_lst[0]
    # Per the original author's note, the page posts entries from the
    # worst-placed album to the number-one album, so flip them in place.
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup

示例#8

0

显示文件

文件： albums_of_year_lst_ind.py 项目： rf24/web-scrapers

def process_album_title_hrefs(album_title_hrefs, album_titles): 
    """Grab the critic and user scores for each inputted href. 

    Loop over the hrefs in `album_title_hrefs`, issue a get request
    on the URL associated with that href, and then parse the content 
    to grab the User and Critic scores for that album. Store the 
    User and Critic scores in a dictionary along with the Album title, 
    and then append that to a list to output for easy storage. 

    Args: 
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value is iterated as
            the hrefs of each album title to issue a get request on.
        album_titles: list of strings
            Holds the album titles to store with the User and 
            Critic scores that we're grabbing. This will allow 
            identification of a User/Critic score with a particular
            album. 

    Return: list of dictionaries 
        One per href, with the keys 'Album Title', 'User Score' and
        'Critic Score'.
    """

    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []
    # `.values()` is a view on Python 3 and cannot be subscripted, so it is
    # converted to a list before the first entry is taken.
    album_title_hrefs_lst = list(album_title_hrefs.values())
    for idx, href in enumerate(album_title_hrefs_lst[0]):
        soup = get_html(base_url + href)

        # Same view-vs-list issue as above; only the first element of the
        # first selector's matches is used.
        center_content_lst = list(select_soup(soup, '#centerContent').values())
        center_content = center_content_lst[0][0]
        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))

        json_dct = {'Album Title': album_titles[idx], "User Score": user_score, 
                    "Critic Score": critic_score}
        final_json_lst.append(json_dct)

    return final_json_lst

示例#9

0

显示文件

文件： albums_of_year_lst_full.py 项目： xkortex/web-scrapers

    while attribute.find('Other') == -1:
        values[attribute] = value
        points_misc_idx += 1
        # The value is always the last item present, surrounded by (), and the 
        # 1+ items before that are the attributes to which those points belong. 
        split_text = sum_points_misc_lst[points_misc_idx].split()
        attribute = ' '.join(split_text[:-1])
        value = split_text[-1].replace('(', '').replace(')', '')
    values[attribute] = value
    points_misc_idx += 1

    return values, points_misc_idx 

if __name__ == '__main__':
    # The year to scrape is taken from the first command-line argument.
    try: 
        year = sys.argv[1]
    except IndexError as e: 
        # Only a missing argument is a usage error; any other failure should
        # surface unchanged instead of being relabelled as one.
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    # Select the four summary fields, reduce them to their "text" entries,
    # rename the keys, and parse the result into the final list to store.
    css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', 
                     '.summaryPointsMisc']
    desired_contents = select_soup(soup, css_selectors)
    desired_contents_text = grab_contents_key(desired_contents, "text")
    desired_contents_renamed = rename_keys(desired_contents_text)
    final_lst = parse_contents(desired_contents_renamed)
    store_in_mongo(final_lst, 'music', 'music_lists')

示例#10

0

显示文件

文件： albums_of_year_lst_ind.py 项目： xkortex/web-scrapers

    content_txt = content.text
    score_idx = content_txt.find(score_str)
    score_str_len = len(score_str)
    beg_idx = score_idx + score_str_len
    end_idx = beg_idx + 2
    score = content_txt[beg_idx:end_idx]

    return score


if __name__ == '__main__':
    # The year to scrape is taken from the first command-line argument.
    try:
        year = sys.argv[1]
    except IndexError as e:
        # Only a missing argument is a usage error; any other failure should
        # surface unchanged instead of being relabelled as one.
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL)

    # Pull the '.albumTitle' elements, then derive the title texts and the
    # hrefs of their links from the same selection.
    css_selectors = ['.albumTitle']
    album_titles_contents = select_soup(soup, css_selectors)
    album_titles_lst = list(
        grab_contents_key(album_titles_contents, 'text').values())
    album_titles = album_titles_lst[0]
    album_title_links = grab_contents_key(album_titles_contents, 'a')
    album_title_hrefs = grab_contents_key(album_title_links, 'href')

    final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles)
    store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")

示例#11

0

显示文件

文件： albums_of_year_lst_full.py 项目： sallamander/web-scrapers

    while attribute.find('Other') == -1:
        values[attribute] = value
        points_misc_idx += 1
        # The value is always the last item present, surrounded by (), and the 
        # 1+ items before that are the attributes to which those points belong. 
        split_text = sum_points_misc_lst[points_misc_idx].split()
        attribute = ' '.join(split_text[:-1])
        value = split_text[-1].replace('(', '').replace(')', '')
    values[attribute] = value
    points_misc_idx += 1

    return values, points_misc_idx 

if __name__ == '__main__':
    # A year must be supplied as the first command-line argument.
    try: 
        year = sys.argv[1]
    except IndexError as e: 
        # A missing argument is the only failure that means "wrong usage";
        # anything else is left to propagate as-is.
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    # Select the four summary fields, keep their "text" entries, rename the
    # keys, parse the contents, and store the resulting list.
    css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', 
                     '.summaryPointsMisc']
    desired_contents = select_soup(soup, css_selectors)
    desired_contents_text = grab_contents_key(desired_contents, "text")
    desired_contents_renamed = rename_keys(desired_contents_text)
    final_lst = parse_contents(desired_contents_renamed)
    store_in_mongo(final_lst, 'music', 'music_lists')

示例#12

0

显示文件

文件： albums_of_year_lst_ind.py 项目： sallamander/web-scrapers

    content_txt = content.text
    score_idx = content_txt.find(score_str) 
    score_str_len = len(score_str)
    beg_idx = score_idx + score_str_len
    end_idx = beg_idx + 2    
    score = content_txt[beg_idx:end_idx]

    return score

if __name__ == '__main__': 
    # A year must be supplied as the first command-line argument.
    try: 
        year = sys.argv[1]
    except IndexError as e: 
        # A missing argument is the only failure that means "wrong usage";
        # anything else is left to propagate as-is.
        print(e)
        raise Exception('<Usage> Input a year to grab music data for.')

    URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/'
    soup = get_html(URL) 

    # Pull the '.albumTitle' elements, then derive the title texts and the
    # hrefs of their links from the same selection.
    css_selectors = ['.albumTitle']
    album_titles_contents = select_soup(soup, css_selectors)
    album_titles_lst = list(grab_contents_key(album_titles_contents, 'text').values())
    album_titles = album_titles_lst[0]
    album_title_links = grab_contents_key(album_titles_contents, 'a')
    album_title_hrefs = grab_contents_key(album_title_links, 'href')

    final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles)
    store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")

示例#13

0

显示文件

文件： end_year_critic_lists.py 项目： sallamander/web-scrapers

        rating_txt: str
            Text that potentially holds the rating. 
        idx: int
            Holds the rating if the text does not. 

    Return: int
    """

    if len(rating_txt) >= 1: 
        rating = int(rating_txt[0].replace('.', ''))
    else: 
        rating = idx

    return rating

if __name__ == '__main__':
    # Fetch the page that indexes the critic lists.
    lists_url = 'http://www.albumoftheyear.org/lists.php'
    soup = get_html(lists_url)

    # From the '.criticListBlockTitle' selection, derive the "text" entries
    # and the hrefs of the contained links.
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)

    # Turn each key/value pair of the raw output into one record to store.
    formatted_output = []
    for album_title, critics_scores in raw_output.items():
        formatted_output.append({"Album Title": album_title,
                                 "Critics Scores": critics_scores})

    store_in_mongo(formatted_output, 'music', 'music_lists',
                   key="Album Title")

示例#14

0

显示文件

文件： end_year_critic_lists.py 项目： xkortex/web-scrapers

    ----
        rating_txt: str
            Text that potentially holds the rating. 
        idx: int
            Holds the rating if the text does not. 

    Return: int
    """

    if len(rating_txt) >= 1:
        rating = int(rating_txt[0].replace('.', ''))
    else:
        rating = idx

    return rating


if __name__ == '__main__':
    # Start from the page that indexes the critic lists.
    lists_url = 'http://www.albumoftheyear.org/lists.php'
    soup = get_html(lists_url)

    # The same '.criticListBlockTitle' selection supplies both the "text"
    # entries and, through its links, the hrefs.
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    # One record per key/value pair of the raw output.
    formatted_output = [
        {"Album Title": title, "Critics Scores": scores}
        for title, scores in raw_output.items()
    ]
    store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")