def get_critic_lst_content(critics_hrefs_values, idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values, grab
    all of the items with the class '.listLargeTitle'. This will then be used
    to cycle through each one of them and grab information from them.

    Args:
    ----
        critics_hrefs_values: list of strings
            Holds the href attribute for each critic list, and is used to
            issue a get request against that href.
        idx: int
            Holds the index of the current critics list that is being
            looked at.

    Return:
    ------
        critic_lst_content_vals: list
            First value of the mapping returned by `select_soup`, reversed
            in place.
        soup: bs4.BeautifulSoup object
            Whatever `get_html` returned for the critic list URL.
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    # Build the URL once and reuse it for the request.
    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    soup = get_html(critic_url)

    # On Python 3 `dict.values()` is a view that cannot be indexed, so it is
    # materialized into a list before the first entry is taken.
    critic_content_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = critic_content_lst[0]

    # We reverse them because they are posted from the highest ranked
    # (worst album) to the lowest rank (i.e. Post-1 is the highest ranked
    # album on the critic list).
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Collect the user and critic score of every album page.

    Each href found in the first value of `album_title_hrefs` is appended to
    the site root and fetched with `get_html`. The '#centerContent' element
    of the fetched page is handed to `find_score` twice, once per score
    label, and both results are converted to int. One dictionary per href is
    produced, pairing the scores with the album title at the same position.

    Args:
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value is iterated as the
            sequence of href strings.
        album_titles: list of strings
            Titles indexed in parallel with the hrefs.

    Return:
    ------
        records: list of dictionaries
            One entry per href with the keys 'Album Title', 'User Score'
            and 'Critic Score'.
    """
    site_root = 'http://www.albumoftheyear.org'
    hrefs = list(album_title_hrefs.values())[0]

    records = []
    for position, href in enumerate(hrefs):
        page = get_html(site_root + href)
        selected = select_soup(page, '#centerContent')
        # First element of the first selected group.
        center = list(selected.values())[0][0]

        user = int(find_score(center, 'USER SCORE'))
        critic = int(find_score(center, 'CRITIC SCORE'))

        records.append({
            'Album Title': album_titles[position],
            "User Score": user,
            "Critic Score": critic,
        })

    return records
def get_critic_lst_content(critics_hrefs_values, critic_lst_idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values, grab
    all of the items with the class '.listLargeTitle'. This will then be used
    to cycle through each one of them and grab information from them.

    Args:
    ----
        critics_hrefs_values: list of strings
            Hrefs of the critic lists; the selected one is appended to the
            site's base URL.
        critic_lst_idx: int
            Index into `critics_hrefs_values` of the list to fetch.

    Return:
    ------
        critic_lst_content_vals: list
            First value of the mapping returned by `select_soup`, reversed
            in place.
        soup: bs4.BeautifulSoup object
            Whatever `get_html` returned for the critic list URL.
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    # Build the URL once and reuse it for the request instead of
    # concatenating the same two strings a second time.
    critic_url = base_individual_list_url + critics_hrefs_values[critic_lst_idx]
    soup = get_html(critic_url)

    critic_content_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = critic_content_lst[0]

    # We reverse them because they are posted from the highest ranked (worst
    # album) to the lowest rank (i.e. Post-1 is the highest ranked album on
    # the critic list).
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Grab the overall critic and user scores for each inputted href.

    For each href in the first value of `album_title_hrefs`, fetch the page
    at that href, pull the user and critic scores out of its
    '#centerContent' element, and store them next to the album title found
    at the same index of `album_titles`.

    Args:
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value is iterated as the
            sequence of href strings.
        album_titles: list of strings
            Titles indexed in parallel with the hrefs.

    Return:
    ------
        final_json_lst: list of dictionaries
            One entry per href with the keys 'Album Title', 'User Score'
            and 'Critic Score'.
    """
    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []

    # On Python 3 `dict.values()` is a view that cannot be indexed, so it is
    # materialized into a list before the first entry is taken.
    album_title_hrefs_lst = list(album_title_hrefs.values())
    for idx, href in enumerate(album_title_hrefs_lst[0]):
        soup = get_html(base_url + href)

        center_content_lst = list(select_soup(soup, '#centerContent').values())
        center_content = center_content_lst[0][0]

        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))

        json_dct = {'Album Title': album_titles[idx],
                    "User Score": user_score,
                    "Critic Score": critic_score}
        final_json_lst.append(json_dct)

    return final_json_lst
def get_critic_lst_content(critics_hrefs_values, idx):
    """Grab the CSS element that holds all relevant info. for a critic list.

    For the critic href at the inputted idx in the critics_hrefs_values, grab
    all of the items with the class '.listLargeTitle'. This will then be used
    to cycle through each one of them and grab information from them.

    Args:
    ----
        critics_hrefs_values: list of strings
            Hrefs of the critic lists; the selected one is appended to the
            site's base URL.
        idx: int
            Index into `critics_hrefs_values` of the list to fetch.

    Return:
    ------
        critic_lst_content_vals: list
            First value of the mapping returned by `select_soup`, reversed
            in place.
        soup: BeautifulSoup object
            Whatever `get_html` returned for the critic list URL.
    """
    base_individual_list_url = 'http://www.albumoftheyear.org'
    css_selectors = ['.listLargeTitle']

    # Build the URL once and reuse it for the request.
    critic_url = base_individual_list_url + critics_hrefs_values[idx]
    soup = get_html(critic_url)

    # On Python 3 `dict.values()` is a view that cannot be indexed, so it is
    # materialized into a list before the first entry is taken.
    critic_content_lst = list(select_soup(soup, css_selectors).values())
    critic_lst_content_vals = critic_content_lst[0]

    # We reverse them because they are posted from the highest ranked
    # (worst album) to the lowest rank (i.e. Post-1 is the highest ranked
    # album on the critic list).
    critic_lst_content_vals.reverse()

    return critic_lst_content_vals, soup
def process_album_title_hrefs(album_title_hrefs, album_titles):
    """Grab the critic and user scores for each inputted href.

    Loop over the hrefs in `album_title_hrefs`, issue a get request on the
    URL associated with that href, and then parse the content to grab the
    User and Critic scores for that album. Store the User and Critic scores
    in a dictionary along with the Album title, and then append that to a
    list to output for easy storage.

    Args:
    ----
        album_title_hrefs: mapping
            Only its first value is used; that value holds the hrefs of
            each album title to issue a get request on.
        album_titles: list of strings
            Holds the album titles to store with the User and Critic scores
            that we're grabbing. This will allow identification of a
            User/Critic score with a particular album.

    Return:
    ------
        final_json_lst: list of dictionaries
            One entry per href with the keys 'Album Title', 'User Score'
            and 'Critic Score'.
    """
    base_url = 'http://www.albumoftheyear.org'
    final_json_lst = []

    # On Python 3 `dict.values()` is a view that cannot be indexed, so it is
    # materialized into a list before the first entry is taken.
    album_title_hrefs_lst = list(album_title_hrefs.values())
    for idx, href in enumerate(album_title_hrefs_lst[0]):
        soup = get_html(base_url + href)

        center_content_lst = list(select_soup(soup, '#centerContent').values())
        center_content = center_content_lst[0][0]

        user_score = int(find_score(center_content, 'USER SCORE'))
        critic_score = int(find_score(center_content, 'CRITIC SCORE'))

        json_dct = {'Album Title': album_titles[idx],
                    "User Score": user_score,
                    "Critic Score": critic_score}
        final_json_lst.append(json_dct)

    return final_json_lst
while attribute.find('Other') == -1: values[attribute] = value points_misc_idx += 1 # The value is always the last item present, surrounded by (), and the # 1+ items before that are the attributes to which those points belong. split_text = sum_points_misc_lst[points_misc_idx].split() attribute = ' '.join(split_text[:-1]) value = split_text[-1].replace('(', '').replace(')', '') values[attribute] = value points_misc_idx += 1 return values, points_misc_idx if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.artistTitle', '.albumTitle', '.summaryPoints', '.summaryPointsMisc'] desired_contents = select_soup(soup, css_selectors) desired_contents_text = grab_contents_key(desired_contents, "text") desired_contents_renamed = rename_keys(desired_contents_text) final_lst = parse_contents(desired_contents_renamed) store_in_mongo(final_lst, 'music', 'music_lists')
content_txt = content.text score_idx = content_txt.find(score_str) score_str_len = len(score_str) beg_idx = score_idx + score_str_len end_idx = beg_idx + 2 score = content_txt[beg_idx:end_idx] return score if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab data music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.albumTitle'] album_titles_contents = select_soup(soup, css_selectors) album_titles_lst = list( grab_contents_key(album_titles_contents, 'text').values()) album_titles = album_titles_lst[0] album_title_links = grab_contents_key(album_titles_contents, 'a') album_title_hrefs = grab_contents_key(album_title_links, 'href') final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles) store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
content_txt = content.text score_idx = content_txt.find(score_str) score_str_len = len(score_str) beg_idx = score_idx + score_str_len end_idx = beg_idx + 2 score = content_txt[beg_idx:end_idx] return score if __name__ == '__main__': try: year = sys.argv[1] except Exception as e: print(e) raise Exception('<Usage> Input a year to grab data music data for.') URL = 'http://www.albumoftheyear.org/list/summary/' + year + '/' soup = get_html(URL) css_selectors = ['.albumTitle'] album_titles_contents = select_soup(soup, css_selectors) album_titles_lst = list(grab_contents_key(album_titles_contents, 'text').values()) album_titles = album_titles_lst[0] album_title_links = grab_contents_key(album_titles_contents, 'a') album_title_hrefs = grab_contents_key(album_title_links, 'href') final_json_lst = process_album_title_hrefs(album_title_hrefs, album_titles) store_in_mongo(final_json_lst, 'music', 'music_lists', key="Album Title")
rating_txt: str
    Text that potentially holds the rating.
idx: int
    Holds the rating if the text does not.

Return: int
"""
# NOTE(review): the text above closes the docstring of a function whose
# header is outside the visible source; the statements down to the `return`
# are that function's body.
# NOTE(review): the docstring calls `rating_txt` a str, but indexing it with
# [0] would then convert only its first character -- confirm whether a list
# of strings is actually passed.
if len(rating_txt) >= 1:
    # Drop any '.' from the first element before converting it to int.
    rating = int(rating_txt[0].replace('.', ''))
else:
    # Nothing to parse, so fall back to the supplied index.
    rating = idx
return rating


if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'
    soup = get_html(lists_url)

    # From every '.criticListBlockTitle' element take the text, and follow
    # its 'a' entry to the 'href'.
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    # One document per key of raw_output, with its value stored under
    # "Critics Scores".
    formatted_output = [{"Album Title": k, "Critics Scores": v} for \
                        k, v in raw_output.items()]
    store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")
----
    rating_txt: str
        Text that potentially holds the rating.
    idx: int
        Holds the rating if the text does not.

Return: int
"""
# NOTE(review): the text above closes the docstring of a function whose
# header is outside the visible source; the statements down to the `return`
# are that function's body.
# NOTE(review): the docstring calls `rating_txt` a str, but indexing it with
# [0] would then convert only its first character -- confirm whether a list
# of strings is actually passed.
if len(rating_txt) >= 1:
    # Drop any '.' from the first element before converting it to int.
    rating = int(rating_txt[0].replace('.', ''))
else:
    # Nothing to parse, so fall back to the supplied index.
    rating = idx
return rating


if __name__ == '__main__':
    lists_url = 'http://www.albumoftheyear.org/lists.php'
    soup = get_html(lists_url)

    # From every '.criticListBlockTitle' element take the text, and follow
    # its 'a' entry to the 'href'.
    critics_content = select_soup(soup, '.criticListBlockTitle')
    critics_names = grab_contents_key(critics_content, "text")
    critics_links = grab_contents_key(critics_content, 'a')
    critics_hrefs = grab_contents_key(critics_links, 'href')

    raw_output = grab_critics_info(critics_names, critics_hrefs)
    # One document per key of raw_output, with its value stored under
    # "Critics Scores".
    formatted_output = [{"Album Title": k, "Critics Scores": v} for \
                        k, v in raw_output.items()]
    store_in_mongo(formatted_output, 'music', 'music_lists', key="Album Title")