def scrape() -> None: """Scrapes links from issue-date pages from www.billboard.com. Returns None.""" print("--- ISSUE DATE SCRAPING, STARTED --- ") todo, finished = scrape_setup(SUB_CATEGORY_FIN, ISSUE_FIN) fin_len = len(finished) todo_len = len(todo) print("finished:", fin_len) print("todo :", todo_len) errors = load_file_list(ISSUE_ERR) for link in todo: try: soup = get_soup(link, filter_=A_TAGS) #rearrange url suffix charts = Path(link).parts[-3] subcat = Path(link).parts[-1] pattern = format_search_string(charts, subcat) issuedates = get_links(soup, pattern) hrefs = get_hrefs(issuedates) links = sorted(list(map(lambda x: HOME_PAGE + x, hrefs))) save_append(links, ISSUE_FIN) print("Saved :: ", link) except AttributeError: errors.append(link) except KeyboardInterrupt: print("Stopped manually.") save(errors, list(set(ISSUE_ERR))) print("--- ISSUE DATE SCRAPING, FINISHED --- ") return None
def scrape() -> None: """Scrape the subcategories from www.billboard.com. Returns None.""" print("--- SCRAPE SUBCATEGORIES, STARTED ---") todo, finished = scrape_setup(YEAR_FIN, SUB_CATEGORY_FIN) fin_len = len(finished) todo_len = len(todo) print("finished:", fin_len) print("todo :", todo_len) errors = load_file_list(SUB_CATEGORY_ERR) allurls = [] for link in todo: try: soup = get_soup(link, filter_=A_TAGS) year = get_year(link) pattern = format_regex_string(year) a_tags = get_links(soup, pattern) hrefs = get_hrefs(a_tags) allurls += list(map(lambda x: HOME_PAGE + x, hrefs)) print("Done:", link) except: print("Error:", link) errors.append(link) save_append_line(link, SUB_CATEGORY_ERR) save(allurls, SUB_CATEGORY_FIN) save(list(set(errors)), SUB_CATEGORY_ERR) print("--- SCRAPE SUBCATEGORIES, FINISHED ---") return None
def scrape() -> None: """Scrapes rankings from www.billboard.com. Returns None.""" print("--- RANK SCRAPING, STARTED --- ") todo, finished = scrape_setup(ISSUE_FIN, RANK_FIN) fin_len = len(finished) todo_len = len(todo) print("finished:", fin_len) print("todo :", todo_len) errors = load_file_list(RANK_ERR) rank_count = 0 for link in todo: try: soup = get_soup(link, filter_=DIVS) div_soup = soup.find_all("div") for element in div_soup: if has_data_title(element): name = format_file_name(link) save_ranking(name, element) save_append_line(link, RANK_FIN) rank_count += 1 except AttributeError: errors.append(link) except IndexError: errors.append(link) except KeyboardInterrupt: print("Stopped manually.") save(list(set(errors)), RANK_ERR) quit() save(list(set(errors)), RANK_ERR) print("rank_count:", str(rank_count)) print("--- RANK SCRAPING, FINISHED --- ") return None
def scrape() -> None: """Scrapes the year-links from www.billboard.com. Returns None.""" print("--- SCRAPING YEARS, STARTED ---") soup = get_soup(YEAR_PAGE, filter_=A_TAGS) links = get_links(soup, "\/archive\/charts\/[0-9]*") links = get_hrefs(links) links = list(map(lambda x: HOME_PAGE + x, links)) save(links, YEAR_FIN) print("--- SCRAPING YEARS, FINISHED ---") return None
def scrape() -> None: print("--- CATEGORY SCRAPING STARTED ---") print("Scraping from:", HOME_PAGE) soup = get_soup(HOME_PAGE) category_links = get_links(soup, "^/artists/") a_tags = set(category_links) hrefs = get_hrefs(a_tags) suffixed = list(map(lambda x: x+"/99999", hrefs)) prefixed = list(map(lambda x: HOME_PAGE+x, suffixed)) save(prefixed, CATEGORY_FIN) print("--- CATEGORY SCRAPING FINISHED ---")
def scrape() -> None: """A single scraping attempt of 'link'. Returns None.""" print("--- LYRIC SCRAPING; STARTED ---") print("Loading unfinished work...") todo, finished = scrape_setup(LYRIC_TODO, LYRIC_FIN) fin_len = len(finished) todo_len = len(todo) completed = 0 for link in todo: try: soup = get_soup(link) song = get_song(soup) artist = get_artist(soup) file_name = format_file_name(artist, song) #final clean up of lyrics lyrics = get_lyrics(soup) lyrics = list(lyrics.split("\n")) lyrics = list(map(lambda x: x.strip(), lyrics)) save_append_line(link, LYRIC_FIN) letter = artist[0] save_path = LYRIC_DIR + letter + "lyrics/" ensure_exists(save_path) if letter in UPPERS: save_lyrics(lyrics, save_path + file_name) else: symbol_dir = LYRIC_DIR + "symbollyrics/" save_lyrics(lyrics, symbol_dir + file_name) except AttributeError: save_append_line(link, LYRIC_ERRORS) except IndexError: save_append_line(link, LYRIC_ERRORS) except KeyboardInterrupt: print("\nStopped manually.") quit() except SSLEOFError: print("\nSSL EOF Error. Quitting...") quit() except MaxRetryError: print("\nMax Retry Error. Quitting...") quit() except requests.exceptions.SSLError: print("\nRequest SSL Error. Quitting...") quit() completed += 1 print('\r%s %s' % (completed, "lyrics"), end='\r') return None
def scrape() -> None: """Main scraping function. Returns None.""" print("--- SONG SCRAPING, START ---") todo, finished = scrape_setup_song(ARTIST_DIR, SONG_FIN) print("Finished:", len(finished)) print("To do :", len(todo)) errors = load_file_list(SONG_ERRORS) for thing in sorted(todo): try: soup = get_soup(thing) a_tags = get_links(soup, "^/lyric/") hrefs = get_hrefs(a_tags) links = list(map(lambda x: unquote(HOME_PAGE + x), hrefs)) save_append(links, LYRIC_TODO) save_append_line(thing, SONG_FIN) except: errors.append(thing) save(list(set(errors)), SONG_ERRORS) print("--- SONG SCRAPING, FINISHED ---")
def scrape() -> None: """Main scraping function. Returns None.""" print("--- ARTIST SCRAPING STARTED ---") errors = [] todo, finished = scrape_setup(CATEGORY_FIN, ARTIST_FIN) total = len(todo) completed = 0 for cat in todo: try: soup = get_soup(cat) art_hrefs = get_links(soup, "^artist") art_links = list(map(format_artist_link, art_hrefs)) category = Path(cat).parts[3] text_file = (ARTIST_DIR + category + "_" + "artistlinks.txt") save(art_links, text_file) finished.append(cat) except: errors.append(cat) completed += 1 progress_bar(completed, total) save(errors, ARTIST_ERRORS) save(finished, ARTIST_FIN) print("--- ARTIST SCRAPING FINISHED ---")
def test_year_scraping_href_elements_regex_valid():
    """Ensure that the href structure for scraping the years has not changed."""
    soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    links = get_links(soup, r"/archive/charts/[0-9]*")
    assert len(links) > 0
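
# The test above is a live smoke test: it hits YEAR_PAGE over the network and
# fails if the archive hrefs stop matching the regex the year scraper relies
# on. Assuming a standard pytest layout, it runs with, e.g.:
#     pytest -q tests/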