def scrape() -> None:
    """Scrape issue-date links from chart pages on www.billboard.com.

    For every unfinished subcategory link, collects the issue-date hrefs
    matching that chart/subcategory pair, prefixes them with HOME_PAGE and
    appends them to ISSUE_FIN. Links that fail to parse are recorded in
    ISSUE_ERR. Returns None.
    """
    print("--- ISSUE DATE SCRAPING, STARTED --- ")
    todo, finished = scrape_setup(SUB_CATEGORY_FIN, ISSUE_FIN)
    print("finished:", len(finished))
    print("todo    :", len(todo))

    errors = load_file_list(ISSUE_ERR)
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)

            # Rearrange the url suffix: .../<charts>/<...>/<subcat>
            charts = Path(link).parts[-3]
            subcat = Path(link).parts[-1]

            pattern = format_search_string(charts, subcat)
            issuedates = get_links(soup, pattern)
            hrefs = get_hrefs(issuedates)
            links = sorted(HOME_PAGE + href for href in hrefs)
            save_append(links, ISSUE_FIN)
            print("Saved :: ", link)
        except AttributeError:
            errors.append(link)
        except KeyboardInterrupt:
            # Persist the errors collected so far and stop, matching the
            # behaviour of the rank scraper (previously the loop continued,
            # making a manual stop impossible).
            print("Stopped manually.")
            save(list(set(errors)), ISSUE_ERR)
            quit()
    # BUG FIX: the arguments were swapped — ``set()`` was applied to the
    # ISSUE_ERR file-name string (yielding its unique characters) instead of
    # deduplicating the error list.
    save(list(set(errors)), ISSUE_ERR)
    print("--- ISSUE DATE SCRAPING, FINISHED --- ")
    return None
# Example #2
def scrape() -> None:
    """Scrape the subcategory links from www.billboard.com.

    For every unfinished year page, collects the subcategory hrefs for that
    year, prefixes them with HOME_PAGE and saves the full list to
    SUB_CATEGORY_FIN. Failing links are recorded in SUB_CATEGORY_ERR.
    Returns None.
    """
    print("--- SCRAPE SUBCATEGORIES, STARTED ---")
    todo, finished = scrape_setup(YEAR_FIN, SUB_CATEGORY_FIN)
    print("finished:", len(finished))
    print("todo    :", len(todo))

    errors = load_file_list(SUB_CATEGORY_ERR)
    allurls = []
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            year = get_year(link)
            pattern = format_regex_string(year)
            a_tags = get_links(soup, pattern)
            hrefs = get_hrefs(a_tags)
            allurls += [HOME_PAGE + href for href in hrefs]
            print("Done:", link)
        # BUG FIX: a bare ``except:`` also swallowed KeyboardInterrupt and
        # SystemExit, making the scraper impossible to stop cleanly.
        except Exception:
            print("Error:", link)
            errors.append(link)
            save_append_line(link, SUB_CATEGORY_ERR)
    save(allurls, SUB_CATEGORY_FIN)
    save(list(set(errors)), SUB_CATEGORY_ERR)
    print("--- SCRAPE SUBCATEGORIES, FINISHED ---")
    return None
def scrape() -> None:
    """Scrape chart rankings from www.billboard.com.

    For every unfinished issue-date link, saves each ranking div that
    carries a data-title, marks the link finished in RANK_FIN, and records
    failing links in RANK_ERR. Returns None.
    """
    print("--- RANK SCRAPING, STARTED --- ")
    todo, finished = scrape_setup(ISSUE_FIN, RANK_FIN)
    print("finished:", len(finished))
    print("todo    :", len(todo))

    errors = load_file_list(RANK_ERR)
    rank_count = 0
    for link in todo:
        try:
            soup = get_soup(link, filter_=DIVS)
            div_soup = soup.find_all("div")
            for element in div_soup:
                # Only divs carrying a data-title attribute hold rankings.
                if has_data_title(element):
                    name = format_file_name(link)
                    save_ranking(name, element)
            save_append_line(link, RANK_FIN)
            rank_count += 1
        except (AttributeError, IndexError):
            # Identical handling — combined from two duplicate clauses.
            errors.append(link)
        except KeyboardInterrupt:
            # Persist errors before exiting so no work is lost.
            print("Stopped manually.")
            save(list(set(errors)), RANK_ERR)
            quit()
    save(list(set(errors)), RANK_ERR)
    print("rank_count:", str(rank_count))
    print("--- RANK SCRAPING, FINISHED --- ")
    return None
def scrape() -> None:
    """Scrape the year-links from www.billboard.com archive and save them
    to YEAR_FIN. Returns None."""
    print("--- SCRAPING YEARS, STARTED ---")
    soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    # Raw string for the regex (same pattern value) — a plain string with
    # "\/" triggers an invalid-escape SyntaxWarning on modern Python; the
    # regression test for this pattern already uses the raw form.
    links = get_links(soup, r"\/archive\/charts\/[0-9]*")
    links = get_hrefs(links)
    links = [HOME_PAGE + href for href in links]
    save(links, YEAR_FIN)
    print("--- SCRAPING YEARS, FINISHED ---")
    return None
# Example #5
def scrape() -> None:
    """Scrape the artist-category links from the www.billboard.com home
    page, append the "/99999" page suffix, and save them to CATEGORY_FIN.
    Returns None."""
    print("--- CATEGORY SCRAPING STARTED ---")
    print("Scraping from:", HOME_PAGE)
    page_soup = get_soup(HOME_PAGE)
    # Deduplicate the anchor tags before extracting their hrefs.
    unique_tags = set(get_links(page_soup, "^/artists/"))
    category_hrefs = get_hrefs(unique_tags)
    # "/99999" requests the full (unpaginated) listing for a category.
    full_urls = [HOME_PAGE + href + "/99999" for href in category_hrefs]
    save(full_urls, CATEGORY_FIN)
    print("--- CATEGORY SCRAPING FINISHED ---")
# Example #6
def scrape() -> None:
    """Scrape lyrics for every unfinished link in LYRIC_TODO.

    Each lyric is cleaned up and saved under a per-letter directory inside
    LYRIC_DIR (non-letter artists go to "symbollyrics/"). Parse failures
    are appended to LYRIC_ERRORS; network/SSL failures abort the run.
    Returns None.
    """
    print("--- LYRIC SCRAPING; STARTED ---")
    print("Loading unfinished work...")
    todo, finished = scrape_setup(LYRIC_TODO, LYRIC_FIN)
    # These counts were computed but never shown; print them as every
    # other scraper in this project does.
    print("finished:", len(finished))
    print("todo    :", len(todo))

    completed = 0
    for link in todo:
        try:
            soup = get_soup(link)
            song = get_song(soup)
            artist = get_artist(soup)
            file_name = format_file_name(artist, song)

            # Final clean up of lyrics: one stripped line per list entry.
            lyrics = [line.strip() for line in get_lyrics(soup).split("\n")]
            save_append_line(link, LYRIC_FIN)

            letter = artist[0]
            save_path = LYRIC_DIR + letter + "lyrics/"
            ensure_exists(save_path)

            if letter in UPPERS:
                save_lyrics(lyrics, save_path + file_name)
            else:
                # Artists starting with a non-letter symbol share one dir.
                symbol_dir = LYRIC_DIR + "symbollyrics/"
                save_lyrics(lyrics, symbol_dir + file_name)
        except (AttributeError, IndexError):
            # Parse failure — record the link and keep going.
            save_append_line(link, LYRIC_ERRORS)
        except KeyboardInterrupt:
            print("\nStopped manually.")
            quit()
        except SSLEOFError:
            print("\nSSL EOF Error. Quitting...")
            quit()
        except MaxRetryError:
            print("\nMax Retry Error. Quitting...")
            quit()
        except requests.exceptions.SSLError:
            print("\nRequest SSL Error. Quitting...")
            quit()
        completed += 1
        print('\r%s %s' % (completed, "lyrics"), end='\r')
    return None
# Example #7
def scrape() -> None:
    """Scrape song (lyric) links from every unfinished artist page.

    Appends the discovered lyric urls to LYRIC_TODO, marks each artist
    page done in SONG_FIN, and records failing pages in SONG_ERRORS.
    Returns None.
    """
    print("--- SONG SCRAPING, START ---")
    todo, finished = scrape_setup_song(ARTIST_DIR, SONG_FIN)
    print("Finished:", len(finished))
    print("To do   :", len(todo))

    errors = load_file_list(SONG_ERRORS)
    for thing in sorted(todo):
        try:
            soup = get_soup(thing)
            a_tags = get_links(soup, "^/lyric/")
            hrefs = get_hrefs(a_tags)
            # unquote: lyric hrefs are percent-encoded on the site.
            links = [unquote(HOME_PAGE + href) for href in hrefs]
            save_append(links, LYRIC_TODO)
            save_append_line(thing, SONG_FIN)
        # BUG FIX: a bare ``except:`` also swallowed KeyboardInterrupt,
        # making the scraper impossible to stop cleanly.
        except Exception:
            errors.append(thing)

    save(list(set(errors)), SONG_ERRORS)
    print("--- SONG SCRAPING, FINISHED ---")
# Example #8
def scrape() -> None:
    """Scrape artist links from every unfinished category page.

    Saves one "<category>_artistlinks.txt" file per category under
    ARTIST_DIR, records failing pages in ARTIST_ERRORS, and writes the
    completed categories to ARTIST_FIN. Returns None.
    """
    print("--- ARTIST SCRAPING STARTED ---")
    errors = []
    todo, finished = scrape_setup(CATEGORY_FIN, ARTIST_FIN)
    total = len(todo)
    completed = 0

    for cat in todo:
        try:
            soup = get_soup(cat)
            art_hrefs = get_links(soup, "^artist")
            art_links = list(map(format_artist_link, art_hrefs))
            # parts[3] is the category segment of the url path.
            category = Path(cat).parts[3]
            text_file = (ARTIST_DIR + category + "_" + "artistlinks.txt")
            save(art_links, text_file)
            finished.append(cat)
        # BUG FIX: a bare ``except:`` also swallowed KeyboardInterrupt,
        # making the scraper impossible to stop cleanly.
        except Exception:
            errors.append(cat)
        completed += 1
        progress_bar(completed, total)
    save(errors, ARTIST_ERRORS)
    save(finished, ARTIST_FIN)
    print("--- ARTIST SCRAPING FINISHED ---")
def test_year_scraping_href_elements_regex_valid():
    """Regression test: the year-archive href pattern still matches live anchors."""
    year_soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    matches = get_links(year_soup, r"\/archive\/charts\/[0-9]*")
    assert len(matches) > 0