def scrape(): video_ids = random_songs(NUM_SONGS) aggr = SongAggregateScraper() aggr.scrape_video_ids(video_ids, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS, FILTER_LIST) aggr.print_summary() aggr.save_to_file(SAVEFILE)
def test_scrape_5m(self): video_ids = random_songs(LIMIT, 0) aggr = SongAggregateScraper() aggr.scrape_video_ids(video_ids, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS, FILTER_LIST) #aggr.print_summary() self.assertTrue( len(aggr.get_passing_songs()) >= 2, "5 million youtube scraper")
# Project Libraries import filter as fltr from song_aggregate import SongAggregateScraper from five_million import random_songs # General Libraries """ Scrapes comments from a subset of 5 million YouTube song dataset """ BASEDIR = '../../../../Thesis/data_sample/W/D' MIN_NUM_COMMENTS = 40 MAX_NUM_COMMENTS = 80 NUM_SONGS = 100 filter_list = [fltr.english_filter] video_ids = random_songs(NUM_SONGS) aggr_5m = SongAggregateScraper() aggr_5m.scrape_video_ids(video_ids, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS, filter_list) aggr_echo = SongAggregateScraper() aggr_echo.scrape_from_echonest(BASEDIR, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS, filter_list, limit=NUM_SONGS) aggr_5m.print_summary() aggr_echo.print_summary()