def scrape(self, tries_remaining=5):
    """
    Search YouTube for the song and pick a result from the scraped videos.

    The search text is ``self.raw_song`` when ``self.meta_tags`` is None;
    otherwise it is built by ``internals.generate_songname`` from
    ``const.args.file_format`` and ``self.meta_tags``.

    Args:
        tries_remaining: Number of attempts still allowed. The whole search
            is repeated whenever a result lacks a duration element.

    Returns:
        Whatever ``self._best_match(videos)`` returns for the collected
        results, or None once ``tries_remaining`` has reached 0.
    """
    # prevents an infinite loop but allows for a few retries
    if tries_remaining == 0:
        log.debug('No tries left. I quit.')
        return None

    if self.meta_tags is None:
        song = self.raw_song
    else:
        song = internals.generate_songname(const.args.file_format,
                                           self.meta_tags)
    search_url = generate_search_url(song)
    log.debug('Opening URL: {0}'.format(search_url))

    item = urllib.request.urlopen(search_url).read()
    items_parse = BeautifulSoup(item, "html.parser")

    videos = []
    for x in items_parse.find_all(
            'div', {'class': 'yt-lockup-dismissable yt-uix-tile'}):
        if not is_video(x):
            continue

        anchor = x.find('div', class_='yt-lockup-content').find('a')
        # keep only the last 11 characters of the href
        # (presumably the video id -- confirm against YouTube's URL format)
        link = anchor['href'][-11:]
        title = anchor['title']

        try:
            videotime = x.find('span', class_="video-time").get_text()
        except AttributeError:
            # find() returned None: the result has no duration element.
            # Retry through this same method so the retried call also goes
            # through _best_match and returns the same kind of value.
            log.debug(
                'Could not find video duration on YouTube, retrying..')
            return self.scrape(tries_remaining=tries_remaining - 1)

        videos.append({
            'link': link,
            'title': title,
            'videotime': videotime,
            'seconds': internals.get_sec(videotime),
        })

        # without metadata only the first usable result is collected
        if self.meta_tags is None:
            break

    return self._best_match(videos)
def test_raise_error(self):
    """get_sec must raise ValueError for stamps with unsupported separators."""
    for malformed in ('10*05', '02,28,46'):
        with pytest.raises(ValueError):
            internals.get_sec(malformed)
def test_from_hours(self):
    """An hours:minutes:seconds stamp converts to 5405 seconds."""
    expected = 5405
    for stamp in ('1.30.05', '1:30:05'):
        assert internals.get_sec(stamp) == expected
def test_from_minutes(self):
    """A minutes:seconds stamp converts to 213 seconds."""
    expected = 213
    for stamp in ('3.33', '3:33'):
        assert internals.get_sec(stamp) == expected
def test_from_seconds(self):
    """A stamp with zero minutes converts to 45 seconds."""
    expected = 45
    for stamp in ('0:45', '0.45'):
        assert internals.get_sec(stamp) == expected
def generate_youtube_url(raw_song, meta_tags, tries_remaining=5):
    """
    Search for the song on YouTube and generate a URL to its video.

    Args:
        raw_song: Search text used when ``meta_tags`` is None.
        meta_tags: Song metadata, or None. When given, the keys
            'duration_ms', 'name' and 'artists' are read here.
        tries_remaining: Number of attempts still allowed. The whole search
            is repeated whenever a result lacks a duration element.

    Returns:
        'http://youtube.com' followed by the chosen video's link, or None
        when the retries are used up, no video was scraped, the manual
        selection returned None, or no video is within the duration
        tolerance.
    """
    # prevents an infinite loop but allows for a few retries
    if tries_remaining == 0:
        log.debug('No tries left. I quit.')
        return None

    if meta_tags is None:
        song = raw_song
        search_url = internals.generate_search_url(song, viewsort=False)
    else:
        song = generate_songname(meta_tags)
        search_url = internals.generate_search_url(song, viewsort=True)
    log.debug('Opening URL: {0}'.format(search_url))

    item = urllib.request.urlopen(search_url).read()
    items_parse = BeautifulSoup(item, "html.parser")

    videos = []
    for x in items_parse.find_all(
            'div', {'class': 'yt-lockup-dismissable yt-uix-tile'}):
        if not is_video(x):
            continue

        anchor = x.find('div', class_='yt-lockup-content').find('a')
        link = anchor['href']
        title = anchor['title']

        try:
            videotime = x.find('span', class_="video-time").get_text()
        except AttributeError:
            # find() returned None: the result has no duration element
            log.debug('Could not find video duration on YouTube, retrying..')
            return generate_youtube_url(raw_song, meta_tags,
                                        tries_remaining - 1)

        videos.append({
            'link': link,
            'title': title,
            'videotime': videotime,
            'seconds': internals.get_sec(videotime),
        })

        # without metadata only the first usable result is collected
        if meta_tags is None:
            break

    if not videos:
        return None

    log.debug(pprint.pformat(videos))

    if args.manual:
        log.info(song)
        log.info('0. Skip downloading this song.\n')
        # fetch all video links on first page on YouTube
        for i, v in enumerate(videos):
            log.info(u'{0}. {1} {2} {3}'.format(
                i + 1, v['title'], v['videotime'],
                "http://youtube.com" + v['link']))
        # let user select the song to download
        result = internals.input_link(videos)
        if result is None:
            return None
    elif meta_tags is None:
        # if the metadata could not be acquired, take the first result
        # from Youtube because the proper song length is unknown
        result = videos[0]
        log.debug(
            'Since no metadata found on Spotify, going with the first result'
        )
    else:
        # filter out videos that do not have a similar length to the
        # Spotify song
        duration_tolerance = 10
        max_duration_tolerance = 20
        target_seconds = int(meta_tags['duration_ms']) / 1000

        # Start with a reasonable duration_tolerance and widen it one second
        # at a time until a YouTube result falls within it or the tolerance
        # would exceed max_duration_tolerance. The match check comes before
        # the increment so that a video matching at exactly
        # max_duration_tolerance is accepted rather than thrown away.
        possible_videos_by_duration = []
        while duration_tolerance <= max_duration_tolerance:
            possible_videos_by_duration = [
                v for v in videos
                if abs(v['seconds'] - target_seconds) <= duration_tolerance
            ]
            if possible_videos_by_duration:
                break
            duration_tolerance += 1

        if not possible_videos_by_duration:
            log.error("{0} by {1} was not found.\n".format(
                meta_tags['name'], meta_tags['artists'][0]['name']))
            return None

        result = possible_videos_by_duration[0]

    if result:
        full_link = u'http://youtube.com{0}'.format(result['link'])
    else:
        full_link = None

    log.debug('Best matching video link: {}'.format(full_link))
    return full_link