def get_youtube_video_data(video_qs, start):
    """Refresh author, publish date and visibility of stale videos.

    Repeatedly selects up to 50 videos from ``video_qs`` whose
    ``Video.date_updated`` is earlier than ``start``, requests them with
    ``batch_query`` and copies the result of every returned entry onto the
    matching video object.  The session is committed after each batch.

    Per entry, depending on ``batch_status.code``:

    * ``'200'`` -- ``source_username`` and ``date_published`` are set from
      the entry; the video is hidden if the entry carries a ``noembed``
      extension element.
    * ``'404'`` -- the video is hidden and, if it has no
      ``source_username`` yet, a ``'******'`` placeholder is stored.
    * anything else -- the reason is logged as a warning.

    In every case ``date_updated`` is set to the current time, which is what
    removes the video from the next batch selection.

    :param video_qs: query object supporting ``.filter(...).limit(...)`` and
        yielding objects with a ``source_videoid`` attribute.
    :param start: value compared against ``Video.date_updated``; only videos
        updated before it are selected.
    :returns: ``None``.
    :raises KeyError: if a returned entry id does not match any video of the
        current batch.
    """
    batch_size = 50
    fields = 'atom:entry(batch:status,atom:id,atom:author(name),atom:published,yt:noembed)'

    while True:
        stale = video_qs.filter(Video.date_updated < start).limit(batch_size)
        videos = {v.source_videoid: v for v in stale}
        if not videos:
            break

        feed_ids = [('videos', video_id) for video_id in videos]
        processed = 0
        for entry in batch_query(feed_ids, dict(fields=fields)).entry:
            # The trailing 11 characters of the entry id are used as the
            # lookup key into the current batch.
            video_id = entry.id.text[-11:]
            video = videos[video_id]
            video.date_updated = datetime.now()
            processed += 1

            status = entry.batch_status
            if status.code == '200':
                video.source_username = entry.author[0].name.text
                video.date_published = _parse_datetime(entry.published.text)
                if any(e.tag == 'noembed' for e in entry.extension_elements):
                    app.logger.info('%s: marked not visible: noembed', video_id)
                    video.visible = False
            elif status.code == '404':
                app.logger.info('%s: marked not visible: %s', video_id, status.reason)
                if not video.source_username:
                    video.source_username = '******'
                video.visible = False
            else:
                app.logger.warning('%s: %s', video_id, status.reason)
                # NOTE(review): nesting was reconstructed from a collapsed
                # source line; this pause is assumed to belong to the
                # fallback branch -- confirm.
                time.sleep(1)

        Video.query.session.commit()

        if not processed:
            # No entry came back, so no date_updated was advanced and the
            # same selection would be fetched again indefinitely.
            app.logger.warning(
                'No entries returned for %d videos; stopping refresh', len(videos))
            break

        # A full batch means more videos are probably pending; wait before
        # issuing the next request.
        if len(videos) == batch_size:
            time.sleep(60)
def import_google_movies():
    """Import recently published trailers found on Google Movies pages.

    For every ``(channelid, location)`` pair in
    ``app.config['GOOGLE_MOVIE_LOCATIONS']``:

    1. Pages built from ``app.config['GOOGLE_MOVIE_URL'] % (location, start)``
       are fetched, following the "Next" link until there is none, and the
       11-character YouTube video ids found in them are collected.
    2. Ids already present on the channel (via ``VideoInstance``) are dropped.
    3. The remaining ids are resolved with ``batch_query`` and each video is
       kept only if it was published after the freshness threshold and its
       title carries no ``(20xx)`` year older than the threshold's year.
    4. The kept videos are passed to ``Video.add_videos`` and
       ``channel.add_videos`` and the number added is logged.

    The freshness threshold is now minus ``GOOGLE_MOVIE_FRESHOLD`` days
    (default 120).

    :returns: ``None``.
    """
    threshold = datetime.now() - timedelta(
        days=app.config.get('GOOGLE_MOVIE_FRESHOLD', 120))
    # Raw strings: '\(' and '\d' are invalid escapes in a normal literal.
    year_format = re.compile(r' \((20\d\d)\)')
    video_link = re.compile(r'youtube\.com/watch%3Fv%3D(.{11})')
    next_link = re.compile(r'<a [^>]*start=(\d+)[^>]*><img[^>]*><br>Next</a>')

    for channelid, location in app.config['GOOGLE_MOVIE_LOCATIONS']:
        start = 0
        video_ids = set()
        channel = Channel.query.get(channelid)
        existing = set(v for v, in VideoInstance.query.
                       filter_by(channel=channelid).join(Video).values('source_videoid'))

        # Walk the paginated listing, collecting every linked video id.
        while True:
            url = app.config['GOOGLE_MOVIE_URL'] % (location, start)
            html = get_external_resource(url).read()
            video_ids.update(video_link.findall(html))
            next_page = next_link.search(html)
            if not next_page:
                break
            start = int(next_page.group(1))
            time.sleep(1)   # Don't get blocked by google

        feed_ids = [('videos', video_id) for video_id in video_ids - existing]
        if not feed_ids:
            continue

        playlist = batch_query(feed_ids, playlist='Googlemovietrailers/uploads')
        videos = []
        for video in playlist.videos:
            year_match = year_format.search(video.title)
            # A title without a year is accepted; one with a year must not
            # predate the threshold's year.
            recent_year = (not year_match or
                           int(year_match.group(1)) >= threshold.year)
            if video.date_published > threshold and recent_year:
                videos.append(video)
            else:
                app.logger.debug('Skipped import of trailer "%s" (%s)',
                                 video.title, video.date_published)

        # The second argument (1) is passed through unchanged; its meaning is
        # defined by Video.add_videos.
        added = Video.add_videos(videos, 1)
        channel.add_videos(videos)
        app.logger.info('Added %d trailers to "%s"', added, channel.title)