def get_iterator(self):
    """Build a vidscraper feed iterator for this feed's source URL.

    The stored etag / last-modified values are forwarded so the remote
    service can short-circuit with a "not modified" response when the
    feed has not changed since the last fetch.
    """
    conditional_etag = self.external_etag or None
    return vidscraper.auto_feed(
        self.original_url,
        max_results=None,
        api_keys=get_api_keys(),
        etag=conditional_etag,
        last_modified=self.external_last_modified,
    )
def fetch_cmd(cfg, parser, parsed, args): if not parsed.quiet: parser.print_byline() projectpath = cfg.get('project', 'projectpath') jsonpath = os.path.join(projectpath, 'json') if not os.path.exists(jsonpath): os.makedirs(jsonpath) try: url = cfg.get('project', 'url') except ConfigParser.NoOptionError: url = '' if not url: err('url not specified in steve.ini project config file.') err('Add "url = ..." to [project] section of steve.ini file.') return 1 if 'youtube' in url: try: youtube_embed = YOUTUBE_EMBED[cfg.get('project', 'youtube_embed')] except KeyError: err('youtube_embed must be either "iframe" or "object".') return 1 else: youtube_embed = None out('Scraping {0}...'.format(url)) video_feed = vidscraper.auto_feed(url) video_feed.load() print 'Found {0} videos...'.format(video_feed.video_count) for i, video in enumerate(video_feed): if video.title: filename = video.title.replace(' ', '_') filename = ''.join([c for c in filename if c in ALLOWED_LETTERS]) filename = '_' + filename else: filename = '' filename = '{0:04d}{1}.json'.format(i, filename[:40]) print 'Working on {0}... ({1})'.format( unicodedata.normalize('NFKD', video.title).encode('ascii', 'ignore'), filename) item = vidscraper_to_dict(video, youtube_embed=youtube_embed) f = open(os.path.join('json', filename), 'w') f.write(convert_to_json(item)) f.close() # TODO: what if there's a file there already? on the first one, # prompt the user whether to stomp on existing files or skip. return 0
def fetch_cmd(cfg, parser, parsed, args): if not parsed.quiet: parser.print_byline() projectpath = cfg.get('project', 'projectpath') jsonpath = os.path.join(projectpath, 'json') if not os.path.exists(jsonpath): os.makedirs(jsonpath) try: url = cfg.get('project', 'url') except ConfigParser.NoOptionError: url = '' if not url: err('url not specified in steve.ini project config file.') err('Add "url = ..." to [project] section of steve.ini file.') return 1 if 'youtube' in url: try: youtube_embed = YOUTUBE_EMBED[cfg.get('project', 'youtube_embed')] except KeyError: err('youtube_embed must be either "iframe" or "object".') return 1 else: youtube_embed = None out('Scraping {0}...'.format(url)) video_feed = vidscraper.auto_feed(url) video_feed.load() print 'Found {0} videos...'.format(video_feed.video_count) for i, video in enumerate(video_feed): if video.title: filename = video.title.replace(' ', '_') filename = ''.join([c for c in filename if c in ALLOWED_LETTERS]) filename = '_' + filename else: filename = '' filename = '{0:04d}{1}.json'.format(i, filename[:40]) print 'Working on {0}... ({1})'.format( unicodedata.normalize('NFKD', video.title).encode( 'ascii', 'ignore'), filename) item = vidscraper_to_dict(video, youtube_embed=youtube_embed) f = open(os.path.join('json', filename), 'w') f.write(convert_to_json(item)) f.close() # TODO: what if there's a file there already? on the first one, # prompt the user whether to stomp on existing files or skip. return 0
def clean_feed_url(self):
    """Canonicalize the submitted feed URL via vidscraper and reject
    it if a Feed with that URL is already registered for this site."""
    submitted = self.cleaned_data['feed_url']

    # Ask vidscraper for the canonical form of the URL.
    canonical = auto_feed(submitted, api_keys=API_KEYS).url

    try:
        models.Feed.objects.get(feed_url=canonical, site=settings.SITE_ID)
    except models.Feed.DoesNotExist:
        # No duplicate found -- the canonical URL is acceptable.
        return canonical
    raise ValidationError("Feed with this URL already exists.")
def test_auto_feed(self):
    """auto_feed() resolves a YouTube user page to its uploads feed
    and loads its metadata (network-dependent test)."""
    expected_url = ('http://gdata.youtube.com/feeds/base/users/'
                    'AssociatedPress/uploads?alt=rss&v=2')
    feed = auto_feed("http://youtube.com/AssociatedPress")
    self.assertEqual(feed.url, expected_url)

    feed.load()

    self.assertEqual(feed.title, 'Uploads by AssociatedPress')
    self.assertEqual(
        feed.thumbnail_url,
        'http://www.youtube.com/img/pic_youtubelogo_123x63.gif')
    self.assertTrue('AssociatedPress' in feed.webpage)
    self.assertTrue(feed.entry_count > 50000)
def fetch_videos_from_url(url, youtube_embed=None):
    """Fetches video data from given url and returns array of dicts

    :arg url: The url to fetch data from
    :arg youtube_embed: optional embed style forwarded to
        vidscraper_to_dict (defaults to None)

    :returns: list of richard-ish dicts

    Example:

    >>> fetch_videos_from_url('http://www.youtube.com/user/PyConDE/videos')
    [...]

    """
    video_feed = vidscraper.auto_feed(url)
    video_feed.load()
    # Pass youtube_embed by keyword for consistency with the other
    # vidscraper_to_dict call sites in this codebase.
    return [vidscraper_to_dict(vid, youtube_embed=youtube_embed)
            for vid in video_feed]
def clean_feed_url(self):
    """Validate feed_url: it must be identifiable as an RSS/Atom feed
    and must not already exist on the current site.  Stashes the
    scraped feed object for later use by the save path."""
    url = self.cleaned_data['feed_url']

    try:
        scraped_feed = auto_feed(url)
        url = scraped_feed.url
    except CantIdentifyUrl:
        raise forms.ValidationError('It does not appear that %s is an '
                                    'RSS/Atom feed URL.' % url)

    current_site = Site.objects.get_current()
    duplicates = models.Feed.objects.filter(feed_url=url, site=current_site)
    if duplicates:
        raise forms.ValidationError(
            'That feed already exists on this site.')

    self.cleaned_data['scraped_feed'] = scraped_feed
    return url
def test_auto_feed(self):
    """auto_feed() with max_results keeps the original URL, extracts
    the username, and caps the number of yielded videos
    (network-dependent test)."""
    limit = 20
    feed = auto_feed("http://youtube.com/AssociatedPress",
                     max_results=limit)
    self.assertEqual(feed.url, "http://youtube.com/AssociatedPress")
    self.assertEqual(feed.url_data, {'username': '******'})

    feed.load()

    self.assertEqual(feed.title, 'Uploads by AssociatedPress')
    self.assertEqual(
        feed.thumbnail_url,
        'http://www.youtube.com/img/pic_youtubelogo_123x63.gif')
    # YouTube changes this sometimes, so just make sure it's there
    self.assertTrue(feed.webpage)
    self.assertTrue(feed.etag is not None)
    self.assertTrue(feed.video_count > 55000)
    self.assertEqual(feed.guid,
                     u'tag:youtube.com,2008:user:AssociatedPress:uploads')
    self.assertEqual(len(list(feed)), limit)