def pull_cmd(cfg, parser, parsed, args):
    """Pull every video of the configured category and save it as JSON.

    ``username``, ``api_url`` and ``category`` are read with
    ``get_from_config``. The api key is taken from ``parsed.apikey`` when
    set, otherwise from ``api_key`` in the ``[project]`` config section.

    :arg cfg: project configuration object
    :arg parser: command line parser; only ``print_byline()`` is called
    :arg parsed: parsed options; ``quiet`` and ``apikey`` are read
    :arg args: not used by this command

    :returns: 0 on success, 1 when a setting is missing or the category
        does not exist
    """
    if not parsed.quiet:
        parser.print_byline()

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    # Name the empty settings instead of exiting silently with status 1.
    # api_key is known to be set at this point, so it is not re-checked.
    required = (
        ('username', username),
        ('api_url', api_url),
        ('category', cat_title),
    )
    missing = [name for name, value in required if not value]
    if missing:
        err('Missing required setting(s): {0}.'.format(', '.join(missing)))
        return 1

    api = steve.restapi.API(api_url)

    # NOTE(review): limit=0 presumably asks the API for the unpaged list of
    # categories -- confirm against the server's API.
    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=api_key, limit=0))
    matches = [cat_item for cat_item in all_categories['objects']
               if cat_item['title'] == cat_title]
    if not matches:
        err('Category "{0}" does not exist.'.format(cat_title))
        return 1

    # Get the category from the list of 1.
    cat = matches[0]
    out('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the video url; stripping the
        # slash first makes this work with or without a trailing slash.
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=api_key))
        out('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter: a plain width of 4 pads with spaces, which
        # puts blanks in the file name.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    out('Saving files....')
    save_json_files(cfg, data)

    return 0
} return item if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--maxresults", help="Max results", default=50) parser.add_argument("-c", "--channel", action='store_true', help="YouTube channel id") parser.add_argument("-p", "--playlist", action='store_true', help="YouTube playlist id") args = parser.parse_args() cfg = get_project_config() scraper = YouTubeScraper(cfg, max_results=args.maxresults) if args.channel: channel_id = get_from_config(cfg, 'channel_id', 'youtube') print("scraping channel {}".format(channel_id)) data = scraper.scrape_channel(channel_id) save_json_files(cfg, data) elif args.playlist: playlist_id = get_from_config(cfg, 'playlist_id', 'youtube') print("scraping playlist {}".format(playlist_id)) data = scraper.scrape_playlist(playlist_id) save_json_files(cfg, data) else: print("nothing to do. no channel or playlist requested")
def pull_cmd(cfg, parser, parsed, args):
    """Pull every video of the configured category and save it as JSON.

    ``username``, ``api_url`` and ``category`` are read with
    ``get_from_config``. The api key is taken from ``parsed.apikey`` when
    set, otherwise from ``api_key`` in the ``[project]`` config section.

    :arg cfg: project configuration object
    :arg parser: command line parser; only ``print_byline()`` is called
    :arg parsed: parsed options; ``quiet`` and ``apikey`` are read
    :arg args: not used by this command

    :returns: 0 on success, 1 when a setting is missing or the category
        does not exist
    """
    if not parsed.quiet:
        parser.print_byline()

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    api_key = parsed.apikey
    if not api_key:
        try:
            api_key = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not api_key:
        err('Specify an api key either in steve.ini, on command line, '
            'or in API_KEY file.')
        return 1

    # Name the empty settings instead of exiting silently with status 1.
    # api_key is known to be set at this point, so it is not re-checked.
    required = (
        ('username', username),
        ('api_url', api_url),
        ('category', cat_title),
    )
    missing = [name for name, value in required if not value]
    if missing:
        err('Missing required setting(s): {0}.'.format(', '.join(missing)))
        return 1

    api = steve.restapi.API(api_url)

    # NOTE(review): limit=0 presumably asks the API for the unpaged list of
    # categories -- confirm against the server's API.
    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=api_key, limit=0))
    matches = [
        cat_item for cat_item in all_categories['objects']
        if cat_item['title'] == cat_title
    ]
    if not matches:
        err('Category "{0}" does not exist.'.format(cat_title))
        return 1

    # Get the category from the list of 1.
    cat = matches[0]
    out('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the video url; stripping the
        # slash first makes this work with or without a trailing slash.
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=api_key))
        out('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri', ):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter: a plain width of 4 pads with spaces, which
        # puts blanks in the file name.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    out('Saving files....')
    save_json_files(cfg, data)

    return 0
def pull(cfg, ctx, quiet, apikey):
    """Pulls data from a richard instance."""
    # NOTE(review): the docstring is kept to its original single line
    # because click presumably shows it as the command's help text -- the
    # decorators are outside this view.
    #
    # cfg    -- project configuration object
    # ctx    -- not used in this function
    # quiet  -- when true, the version banner is not echoed
    # apikey -- api key from the command line; falls back to ``api_key`` in
    #           the ``[project]`` config section when empty
    #
    # Raises click.ClickException when a setting is missing or the
    # configured category does not exist.
    if not quiet:
        click.echo(VERSION)

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not username or not api_url or not cat_title or not apikey:
        # The check covers the category too, so the message names it.
        raise click.ClickException(
            u'Missing username, api_url, category or api_key.')

    api = steve.restapi.API(api_url)

    # NOTE(review): limit=0 presumably asks the API for the unpaged list of
    # categories -- confirm against the server's API.
    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=apikey, limit=0))
    matches = [cat_item for cat_item in all_categories['objects']
               if cat_item['title'] == cat_title]
    if not matches:
        raise click.ClickException(
            u'Category "{0}" does not exist.'.format(cat_title))

    # Get the category from the list of 1.
    cat = matches[0]
    click.echo('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        # The id is the last path segment of the video url; stripping the
        # slash first makes this work with or without a trailing slash.
        video_id = video_url.rstrip('/').split('/')[-1]

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=apikey))
        click.echo('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter: a plain width of 4 pads with spaces, which
        # puts blanks in the file name.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    click.echo('Saving files....')
    save_json_files(cfg, data)
videos = category['videos']


# In[25]:

# Fetch the JSON behind every video url and pair it with an "<id>.json"
# file name, the (filename, data) shape handed to save_json_files below.
# Kept as a plain loop so v, r and j stay defined for later cells.
stevedata = []
for v in videos:
    r = requests.get(v)
    j = r.json()
    stevedata.append(('%s.json' % j['id'], j))


# In[27]:

save_json_files(cfg, stevedata)


# In[30]:

# Parse inside a with-block so the file handle is closed once the soup is
# built, instead of being left open for the rest of the session.
with open('talks.html') as talks_file:
    soup = BeautifulSoup(talks_file)


# In[38]:

proposals_soup = soup.find_all('div', class_='proposal_list_summary')


# In[41]:

psoup = proposals_soup[0]
def pull(cfg, ctx, quiet, apikey):
    """Pulls data from a richard instance."""
    # NOTE(review): the docstring is kept to its original single line
    # because click presumably shows it as the command's help text -- the
    # decorators are outside this view.
    #
    # cfg    -- project configuration object
    # ctx    -- not used in this function
    # quiet  -- when true, the version banner is not echoed
    # apikey -- api key from the command line; falls back to ``api_key`` in
    #           the ``[project]`` config section when empty
    #
    # Raises click.ClickException when a setting is missing or the
    # configured category does not exist.
    if not quiet:
        click.echo(VERSION)

    username = get_from_config(cfg, 'username')
    api_url = get_from_config(cfg, 'api_url')
    cat_title = get_from_config(cfg, 'category')

    # Command line api_key overrides config-set api_key
    if not apikey:
        try:
            apikey = cfg.get('project', 'api_key')
        except ConfigParser.NoOptionError:
            pass
    if not apikey:
        raise click.ClickException(
            u'Specify an api key either in {0}, on command line, '
            u'or in API_KEY file.'.format(get_project_config_file_name())
        )

    if not username or not api_url or not cat_title or not apikey:
        # The check covers the category too, so the message names it.
        raise click.ClickException(
            u'Missing username, api_url, category or api_key.')

    api = steve.restapi.API(api_url)

    # NOTE(review): limit=0 presumably asks the API for the unpaged list of
    # categories -- confirm against the server's API.
    all_categories = steve.restapi.get_content(
        api.category.get(username=username, api_key=apikey, limit=0))
    matches = [cat_item for cat_item in all_categories['objects']
               if cat_item['title'] == cat_title]
    if not matches:
        raise click.ClickException(
            u'Category "{0}" does not exist.'.format(cat_title))

    # Get the category from the list of 1.
    cat = matches[0]
    click.echo('Retrieved category.')

    data = []
    for counter, video_url in enumerate(cat['videos']):
        video_id = get_video_id(video_url)

        video_data = steve.restapi.get_content(
            api.video(video_id).get(username=username, api_key=apikey))
        click.echo('Working on "{0}"'.format(video_data['slug']))

        # Nix some tastypie bits from the data.
        for bad_key in ('resource_uri',):
            video_data.pop(bad_key, None)

        # Add id.
        video_data['id'] = video_id

        # Zero-pad the counter: a plain width of 4 pads with spaces, which
        # puts blanks in the file name.
        fn = 'json/{0:04d}_{1}.json'.format(counter, video_data['slug'])
        data.append((fn, video_data))

    click.echo('Saving files....')
    save_json_files(cfg, data)