def scrape_vim_scripts_repos(num):
    """Scrape repos of the vim-scripts GitHub user until num are processed.

    Pages of the user's repository listing are fetched one at a time. A page
    that has been fetched is always processed in full, so the final count can
    exceed ``num``; no further page is requested once the count reaches it.

    For every repo the vim.org id is derived from the repo's homepage URL and
    the star/bundle figures are handed to ``db.plugins.add_scraped_data``.
    The repo is also upserted into the index of known GitHub plugin repos.

    Raises AssertionError if no vim.org id can be derived for a repo.
    """
    _, account = get_api_page('users/vim-scripts')

    # The page count assumes 100 repos per page; round up so a partially
    # filled last page is still visited.
    page_count = (account['public_repos'] + 99) // 100

    scraped_total = 0
    inserted_total = 0

    page = 1
    while page <= page_count and scraped_total < num:
        _, listing = get_api_page('users/vim-scripts/repos', page=page)

        for entry in listing:
            # vim-scripts mirrors vim.org, so little data is needed from it.
            # The vim.org id is what ties this repo to the plugin record
            # produced by the vim.org scrape, hence it is mandatory.
            vimorg_id = util.get_vimorg_id_from_url(entry['homepage'])
            assert vimorg_id

            name = entry['name']

            # Bundle counts come from our own record of the repo, if any.
            known = PluginGithubRepos.get_with_owner_repo('vim-scripts', name)
            if known:
                bundles = known['plugin_manager_users']
            else:
                bundles = 0

            db.plugins.add_scraped_data({
                'vimorg_id': vimorg_id,
                'github_vim_scripts_repo_name': name,
                'github_vim_scripts_stars': entry['watchers'],
                'github_vim_scripts_bundles': bundles,
            })

            # Keep the index of known GitHub plugin repos up to date as well.
            was_inserted = PluginGithubRepos.upsert_with_owner_repo({
                'owner': 'vim-scripts',
                'repo_name': name,
                'repo_data': entry,
            })

            inserted_total += int(was_inserted)
            scraped_total += 1

        # Progress line after each page.
        print(' scraped %s repos' % scraped_total)
        page += 1

    print("\nScraped %s vim-scripts GitHub repos; inserted %s new ones." % (
        scraped_total, inserted_total))
def review_vimorg_submission(submission):
    """Interactively accept or reject a submission that links to vim.org.

    The submission is pretty-printed as JSON and the operator is asked
    whether its data should be used.

    * On "no", ``submission['rejected']`` is set to True.
    * On "yes", ``submission['vimorg_id']`` is set to the id parsed from
      ``submission['vimorg-link']``, which makes the submission findable by
      vimorg_id so its data can be added when scraping vim.org plugins.

    In both cases the (mutated) submission is written back to the
    ``submitted_plugins`` table, replacing any stored document that has the
    same primary key.

    Args:
        submission: dict describing the submitted plugin; modified in place.
    """
    # Blank line to separate this submission from earlier output.
    print("")
    print(json.dumps(submission, indent=2))

    if _query_yes_no("Add info about this vim.org submission?"):
        print("Ok, will update from this submission data on next vim.org "
              "scrape")
        vimorg_id = util.get_vimorg_id_from_url(submission['vimorg-link'])
        submission['vimorg_id'] = vimorg_id
    else:
        submission['rejected'] = True

    # conflict='replace' is the current spelling of the legacy `upsert=True`
    # insert option: an existing document with the same key is overwritten.
    r.table('submitted_plugins').insert(
        submission, conflict='replace').run(r_conn())
def review_vimorg_submission(submission):
    """Ask the operator whether a vim.org-linked submission should be used.

    Prints the submission as indented JSON, then prompts for a yes/no answer.

    If the answer is no, the submission is flagged with ``rejected = True``.
    If the answer is yes, a ``vimorg_id`` field (parsed from the submission's
    ``vimorg-link``) is added so the submission can be looked up and merged
    when vim.org plugins are scraped.

    Either way the updated submission is stored in the ``submitted_plugins``
    table, replacing the existing document on a key conflict. The dict passed
    in is modified in place.
    """
    # Empty line first so consecutive submissions are visually separated.
    print("")
    print(json.dumps(submission, indent=2))

    accepted = _query_yes_no("Add info about this vim.org submission?")

    if accepted:
        print("Ok, will update from this submission data on next vim.org "
              "scrape")
        submission['vimorg_id'] = util.get_vimorg_id_from_url(
            submission['vimorg-link'])
    else:
        submission['rejected'] = True

    # Both outcomes persist the submission the same way.
    table = r.table('submitted_plugins')
    table.insert(submission, conflict='replace').run(r_conn())
def scrape_vim_scripts_repos(num):
    """Scrape at least num repos belonging to the vim-scripts GitHub user.

    The user's public repo count determines how many listing pages exist.
    Pages are then fetched in order, and each fetched page is processed in
    full before checking whether ``num`` repos have been reached.

    Each repo contributes its name, watcher count and our recorded number of
    plugin-manager users to the plugin identified by the vim.org id found in
    the repo's homepage URL. The repo is also upserted into the index of
    known GitHub plugin repos.

    Raises AssertionError when a repo's homepage yields no vim.org id.
    """
    owner = 'vim-scripts'

    _, user_info = get_api_page('users/vim-scripts')

    # Integer ceiling of repo_count / 100: the number of listing pages.
    repo_count = user_info['public_repos']
    last_page = (repo_count + 99) // 100

    n_new = 0
    n_done = 0

    for page_no in range(1, last_page + 1):
        # Stop requesting pages once enough repos have been handled.
        if not n_done < num:
            break

        _, page_repos = get_api_page('users/vim-scripts/repos', page=page_no)

        for gh_repo in page_repos:
            # Only a little data is taken from vim-scripts since it mirrors
            # vim.org. The vim.org id is required to associate the repo with
            # the corresponding vim.org-scraped plugin.
            vimorg_id = util.get_vimorg_id_from_url(gh_repo['homepage'])
            assert vimorg_id

            repo_name = gh_repo['name']
            stored = PluginGithubRepos.get_with_owner_repo(owner, repo_name)
            bundle_users = stored['plugin_manager_users'] if stored else 0

            db.plugins.add_scraped_data({
                'vimorg_id': vimorg_id,
                'github_vim_scripts_repo_name': repo_name,
                'github_vim_scripts_stars': gh_repo['watchers'],
                'github_vim_scripts_bundles': bundle_users,
            })

            # Record the repo in the index of known GitHub plugins too.
            newly_added = PluginGithubRepos.upsert_with_owner_repo({
                'owner': owner,
                'repo_name': repo_name,
                'repo_data': gh_repo,
            })

            n_new += int(newly_added)
            n_done += 1

        # Running total, reported once per processed page.
        print(' scraped %s repos' % n_done)

    summary = "\nScraped %s vim-scripts GitHub repos; inserted %s new ones."
    print(summary % (n_done, n_new))