def scrape_vim_scripts_repos(num):
    """Scrape at least num repos from the vim-scripts GitHub user.

    num: minimum number of repos to scrape. The stop condition is only
        checked between pages, so the current page is always finished and
        slightly more than num repos may be processed.
    """
    _, user_data = get_api_page('users/vim-scripts')

    # Calculate how many pages of repositories there are.
    num_repos = user_data['public_repos']
    # Python 2 integer division; equivalent to ceil(num_repos / 100.0)
    # because the GitHub API returns up to 100 repos per page.
    num_pages = (num_repos + 99) / 100  # ceil(num_repos / 100.0)

    num_inserted = 0
    num_scraped = 0
    for page in range(1, num_pages + 1):
        # Checked only at page boundaries -- see docstring.
        if num_scraped >= num:
            break

        _, repos_data = get_api_page('users/vim-scripts/repos', page=page)
        for repo_data in repos_data:
            # Scrape plugin-relevant data. We don't need much info from
            # vim-scripts because it's a mirror of vim.org.

            # vimorg_id is required for associating with the corresponding
            # vim.org-scraped plugin.
            vimorg_id = util.get_vimorg_id_from_url(repo_data['homepage'])
            assert vimorg_id
            repo_name = repo_data['name']

            # Carry over any plugin-manager user count we already know
            # about this repo from previous dotfiles aggregation.
            repo = PluginGithubRepos.get_with_owner_repo(
                    'vim-scripts', repo_name)
            num_bundles = repo['plugin_manager_users'] if repo else 0

            db.plugins.add_scraped_data({
                'vimorg_id': vimorg_id,
                'github_vim_scripts_repo_name': repo_name,
                'github_vim_scripts_stars': repo_data['watchers'],
                'github_vim_scripts_bundles': num_bundles,
            })

            # Also add to our index of known GitHub plugins.
            inserted = PluginGithubRepos.upsert_with_owner_repo({
                'owner': 'vim-scripts',
                'repo_name': repo_name,
                'repo_data': repo_data,
            })
            num_inserted += int(inserted)
            num_scraped += 1

        print ' scraped %s repos' % num_scraped

    print "\nScraped %s vim-scripts GitHub repos; inserted %s new ones." % (
            num_scraped, num_inserted)
def scrape_vim_scripts_repos(num): """Scrape at least num repos from the vim-scripts GitHub user.""" _, user_data = get_api_page('users/vim-scripts') # Calculate how many pages of repositories there are. num_repos = user_data['public_repos'] num_pages = (num_repos + 99) / 100 # ceil(num_repos / 100.0) num_inserted = 0 num_scraped = 0 for page in range(1, num_pages + 1): if num_scraped >= num: break _, repos_data = get_api_page('users/vim-scripts/repos', page=page) for repo_data in repos_data: # Scrape plugin-relevant data. We don't need much info from # vim-scripts because it's a mirror of vim.org. # vimorg_id is required for associating with the corresponding # vim.org-scraped plugin. vimorg_id = _get_vimorg_id_from_url(repo_data['homepage']) assert vimorg_id repo_name = repo_data['name'] repo = PluginGithubRepos.get_with_owner_repo('vim-scripts', repo_name) num_bundles = repo['plugin_manager_users'] if repo else 0 db.plugins.add_scraped_data({ 'vimorg_id': vimorg_id, 'github_vim_scripts_repo_name': repo_name, 'github_vim_scripts_stars': repo_data['watchers'], 'github_vim_scripts_bundles': num_bundles, }) # Also add to our index of known GitHub plugins. inserted = PluginGithubRepos.upsert_with_owner_repo({ 'owner': 'vim-scripts', 'repo_name': repo_name, 'repo_data': repo_data, }) num_inserted += int(inserted) num_scraped += 1 print ' scraped %s repos' % num_scraped print "\nScraped %s vim-scripts GitHub repos; inserted %s new ones." % ( num_scraped, num_inserted)
def get_repos_from_vimorg_descriptions():
    """Extract URLs of GitHub repos from the long descriptions on vim.org."""
    print "Discovering GitHub repos from vim.org long descriptions ..."

    # A map of repo URL to the vimorg_ids they were found in.
    repo_urls_dict = collections.defaultdict(set)

    all_plugins = r.table('plugins').run(r_conn())
    for plugin in all_plugins:
        # Both free-text fields may mention a GitHub URL.
        for field in ['vimorg_long_desc', 'vimorg_install_details']:
            if field in plugin and plugin[field]:
                repo_urls = set(_extract_github_repo_urls(plugin[field]))
                vimorg_id = plugin['vimorg_id']
                # TODO(captbaritone) Re-enable this assertion once there is
                # only one plugin per `vimorg_id`
                # assert vimorg_id
                if not vimorg_id:
                    continue
                for repo_url in repo_urls:
                    repo_urls_dict[repo_url].add(vimorg_id)

    num_inserted = 0
    for repo_url, vimorg_ids in repo_urls_dict.iteritems():
        # Assumes repo_url has exactly three '/'-separated parts,
        # presumably "github.com/owner/repo_name" -- TODO confirm against
        # _extract_github_repo_urls.
        _, owner, repo_name = repo_url.split('/')
        inserted = PluginGithubRepos.upsert_with_owner_repo({
            'owner': owner,
            'repo_name': repo_name,
            'from_vim_scripts': list(vimorg_ids),
        })
        num_inserted += int(inserted)

    print "Found %s GitHub repos; inserted %s new ones." % (
            len(repo_urls_dict), num_inserted)
def get_repos_from_vimorg_descriptions(): """Extract URLs of GitHub repos from the long descriptions on vim.org.""" print "Discovering GitHub repos from vim.org long descriptions ..." # A map of repo URL to the vimorg_ids they were found in. repo_urls_dict = collections.defaultdict(set) all_plugins = r.table('plugins').run(r_conn()) for plugin in all_plugins: for field in ['vimorg_long_desc', 'vimorg_install_details']: if field in plugin and plugin[field]: repo_urls = set(_extract_github_repo_urls(plugin[field])) vimorg_id = plugin['vimorg_id'] assert vimorg_id for repo_url in repo_urls: repo_urls_dict[repo_url].add(vimorg_id) num_inserted = 0 for repo_url, vimorg_ids in repo_urls_dict.iteritems(): _, owner, repo_name = repo_url.split('/') inserted = PluginGithubRepos.upsert_with_owner_repo({ 'owner': owner, 'repo_name': repo_name, 'from_vim_scripts': list(vimorg_ids), }) num_inserted += int(inserted) print "Found %s GitHub repos; inserted %s new ones." % ( len(repo_urls_dict), num_inserted)
def aggregate_repos_from_dotfiles(): """Aggregate plugin references scraped from dotfiles repos on GitHub. Adds newly-discovered GitHub repos of plugins and also updates each GitHub plugin repo with the number of plugin manager users. Prints out some stats at the end. """ # Counter of how many users for each of Pathogen/Vundle/NeoBundle. users_counter = collections.Counter() # Counter of how many times a repo occurs. repos_counter = collections.Counter() # Counter of total bundles for each of Pathogen/Vundle/NeoBundle. manager_counter = collections.Counter() # Map of plugin manager name to column name in dotfiles_github_repos table. managers = { 'vundle': 'vundle_repos', 'pathogen': 'pathogen_repos', 'neobundle': 'neobundle_repos', 'plug': 'vimplug_repos', } query = r.table('dotfiles_github_repos').pluck(managers.values()) all_dotfiles = query.run(r_conn()) for dotfiles_repo in all_dotfiles: for manager, field in managers.iteritems(): plugin_repos = dotfiles_repo[field] users_counter[manager] += 1 if plugin_repos else 0 manager_counter[manager] += len(plugin_repos) for owner_repo in plugin_repos: # Normalize the GitHub URL fragment owner_repo = owner_repo.lower() repos_counter[owner_repo] += 1 num_inserted = 0 for owner_repo, num_users in repos_counter.iteritems(): owner, repo_name = owner_repo.split('/') repo = { 'owner': owner, 'repo_name': repo_name, 'plugin_manager_users': num_users, } newly_inserted = PluginGithubRepos.upsert_with_owner_repo(repo) num_inserted += int(newly_inserted) most_used_repos = '\n'.join(map(str, repos_counter.most_common(10))) print 'Most used plugins:', most_used_repos print 'Users per manager:', users_counter print 'Plugins per manager:', manager_counter print 'Dotfile repos scraped:', query.count().run(r_conn()) print 'New plugin repos inserted:', num_inserted print 'Unique plugin bundles found:', len(repos_counter) print 'Total plugin bundles found:', sum(manager_counter.values())
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    For each repo: fetches its data from the GitHub API, follows a rename
    redirect if the API 404s, records the scrape, tracks fork parentage,
    and feeds the aggregated plugin data into the plugins table.
    """
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})
    # We don't want to scrape forks that not many people use.
    query = query.filter(
        r.not_((r.row['is_fork'] == True) & (  # NOQA
            r.row['plugin_manager_users'] < MIN_FORK_USERS)),
        default=True)
    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')
    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')
    query = query.order_by('last_scraped_at').limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not
    #     found, # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head('https://github.com/%s/%s' % (repo_owner,
                    repo_name))
            if res.status_code == 301:
                location = res.headers.get('location')
                # Take the last two path segments as the new owner/name.
                _, redirect_owner, redirect_repo_name = location.rsplit(
                        '/', 2)
                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                        redirect_repo_name)

                # Make sure we insert the new location of the repo, which
                # will be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner': redirect_owner,
                    'repo_name': redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table('plugins').get_all([repo_owner, repo_name],
                        index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                        redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print 'not found.'

            # No plugin data to aggregate when the repo is gone/renamed.
            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        repo['repo_data'] = repo_data
        # Fall back to the stored repo_id when the API response lacks one.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner': repo_data['parent']['owner']['login'],
                'repo_name': repo_data['parent']['name'],
            })

        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (repo.get('plugin_manager_users', 0) <
                MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            #     repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            other_repos = r.table('plugin_github_repos').get_all(
                    '%s/%s' % (repo_owner, repo_name),
                    index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                # Don't double-count this repo's own users.
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get(
                        'plugin_manager_users', 0)

            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data, repo,
                    submission=repo.get('from_submission'))

            print 'done.'
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    For each repo: fetches its data from the GitHub API, follows a rename
    redirect if the API 404s, records the scrape, tracks fork parentage,
    and feeds the aggregated plugin data into the plugins table.
    """
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})
    # We don't want to scrape forks that not many people use.
    query = query.filter(r.not_((r.row['is_fork'] == True) & (
            r.row['plugin_manager_users'] < MIN_FORK_USERS)), default=True)
    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')
    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')
    query = query.order_by('last_scraped_at').limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not
    #     found, # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head('https://github.com/%s/%s' % (
                    repo_owner, repo_name))
            if res.status_code == 301:
                location = res.headers.get('location')
                # Take the last two path segments as the new owner/name.
                _, redirect_owner, redirect_repo_name = location.rsplit(
                        '/', 2)
                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                        redirect_repo_name)

                # Make sure we insert the new location of the repo, which
                # will be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner': redirect_owner,
                    'repo_name': redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table('plugins').get_all(
                        [repo_owner, repo_name], index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                        redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print 'not found.'

            # No plugin data to aggregate when the repo is gone/renamed.
            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        repo['repo_data'] = repo_data
        # Fall back to the stored repo_id when the API response lacks one.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner': repo_data['parent']['owner']['login'],
                'repo_name': repo_data['parent']['name'],
            })

        r.table('plugin_github_repos').insert(repo,
                conflict='replace').run(r_conn())

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (
                repo.get('plugin_manager_users', 0) < MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            #     repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            other_repos = r.table('plugin_github_repos').get_all(
                    '%s/%s' % (repo_owner, repo_name),
                    index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                # Don't double-count this repo's own users.
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get(
                        'plugin_manager_users', 0)

            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data, repo,
                    submission=repo.get('from_submission'))

            print 'done.'
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    For each repo: fetches its data from the GitHub API, follows a rename
    redirect if the API 404s (validating the redirect target looks like a
    GitHub repo URL), records the scrape, tracks fork parentage, and feeds
    the aggregated plugin data into the plugins table.
    """
    MIN_FORK_USERS = 3

    query = r.table("plugin_github_repos").filter({"is_blacklisted": False})
    # We don't want to scrape forks that not many people use.
    query = query.filter(
        r.not_((r.row["is_fork"] == True) & (r.row["plugin_manager_users"] < MIN_FORK_USERS)), default=True  # NOQA
    )
    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row["redirects_to"] == "")
    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row["owner"] != "vim-scripts")
    query = query.order_by("last_scraped_at").limit(num)
    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not
    #     found, # redirects, etc.
    for repo in repos:
        repo_name = repo["repo_name"]
        repo_owner = repo["owner"]

        # Print w/o newline.
        print " scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page("repos/%s/%s" % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:
            res = requests.head("https://github.com/%s/%s" % (repo_owner, repo_name))
            if res.status_code == 301:
                location = res.headers.get("location")

                # Guard against redirects that leave GitHub entirely (or
                # otherwise don't look like an owner/repo URL).
                valid_repo_url = re.compile("^https://github.com/[^/]+/[^/]+")
                if not valid_repo_url.match(location):
                    print "redirects to invalid GitHub repo URL: %s" % location
                    continue

                # Take the last two path segments as the new owner/name.
                _, redirect_owner, redirect_repo_name = location.rsplit("/", 2)
                repo["redirects_to"] = "%s/%s" % (redirect_owner, redirect_repo_name)

                # Make sure we insert the new location of the repo, which
                # will be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo(
                    {
                        "owner": redirect_owner,
                        "repo_name": redirect_repo_name,
                        # TODO(david): Should append to a list
                        "redirects_from": ("%s/%s" % (repo_owner, repo_name)),
                    }
                )

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table("plugins").get_all([repo_owner, repo_name], index="github_owner_repo")
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin["github_owner"] = redirect_owner
                    db_plugin["github_repo_name"] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict="replace")

                print "redirects to %s/%s." % (redirect_owner, redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                # this is not found
                print "not found."

            # No plugin data to aggregate when the repo is gone/renamed.
            plugin_data = None
        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        repo["repo_data"] = repo_data
        # Fall back to the stored repo_id when the API response lacks one.
        repo["repo_id"] = str(repo_data.get("id", repo["repo_id"]))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get("fork"):
            repo["is_fork"] = True
            PluginGithubRepos.upsert_with_owner_repo(
                {"owner": repo_data["parent"]["owner"]["login"], "repo_name": repo_data["parent"]["name"]}
            )

        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get("fork") and (repo.get("plugin_manager_users", 0) < MIN_FORK_USERS):
            print "skipping fork of %s" % repo_data["parent"]["full_name"]
            continue

        if plugin_data:
            # Insert the number of plugin manager users across all
            # names/owners of this repo.
            # TODO(david): Try to also use repo_id for this (but not all
            #     repos have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get("plugin_manager_users", 0)
            other_repos = (
                r.table("plugin_github_repos")
                .get_all("%s/%s" % (repo_owner, repo_name), index="redirects_to")
                .run(r_conn())
            )
            for other_repo in other_repos:
                # Don't double-count this repo's own users.
                if other_repo["id"] == repo["id"]:
                    continue
                plugin_manager_users += other_repo.get("plugin_manager_users", 0)

            plugin_data["github_bundles"] = plugin_manager_users

            if repo.get("from_submission"):
                _add_submission_data(plugin_data, repo["from_submission"])

            db.plugins.add_scraped_data(plugin_data, repo, submission=repo.get("from_submission"))

            print "done."