Пример #1
0
def scrape_vim_scripts_repos(num):
    """Scrape at least num repos from the vim-scripts GitHub user."""
    _, user_data = get_api_page('users/vim-scripts')

    # Calculate how many pages of repositories there are.
    num_repos = user_data['public_repos']
    num_pages = (num_repos + 99) / 100  # ceil(num_repos / 100.0)

    num_inserted = 0
    num_scraped = 0

    for page in range(1, num_pages + 1):
        if num_scraped >= num:
            break

        _, repos_data = get_api_page('users/vim-scripts/repos', page=page)

        for repo_data in repos_data:

            # Scrape plugin-relevant data. We don't need much info from
            # vim-scripts because it's a mirror of vim.org.

            # vimorg_id is required for associating with the corresponding
            # vim.org-scraped plugin.
            vimorg_id = util.get_vimorg_id_from_url(repo_data['homepage'])
            assert vimorg_id

            repo_name = repo_data['name']

            repo = PluginGithubRepos.get_with_owner_repo('vim-scripts',
                    repo_name)
            num_bundles = repo['plugin_manager_users'] if repo else 0

            db.plugins.add_scraped_data({
                'vimorg_id': vimorg_id,
                'github_vim_scripts_repo_name': repo_name,
                'github_vim_scripts_stars': repo_data['watchers'],
                'github_vim_scripts_bundles': num_bundles,
            })

            # Also add to our index of known GitHub plugins.
            inserted = PluginGithubRepos.upsert_with_owner_repo({
                'owner': 'vim-scripts',
                'repo_name': repo_name,
                'repo_data': repo_data,
            })

            num_inserted += int(inserted)
            num_scraped += 1

        print '    scraped %s repos' % num_scraped

    print "\nScraped %s vim-scripts GitHub repos; inserted %s new ones." % (
            num_scraped, num_inserted)
Пример #2
0
def scrape_vim_scripts_repos(num):
    """Scrape at least num repos from the vim-scripts GitHub user."""
    _, user_data = get_api_page('users/vim-scripts')

    # Calculate how many pages of repositories there are.
    num_repos = user_data['public_repos']
    num_pages = (num_repos + 99) / 100  # ceil(num_repos / 100.0)

    num_inserted = 0
    num_scraped = 0

    for page in range(1, num_pages + 1):
        if num_scraped >= num:
            break

        _, repos_data = get_api_page('users/vim-scripts/repos', page=page)

        for repo_data in repos_data:

            # Scrape plugin-relevant data. We don't need much info from
            # vim-scripts because it's a mirror of vim.org.

            # vimorg_id is required for associating with the corresponding
            # vim.org-scraped plugin.
            vimorg_id = _get_vimorg_id_from_url(repo_data['homepage'])
            assert vimorg_id

            repo_name = repo_data['name']

            repo = PluginGithubRepos.get_with_owner_repo('vim-scripts',
                    repo_name)
            num_bundles = repo['plugin_manager_users'] if repo else 0

            db.plugins.add_scraped_data({
                'vimorg_id': vimorg_id,
                'github_vim_scripts_repo_name': repo_name,
                'github_vim_scripts_stars': repo_data['watchers'],
                'github_vim_scripts_bundles': num_bundles,
            })

            # Also add to our index of known GitHub plugins.
            inserted = PluginGithubRepos.upsert_with_owner_repo({
                'owner': 'vim-scripts',
                'repo_name': repo_name,
                'repo_data': repo_data,
            })

            num_inserted += int(inserted)
            num_scraped += 1

        print '    scraped %s repos' % num_scraped

    print "\nScraped %s vim-scripts GitHub repos; inserted %s new ones." % (
            num_scraped, num_inserted)
Пример #3
0
def get_repos_from_vimorg_descriptions():
    """Extract URLs of GitHub repos from the long descriptions on vim.org."""
    print "Discovering GitHub repos from vim.org long descriptions ..."

    # A map of repo URL to the vimorg_ids they were found in.
    repo_urls_dict = collections.defaultdict(set)

    all_plugins = r.table('plugins').run(r_conn())
    for plugin in all_plugins:
        for field in ['vimorg_long_desc', 'vimorg_install_details']:
            if field in plugin and plugin[field]:
                repo_urls = set(_extract_github_repo_urls(plugin[field]))
                vimorg_id = plugin['vimorg_id']
                # TODO(captbaritone) Re-enable this assertion once there is
                # only one plugin per `vimorg_id`
                # assert vimorg_id
                if not vimorg_id:
                    continue
                for repo_url in repo_urls:
                    repo_urls_dict[repo_url].add(vimorg_id)

    num_inserted = 0
    for repo_url, vimorg_ids in repo_urls_dict.iteritems():
        _, owner, repo_name = repo_url.split('/')
        inserted = PluginGithubRepos.upsert_with_owner_repo({
            'owner': owner,
            'repo_name': repo_name,
            'from_vim_scripts': list(vimorg_ids),
        })
        num_inserted += int(inserted)

    print "Found %s GitHub repos; inserted %s new ones." % (
            len(repo_urls_dict), num_inserted)
Пример #4
0
def get_repos_from_vimorg_descriptions():
    """Extract URLs of GitHub repos from the long descriptions on vim.org."""
    print "Discovering GitHub repos from vim.org long descriptions ..."

    # A map of repo URL to the vimorg_ids they were found in.
    repo_urls_dict = collections.defaultdict(set)

    all_plugins = r.table('plugins').run(r_conn())
    for plugin in all_plugins:
        for field in ['vimorg_long_desc', 'vimorg_install_details']:
            if field in plugin and plugin[field]:
                repo_urls = set(_extract_github_repo_urls(plugin[field]))
                vimorg_id = plugin['vimorg_id']
                assert vimorg_id
                for repo_url in repo_urls:
                    repo_urls_dict[repo_url].add(vimorg_id)

    num_inserted = 0
    for repo_url, vimorg_ids in repo_urls_dict.iteritems():
        _, owner, repo_name = repo_url.split('/')
        inserted = PluginGithubRepos.upsert_with_owner_repo({
            'owner':
            owner,
            'repo_name':
            repo_name,
            'from_vim_scripts':
            list(vimorg_ids),
        })
        num_inserted += int(inserted)

    print "Found %s GitHub repos; inserted %s new ones." % (
        len(repo_urls_dict), num_inserted)
Пример #5
0
def aggregate_repos_from_dotfiles():
    """Aggregate plugin references scraped from dotfiles repos on GitHub.

    Adds newly-discovered GitHub repos of plugins and also updates each GitHub
    plugin repo with the number of plugin manager users. Prints out some stats
    at the end.
    """
    # Counter of how many users for each of Pathogen/Vundle/NeoBundle.
    users_counter = collections.Counter()

    # Counter of how many times a repo occurs.
    repos_counter = collections.Counter()

    # Counter of total bundles for each of Pathogen/Vundle/NeoBundle.
    manager_counter = collections.Counter()

    # Map of plugin manager name to column name in dotfiles_github_repos table.
    managers = {
        'vundle': 'vundle_repos',
        'pathogen': 'pathogen_repos',
        'neobundle': 'neobundle_repos',
        'plug': 'vimplug_repos',
    }

    query = r.table('dotfiles_github_repos').pluck(managers.values())
    all_dotfiles = query.run(r_conn())

    for dotfiles_repo in all_dotfiles:
        for manager, field in managers.iteritems():
            plugin_repos = dotfiles_repo[field]
            users_counter[manager] += 1 if plugin_repos else 0
            manager_counter[manager] += len(plugin_repos)

            for owner_repo in plugin_repos:
                # Normalize the GitHub URL fragment
                owner_repo = owner_repo.lower()
                repos_counter[owner_repo] += 1

    num_inserted = 0

    for owner_repo, num_users in repos_counter.iteritems():
        owner, repo_name = owner_repo.split('/')
        repo = {
            'owner': owner,
            'repo_name': repo_name,
            'plugin_manager_users': num_users,
        }

        newly_inserted = PluginGithubRepos.upsert_with_owner_repo(repo)
        num_inserted += int(newly_inserted)

    most_used_repos = '\n'.join(map(str, repos_counter.most_common(10)))
    print 'Most used plugins:', most_used_repos
    print 'Users per manager:', users_counter
    print 'Plugins per manager:', manager_counter
    print 'Dotfile repos scraped:', query.count().run(r_conn())
    print 'New plugin repos inserted:', num_inserted
    print 'Unique plugin bundles found:', len(repos_counter)
    print 'Total plugin bundles found:', sum(manager_counter.values())
Пример #6
0
def aggregate_repos_from_dotfiles():
    """Aggregate plugin references scraped from dotfiles repos on GitHub.

    Adds newly-discovered GitHub repos of plugins and also updates each GitHub
    plugin repo with the number of plugin manager users. Prints out some stats
    at the end.
    """
    # Counter of how many users for each of Pathogen/Vundle/NeoBundle.
    users_counter = collections.Counter()

    # Counter of how many times a repo occurs.
    repos_counter = collections.Counter()

    # Counter of total bundles for each of Pathogen/Vundle/NeoBundle.
    manager_counter = collections.Counter()

    # Map of plugin manager name to column name in dotfiles_github_repos table.
    managers = {
        'vundle': 'vundle_repos',
        'pathogen': 'pathogen_repos',
        'neobundle': 'neobundle_repos',
        'plug': 'vimplug_repos',
    }

    query = r.table('dotfiles_github_repos').pluck(managers.values())
    all_dotfiles = query.run(r_conn())

    for dotfiles_repo in all_dotfiles:
        for manager, field in managers.iteritems():
            plugin_repos = dotfiles_repo[field]
            users_counter[manager] += 1 if plugin_repos else 0
            manager_counter[manager] += len(plugin_repos)

            for owner_repo in plugin_repos:
                # Normalize the GitHub URL fragment
                owner_repo = owner_repo.lower()
                repos_counter[owner_repo] += 1

    num_inserted = 0

    for owner_repo, num_users in repos_counter.iteritems():
        owner, repo_name = owner_repo.split('/')
        repo = {
            'owner': owner,
            'repo_name': repo_name,
            'plugin_manager_users': num_users,
        }

        newly_inserted = PluginGithubRepos.upsert_with_owner_repo(repo)
        num_inserted += int(newly_inserted)

    most_used_repos = '\n'.join(map(str, repos_counter.most_common(10)))
    print 'Most used plugins:', most_used_repos
    print 'Users per manager:', users_counter
    print 'Plugins per manager:', manager_counter
    print 'Dotfile repos scraped:', query.count().run(r_conn())
    print 'New plugin repos inserted:', num_inserted
    print 'Unique plugin bundles found:', len(repos_counter)
    print 'Total plugin bundles found:', sum(manager_counter.values())
Пример #7
0
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    Candidates come from the ``plugin_github_repos`` table, restricted to
    rows that are not blacklisted, are not little-used forks, have an empty
    ``redirects_to`` and are not owned by ``vim-scripts``. They are ordered
    by ``last_scraped_at`` and capped at ``num``.

    For each candidate the GitHub API is queried. On a 404, a HEAD request
    against github.com checks for a rename (HTTP 301); if found, the new
    location is upserted and the matching ``plugins`` row is repointed.
    Otherwise plugin data is gathered with ``get_plugin_data`` and stored via
    ``db.plugins.add_scraped_data``. Progress is printed to stdout.

    Args:
        num: Maximum number of repos to scrape in this run.
    """
    # Forks with fewer plugin-manager users than this are not scraped.
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})

    # We don't want to scrape forks that not many people use.
    # NOTE(review): default=True presumably keeps rows where either field is
    # missing -- confirm against the RethinkDB filter documentation.
    query = query.filter(
        r.not_((r.row['is_fork'] == True) & (  # NOQA
            r.row['plugin_manager_users'] < MIN_FORK_USERS)),
        default=True)

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')

    query = query.order_by('last_scraped_at').limit(num)

    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not found,
    #     # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print "    scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:

            res = requests.head('https://github.com/%s/%s' %
                                (repo_owner, repo_name))

            if res.status_code == 301:
                location = res.headers.get('location')
                # The last two '/'-separated parts of the redirect target
                # are taken as the new owner and repo name.
                _, redirect_owner, redirect_repo_name = location.rsplit('/', 2)

                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                                                  redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner':
                    redirect_owner,
                    'repo_name':
                    redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                # NOTE: this rebinds the outer `query` name; the `repos`
                # cursor being iterated was already obtained above.
                query = r.table('plugins').get_all([repo_owner, repo_name],
                                                   index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                                               redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                #     this is not found
                print 'not found.'

            # Nothing to store in the plugins table for a repo that 404ed.
            plugin_data = None

        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        # The statements below run for both the 404 and the success path, so
        # whatever get_api_page returned is recorded on the repo either way.
        repo['repo_data'] = repo_data
        # Falls back to the stored repo_id when the payload has no 'id'. The
        # fallback is evaluated eagerly, so repo must contain 'repo_id'.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner':
                repo_data['parent']['owner']['login'],
                'repo_name':
                repo_data['parent']['name'],
            })

        # Persist the updated repo record (including any redirect or fork
        # information set above).
        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (repo.get('plugin_manager_users', 0) <
                                      MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:

            # Insert the number of plugin manager users across all names/owners
            # of this repo.
            # TODO(david): Try to also use repo_id for this (but not all repos
            #     have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            # Repos whose redirects_to points at this owner/name.
            other_repos = r.table('plugin_github_repos').get_all(
                '%s/%s' % (repo_owner, repo_name),
                index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                # Don't count the current repo twice.
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get('plugin_manager_users',
                                                       0)

            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data,
                                        repo,
                                        submission=repo.get('from_submission'))

            print 'done.'
Пример #8
0
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped.

    Candidates come from the ``plugin_github_repos`` table, restricted to
    rows that are not blacklisted, are not little-used forks, have an empty
    ``redirects_to`` and are not owned by ``vim-scripts``. They are ordered
    by ``last_scraped_at`` and capped at ``num``.

    For each candidate the GitHub API is queried. On a 404, a HEAD request
    against github.com checks for a rename (HTTP 301); if found, the new
    location is upserted and the matching ``plugins`` row is repointed.
    Otherwise plugin data is gathered with ``get_plugin_data`` and stored via
    ``db.plugins.add_scraped_data``. The scraped repo record itself is
    written back to ``plugin_github_repos`` with a replacing insert.
    Progress is printed to stdout.

    Args:
        num: Maximum number of repos to scrape in this run.
    """
    # Forks with fewer plugin-manager users than this are not scraped.
    MIN_FORK_USERS = 3

    query = r.table('plugin_github_repos').filter({'is_blacklisted': False})

    # We don't want to scrape forks that not many people use.
    # NOTE(review): default=True presumably keeps rows where either field is
    # missing -- confirm against the RethinkDB filter documentation.
    query = query.filter(r.not_((r.row['is_fork'] == True) & (
            r.row['plugin_manager_users'] < MIN_FORK_USERS)),
            default=True)

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row['redirects_to'] == '')

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row['owner'] != 'vim-scripts')

    query = query.order_by('last_scraped_at').limit(num)

    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not found,
    #     # redirects, etc.
    for repo in repos:
        repo_name = repo['repo_name']
        repo_owner = repo['owner']

        # Print w/o newline.
        print "    scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page('repos/%s/%s' % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:

            res = requests.head('https://github.com/%s/%s' % (
                    repo_owner, repo_name))

            if res.status_code == 301:
                location = res.headers.get('location')
                # The last two '/'-separated parts of the redirect target
                # are taken as the new owner and repo name.
                _, redirect_owner, redirect_repo_name = location.rsplit('/', 2)

                repo['redirects_to'] = '%s/%s' % (redirect_owner,
                        redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo({
                    'owner': redirect_owner,
                    'repo_name': redirect_repo_name,
                    # TODO(david): Should append to a list
                    'redirects_from': ('%s/%s' % (repo_owner, repo_name)),
                })

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                # NOTE: this rebinds the outer `query` name; the `repos`
                # cursor being iterated was already obtained above.
                query = r.table('plugins').get_all(
                        [repo_owner, repo_name], index='github_owner_repo')
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin['github_owner'] = redirect_owner
                    db_plugin['github_repo_name'] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict='replace')

                print 'redirects to %s/%s.' % (redirect_owner,
                        redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                #     this is not found
                print 'not found.'

            # Nothing to store in the plugins table for a repo that 404ed.
            plugin_data = None

        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        # The statements below run for both the 404 and the success path, so
        # whatever get_api_page returned is recorded on the repo either way.
        repo['repo_data'] = repo_data
        # Falls back to the stored repo_id when the payload has no 'id'. The
        # fallback is evaluated eagerly, so repo must contain 'repo_id'.
        repo['repo_id'] = str(repo_data.get('id', repo['repo_id']))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get('fork'):
            repo['is_fork'] = True
            PluginGithubRepos.upsert_with_owner_repo({
                'owner': repo_data['parent']['owner']['login'],
                'repo_name': repo_data['parent']['name'],
            })

        # Write the whole updated record back, replacing the stored row on a
        # primary-key conflict.
        r.table('plugin_github_repos').insert(repo,
                conflict='replace').run(r_conn())

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get('fork') and (
                repo.get('plugin_manager_users', 0) < MIN_FORK_USERS):
            print 'skipping fork of %s' % repo_data['parent']['full_name']
            continue

        if plugin_data:

            # Insert the number of plugin manager users across all names/owners
            # of this repo.
            # TODO(david): Try to also use repo_id for this (but not all repos
            #     have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get('plugin_manager_users', 0)
            # Repos whose redirects_to points at this owner/name.
            other_repos = r.table('plugin_github_repos').get_all(
                    '%s/%s' % (repo_owner, repo_name),
                    index='redirects_to').run(r_conn())
            for other_repo in other_repos:
                # Don't count the current repo twice.
                if other_repo['id'] == repo['id']:
                    continue
                plugin_manager_users += other_repo.get(
                        'plugin_manager_users', 0)

            plugin_data['github_bundles'] = plugin_manager_users

            if repo.get('from_submission'):
                _add_submission_data(plugin_data, repo['from_submission'])

            db.plugins.add_scraped_data(plugin_data, repo,
                    submission=repo.get('from_submission'))

            print 'done.'
Пример #9
0
def scrape_plugin_repos(num):
    """Scrapes the num plugin repos that have been least recently scraped."""
    MIN_FORK_USERS = 3

    query = r.table("plugin_github_repos").filter({"is_blacklisted": False})

    # We don't want to scrape forks that not many people use.
    query = query.filter(
        r.not_((r.row["is_fork"] == True) & (r.row["plugin_manager_users"] < MIN_FORK_USERS)), default=True  # NOQA
    )

    # Only scrape repos that don't redirect to other ones (probably renamed).
    query = query.filter(r.row["redirects_to"] == "")

    # We scrape vim-scripts separately using the batch /users/:user/repos call
    query = query.filter(r.row["owner"] != "vim-scripts")

    query = query.order_by("last_scraped_at").limit(num)

    repos = query.run(r_conn())

    # TODO(david): Print stats at the end: # successfully scraped, # not found,
    #     # redirects, etc.
    for repo in repos:
        repo_name = repo["repo_name"]
        repo_owner = repo["owner"]

        # Print w/o newline.
        print "    scraping %s/%s ..." % (repo_owner, repo_name),
        sys.stdout.flush()

        # Attempt to fetch data about the plugin.
        res, repo_data = get_api_page("repos/%s/%s" % (repo_owner, repo_name))

        # If the API call 404s, then see if the repo has been renamed by
        # checking for a redirect in a non-API call.
        if res.status_code == 404:

            res = requests.head("https://github.com/%s/%s" % (repo_owner, repo_name))

            if res.status_code == 301:
                location = res.headers.get("location")

                valid_repo_url = re.compile("^https://github.com/[^/]+/[^/]+")
                if not valid_repo_url.match(location):
                    print "redirects to invalid GitHub repo URL: %s" % location
                    continue

                _, redirect_owner, redirect_repo_name = location.rsplit("/", 2)

                repo["redirects_to"] = "%s/%s" % (redirect_owner, redirect_repo_name)

                # Make sure we insert the new location of the repo, which will
                # be scraped in a future run.
                PluginGithubRepos.upsert_with_owner_repo(
                    {
                        "owner": redirect_owner,
                        "repo_name": redirect_repo_name,
                        # TODO(david): Should append to a list
                        "redirects_from": ("%s/%s" % (repo_owner, repo_name)),
                    }
                )

                # And now change the GitHub repo location of the plugin that
                # the old repo location pointed to
                query = r.table("plugins").get_all([repo_owner, repo_name], index="github_owner_repo")
                db_plugin = db.util.get_first(query)
                if db_plugin:
                    db_plugin["github_owner"] = redirect_owner
                    db_plugin["github_repo_name"] = redirect_repo_name
                    db.plugins.insert(db_plugin, conflict="replace")

                print "redirects to %s/%s." % (redirect_owner, redirect_repo_name)
            else:
                # TODO(david): Insert some metadata in the github repo that
                #     this is not found
                print "not found."

            plugin_data = None

        else:
            plugin_data = get_plugin_data(repo_owner, repo_name, repo_data)

        repo["repo_data"] = repo_data
        repo["repo_id"] = str(repo_data.get("id", repo["repo_id"]))
        PluginGithubRepos.log_scrape(repo)

        # If this is a fork, note it and ensure we know about original repo.
        if repo_data.get("fork"):
            repo["is_fork"] = True
            PluginGithubRepos.upsert_with_owner_repo(
                {"owner": repo_data["parent"]["owner"]["login"], "repo_name": repo_data["parent"]["name"]}
            )

        PluginGithubRepos.upsert_with_owner_repo(repo)

        # For most cases we don't care about forked repos, unless the forked
        # repo is used by others.
        if repo_data.get("fork") and (repo.get("plugin_manager_users", 0) < MIN_FORK_USERS):
            print "skipping fork of %s" % repo_data["parent"]["full_name"]
            continue

        if plugin_data:

            # Insert the number of plugin manager users across all names/owners
            # of this repo.
            # TODO(david): Try to also use repo_id for this (but not all repos
            #     have it), or look at multiple levels of redirects.
            plugin_manager_users = repo.get("plugin_manager_users", 0)
            other_repos = (
                r.table("plugin_github_repos")
                .get_all("%s/%s" % (repo_owner, repo_name), index="redirects_to")
                .run(r_conn())
            )
            for other_repo in other_repos:
                if other_repo["id"] == repo["id"]:
                    continue
                plugin_manager_users += other_repo.get("plugin_manager_users", 0)

            plugin_data["github_bundles"] = plugin_manager_users

            if repo.get("from_submission"):
                _add_submission_data(plugin_data, repo["from_submission"])

            db.plugins.add_scraped_data(plugin_data, repo, submission=repo.get("from_submission"))

            print "done."