Example #1
def test_import_inputset():
    # test github
    ds = Dataset.import_inputset('files/git_repo.json',
                                 registry='github',
                                 cache_dir=CACHE_DIR,
                                 debug=True,
                                 github_pat=os.getenv('GITHUB_PAT'))
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    ds = Dataset.import_inputset('files/git_repo_commit.json',
                                 registry='github',
                                 cache_dir=CACHE_DIR,
                                 debug=True,
                                 github_pat=os.getenv('GITHUB_PAT'))
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test npm
    ds = Dataset.import_inputset('files/name_version.json',
                                 registry='npm',
                                 cache_dir=CACHE_DIR,
                                 debug=True)
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test pypi
    ds = Dataset.import_inputset('files/name_version.json',
                                 registry='pypi',
                                 cache_dir=CACHE_DIR,
                                 debug=True)
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test vanilla
    ds = Dataset.import_inputset('files/http_url.json',
                                 cache_dir=CACHE_DIR,
                                 debug=True)
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # cleanup files
    os.remove('../test.p')
    os.remove('../test.json')
Example #2
def sample(ds: Dataset, n: int,
           on_versions: bool = True, seed: str = None) -> None:
    """Samples n projects in place."""

    # seed random, if a seed was provided
    if seed:
        random.seed(seed)

    # select a sample of versions in each project
    if on_versions:
        dropped = 0
        for project in ds.projects:
            dropped += len(project.versions)
            if len(project.versions) > n:
                project.versions = random.sample(project.versions, n)
            dropped -= len(project.versions)

        print('         Sampled {:,} versions from each of {:,} projects ({:,} '
              'total versions dropped).'.format(n, len(ds.projects), dropped))

    # select a sample of projects
    elif len(ds.projects) > n:
        orig_count = len(ds.projects)
        ds.projects = random.sample(ds.projects, n)
        print('         Sampled {:,} projects from {:,} (dropped {:,}).'
              .format(n, orig_count, max(orig_count - n, 0)))

    else:
        # the dataset already has n or fewer projects, so there is nothing to drop
        raise Exception('Dataset has {:,} projects (<= {:,}); cannot sample.'
                        .format(len(ds.projects), n))
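
For orientation, a minimal usage sketch of sample(); the import path, weblist name, and registry are assumptions drawn from the other examples rather than verified values.

from r2c_isg.structures import Dataset  # assumed import path

# Build a dataset as in the load_web examples below (names are illustrative);
# for version-level sampling, versions must already have been fetched.
ds = Dataset.load_web('top4kyear', registry='pypi', from_type='list')

sample(ds, 100, on_versions=False, seed='42')  # keep a random 100 projects
sample(ds, 5, seed='42')                       # keep up to 5 random versions per project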
Example #3
    def load(cls, filepath: str, **kwargs) -> Dataset:
        """Loads a json file."""

        # ensure the user specified which parser to use
        parser = kwargs.pop('parser', None)
        if not parser:
            raise Exception('Please provide the handle to a json parser. '
                            'Valid options are: %s' % list(cls.parsers()))

        # check if the parsing schema exists
        if parser not in cls.parsers():
            raise Exception('Unrecognized json parser name. Review the docs '
                            'to ensure any custom json parsers have been '
                            'properly registered.')

        # initialize a dataset
        ds = Dataset(**kwargs)

        # load the file
        with open(filepath) as file:
            data = json.load(file)

        # run the appropriate parser
        cls.parsers()[parser](ds, data)

        return ds
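
A hedged sketch of how this classmethod might be invoked; JsonFileLoader is a hypothetical stand-in for the enclosing loader class, and 'r2c' is an illustrative parser handle that would have to be registered in cls.parsers().

# Hypothetical names: JsonFileLoader and the 'r2c' handle are illustrative.
try:
    ds = JsonFileLoader.load('inputset.json', parser='r2c')
except Exception as e:
    print(e)  # raised if the handle is missing or not registered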
Example #4
    def load(cls, name: str, **kwargs) -> Dataset:
        # get the request type (weblist vs. organization)
        from_type = kwargs.pop('from_type')

        # initialize a registry
        ds = Dataset(**kwargs)

        if from_type == 'list':
            # select the correct weblist loader/parser
            weblists = cls.weblists()
            if name not in weblists:
                raise Exception('Unrecognized github weblist name. '
                                'Valid options are: %s' % list(weblists))

            # load the data
            data = weblists[name]['getter'](api=ds.api, **kwargs)

            # parse the data
            weblists[name]['parser'](ds, data)

        elif from_type in ['user', 'org']:
            # load the data
            data = GithubLoader._get_org_or_user_repos(ds.api, name, from_type,
                                                       **kwargs)

            # parse the data
            GithubLoader._parse_github(ds, data)

        return ds
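
A hedged usage sketch for the user/org path of this loader, going through Dataset.load_web as the surrounding examples do; the organization name is illustrative and the import path is an assumption.

import os
from r2c_isg.structures import Dataset  # assumed import path

# Illustrative org name; from_type='org' routes to _get_org_or_user_repos above.
ds = Dataset.load_web('returntocorp',
                      registry='github',
                      from_type='org',
                      github_pat=os.getenv('GITHUB_PAT'))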
Example #5
File: datasets.py Project: jayvdb/pypidb
    def load(cls, name: str, **kwargs) -> Dataset:
        # get the request type (weblist vs. organization)
        from_type = kwargs.pop("from_type")
        if from_type in ["user", "org"]:
            raise Exception(
                "opensuse OBS does not support loading project lists from user/org names."
            )

        # initialize a registry
        ds = Dataset(**kwargs)

        # select the correct weblist loader/parser
        weblists = cls.weblists()
        if name not in weblists:
            raise Exception(
                "Unrecognized opensuse OBS weblist name. Valid "
                "options are: %s" % list(weblists)
            )

        # load the data
        data = weblists[name]["getter"](api=ds.api, project=name, **kwargs)

        # parse the data
        weblists[name]["parser"](ds, data)

        return ds
Example #6
    def _parse_project(ds: Dataset, data: list) -> None:
        from r2c_isg.structures.projects import PypiProject

        # map data keys to project keywords
        uuids = {"name": lambda p: p.project}

        # create the projects
        ds.projects = [
            PypiProject(uuids_=uuids, **d) for d in tqdm(
                data, desc="         Loading", unit="project", leave=False)
        ]
Example #7
def get_fedora_packages():
    ds = Dataset.load_web(
        name="fedora",
        from_type="list",
        registry="portingdb",
        cache_dir=R2C_WEB_CACHE,
    )

    names = set([project.get_name() for project in ds.projects])
    return names
Example #8
def get_top_packages(kind="top4kmonth"):
    ds = Dataset.load_web(
        name=kind,
        from_type="list",
        registry="pypi",
        cache_dir=R2C_WEB_CACHE,
    )

    for project in ds.projects:
        yield project.get_name()
Example #9
def get_opensuse_packages(project):
    ds = Dataset.load_web(
        name=project,
        from_type="list",
        registry="opensuse",
        cache_dir=R2C_WEB_CACHE,
    )

    # Avoid dups like python2-cmd2 and python-cmd2
    names = {p.get_name() for p in ds.projects}
    return names
Example #10
    def _parse_hugovk(ds: Dataset, data: list) -> None:
        from r2c_isg.structures.projects import PypiProject

        # map data keys to project keywords
        uuids = {'name': lambda p: p.project}

        # create the projects
        ds.projects = [
            PypiProject(uuids_=uuids, **d) for d in tqdm(
                data, desc='         Loading', unit='project', leave=False)
        ]
Example #11
    def _parse_github(ds: Dataset, data: list) -> None:
        from r2c_isg.structures.projects import GithubRepo

        # map data keys to project keywords
        uuids = {'name': lambda p: p.name, 'url': lambda p: p.html_url}
        meta = {
            'org': lambda p: p.url.split('/')[-2],
        }

        # create the projects
        ds.projects = [
            GithubRepo(uuids_=uuids, meta_=meta, **d) for d in tqdm(
                data, desc='         Loading', unit='project', leave=False)
        ]
Example #12
    def _parse_niceregistry(ds: Dataset, data: list):
        from r2c_isg.structures.projects import NpmPackage

        # map data keys to package keywords
        uuids = {'name': lambda p: p.name}

        # create the projects
        # Note: data list is ordered from most dependents to fewest
        ds.projects = [
            NpmPackage(uuids_=uuids, name=name, dependents_rank=rank)
            for rank, name in enumerate(data, start=1)
        ]
Example #13
def load(ctx, registry, from_type, name_or_path, fileargs):
    """Generates a dataset from a weblist name or file path."""
    backup_ds = None

    try:
        backup_ds = deepcopy(ctx.obj.get('dataset', None))

        if registry == 'noreg':
            registry = None

        global TEMP_SETTINGS

        if from_type == 'file':
            # read in a file (fileargs is either a header string for csv
            # or a parser handle for json)
            ds = Dataset.load_file(name_or_path,
                                   registry,
                                   fileargs=fileargs,
                                   **TEMP_SETTINGS)

        else:
            # download a weblist or organization repo list
            ds = Dataset.load_web(name_or_path,
                                  registry,
                                  from_type=from_type,
                                  **TEMP_SETTINGS)

        ctx.obj['dataset'] = ds

        # reset the temporary api/metadata dict
        TEMP_SETTINGS = dict()

    except Exception as e:
        print_error(e, DEBUG)

        # silently restore the dataset
        ctx.obj['dataset'] = backup_ds
Example #14
def restore(ctx, filepath):
    """Restores a pickled dataset file."""
    backup_ds = None

    try:
        backup_ds = deepcopy(ctx.obj.get('dataset', None))

        ds = Dataset.restore(filepath)
        ctx.obj['dataset'] = ds

        # reset the temporary api/metadata dict
        global TEMP_SETTINGS
        TEMP_SETTINGS = dict()

    except Exception as e:
        print_error(e, DEBUG)

        # silently restore the dataset
        ctx.obj['dataset'] = backup_ds
Example #15
def trim(ds: Dataset, n: int, on_versions: bool = False) -> None:
    """Keep only the first n projects inplace."""

    # select a sample of versions in each project
    if on_versions:
        dropped = 0
        for project in ds.projects:
            dropped += len(project.versions)
            project.versions = project.versions[:n]
            dropped -= len(project.versions)

        print('         Trimmed to first {:,} versions in each project '
              '({:,} total versions dropped).'.format(n, dropped))

    # select a sample of projects
    else:
        orig_count = len(ds.projects)
        ds.projects = ds.projects[:n]
        print('         Trimmed to first {:,} projects ({:,} dropped).'.format(
            n, max(orig_count - n, 0)))
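
A short usage sketch of trim(), parallel to the sample() sketch above; the dataset construction and import path are illustrative assumptions.

from r2c_isg.structures import Dataset  # assumed import path

# Illustrative dataset; any populated Dataset works the same way.
ds = Dataset.load_web('top4kyear', registry='pypi', from_type='list')

trim(ds, 10)                   # keep only the first 10 projects
trim(ds, 1, on_versions=True)  # keep only the first version of each project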
Example #16
def test_load_weblist():
    # test github
    ds = Dataset.load_web('top1kstarred',
                          registry='github',
                          from_type='list',
                          cache_dir=CACHE_DIR,
                          debug=True,
                          github_pat=os.getenv('GITHUB_PAT'))
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test npm
    ds = Dataset.load_web('allbydependents',
                          registry='npm',
                          from_type='list',
                          cache_dir=CACHE_DIR,
                          debug=True)
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test pypi
    ds = Dataset.load_web('top4kyear',
                          registry='pypi',
                          from_type='list',
                          cache_dir=CACHE_DIR,
                          debug=True)
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # cleanup files
    os.remove('../test.p')
    os.remove('../test.json')
Example #17
def import_(ctx, registry, filepath):
    """Imports an input set json file."""
    backup_ds = None

    try:
        backup_ds = deepcopy(ctx.obj.get('dataset', None))

        if registry == 'noreg':
            registry = None

        global TEMP_SETTINGS

        ds = Dataset.import_inputset(filepath, registry, **TEMP_SETTINGS)
        ctx.obj['dataset'] = ds

        # reset the temporary api/metadata dict
        TEMP_SETTINGS = dict()

    except Exception as e:
        print_error(e, DEBUG)

        # silently restore the dataset
        ctx.obj['dataset'] = backup_ds
Example #18
    def load(cls, name: str, **kwargs) -> Dataset:
        # get the request type (weblist vs. organization)
        from_type = kwargs.pop('from_type')
        if from_type in ['user', 'org']:
            raise Exception(
                'Pypi does not support loading project lists from user/org names.'
            )

        # initialize a registry
        ds = Dataset(**kwargs)

        # select the correct weblist loader/parser
        weblists = cls.weblists()
        if name not in weblists:
            raise Exception('Unrecognized pypi weblist name. Valid '
                            'options are: %s' % list(weblists))

        # load the data
        data = weblists[name]['getter'](api=ds.api, **kwargs)

        # parse the data
        weblists[name]['parser'](ds, data)

        return ds
Example #19
    def load(cls, filepath: str, **kwargs) -> Dataset:
        """Loads an r2c input set json file."""

        # initialize the dataset
        ds = Dataset(**kwargs)

        # load the file
        with open(filepath) as file:
            data = json.load(file)

        # remove any existing projects
        ds.projects = []

        # don't overwrite previously set metadata
        ds.name = ds.name or data['name']
        ds.version = ds.version or data['version']

        # grab any optional metadata
        ds.description = ds.description or data.get('description', None)
        ds.readme = ds.readme or data.get('readme', None)
        ds.author = ds.author or data.get('author', None)
        ds.email = ds.email or data.get('email', None)

        # generate the projects and versions
        for input_ in tqdm(data['inputs'], desc='         Importing',
                           unit=' inputs', leave=False):
            # split out project- vs. version-level information
            p_data, v_data = {}, {}
            p_keys = ['repo_url', 'url', 'package_name']
            v_keys = ['commit_hash', 'version']
            for k, val in input_.items():
                # add the attribute to the project or version
                if k in v_keys:
                    v_data[k] = val
                elif k in p_keys:
                    p_data[k] = val

            # get or create the new project
            project = ds.find_project(**p_data)
            if project:
                # update the existing project
                project.update(**p_data)

            else:
                # map json fields to project keywords, as applicable
                uuids = {}
                if 'package_name' in p_data:
                    uuids['name'] = lambda p: p.package_name
                if 'repo_url' in p_data:
                    uuids['url'] = lambda p: p.repo_url
                if 'url' in p_data:
                    uuids['url'] = lambda p: p.url

                # create the new project & add it to the dataset
                p_class = project_map.get(ds.registry, DefaultProject)
                project = p_class(uuids_=uuids, **p_data)
                ds.projects.append(project)

            # create the new version, if it doesn't already exist
            if v_data:
                version = project.find_version(**v_data)
                if version:
                    # update the existing version
                    version.update(**v_data)

                else:
                    # map json fields to version keywords, as applicable
                    uuids = {}
                    if 'version' in v_data:
                        uuids['version'] = lambda v: v.version
                    if 'commit_hash' in v_data:
                        uuids['commit'] = lambda v: v.commit_hash

                    # create the new version & add it to the project
                    v_class = version_map.get(ds.registry, DefaultVersion)
                    project.versions.append(v_class(uuids_=uuids, **v_data))

        return ds
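
Inferred from the keys this loader reads (top-level name and version, optional metadata, and an 'inputs' list whose entries are split across p_keys and v_keys), an input set file of roughly the following shape should parse; all names and versions are illustrative.

import json

# Hedged sketch of the expected file layout; entries here use the
# package_name/version key pair, but repo_url/commit_hash and plain url
# entries are handled by the same loop above.
inputset = {
    'name': 'example-set',
    'version': '1.0',
    'description': 'optional free-text description',
    'inputs': [
        {'package_name': 'requests', 'version': '2.25.1'},
        {'package_name': 'flask', 'version': '1.1.2'},
    ],
}
with open('example_inputset.json', 'w') as f:
    json.dump(inputset, f, indent=2)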
Example #20
    def load(cls, filepath: str, **kwargs) -> Dataset:
        """Loads a csv file."""

        # user-defined headers override default headers
        headers = kwargs.pop('fileargs', None)
        if headers:
            user_defined = True
            headers = headers.split()
        else:
            user_defined = False
            # default headers are name and version string
            headers = ['name', 'v.version']

        # initialize a dataset
        ds = Dataset(**kwargs)

        # load the file
        with open(filepath, mode='r', encoding='utf-8-sig') as file:
            csv_file = csv.reader(file, delimiter=',')
            for row in csv_file:
                if row[0].startswith('!'):
                    # read in a header row
                    if not user_defined:
                        # in-file headers override defaults
                        # (but not user-defined headers from the cli)
                        headers = [h[1:] for h in row]
                else:
                    # ensure we have as many headers as cells in the row
                    if len(row) > len(headers):
                        raise Exception('A column is missing a header. Review '
                                        "the input file's column headers.")

                    # read in a data row
                    p_data, v_data = {}, {}
                    for i, val in enumerate(row):
                        attr = headers[i]

                        # add the data to the project or version
                        if attr.startswith('v.'):
                            v_data[attr[2:]] = val
                        else:
                            p_data[attr] = val

                    # get or create the new project
                    project = ds.find_project(**p_data)
                    if project:
                        # update the existing project
                        project.update(**p_data)

                    else:
                        # map csv headers to project keywords, as applicable
                        uuids, meta = {}, {}
                        if 'name' in p_data:
                            uuids['name'] = lambda p: p.name
                        if 'org' in p_data:
                            meta['org'] = lambda p: p.org
                        if 'url' in p_data:
                            uuids['url'] = lambda p: p.url

                        # create the new project & add it to the dataset
                        p_class = project_map.get(ds.registry, DefaultProject)
                        project = p_class(uuids_=uuids, meta_=meta, **p_data)
                        ds.projects.append(project)

                    # create the new version, if it doesn't already exist
                    if v_data:
                        version = project.find_version(**v_data)
                        if version:
                            # update the existing version
                            version.update(**v_data)

                        else:
                            # map csv headers to version keywords, as applicable
                            uuids = {}
                            if 'version' in v_data:
                                uuids['version'] = lambda v: v.version
                            if 'commit' in v_data:
                                uuids['commit'] = lambda v: v.commit

                            # create the new version & add it to the project
                            v_class = version_map.get(ds.registry,
                                                      DefaultVersion)
                            project.versions.append(
                                v_class(uuids_=uuids, **v_data))

        return ds
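
Given the conventions in this loader (an optional in-file header row whose cells are prefixed with '!', columns prefixed with 'v.' mapping to version attributes, and a default of name and v.version), a file along these lines should load; the package names and versions are illustrative.

# Hedged sketch of an acceptable csv input file; contents are illustrative.
csv_text = ('!name,!v.version\n'  # in-file header row, each cell prefixed with '!'
            'requests,2.25.1\n'
            'flask,1.1.2\n')
with open('example.csv', 'w', encoding='utf-8') as f:
    f.write(csv_text)

# Without any header row (and no user-defined headers from the cli), the
# columns default to ['name', 'v.version'], per the code above.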