Example #1
    def test_execute_npm(self, tmpdir, npm):
        """Test the MercatorTask for the NPM ecosystem."""
        name = 'wrappy'
        version = '1.0.2'
        required = {
            'homepage', 'version', 'declared_licenses', 'code_repository',
            'bug_reporting', 'description', 'name', 'author'
        }
        IndianaJones.fetch_artifact(npm,
                                    artifact=name,
                                    version=version,
                                    target_dir=str(tmpdir))

        args = {'ecosystem': npm.name, 'name': name, 'version': version}
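        # stub the EPVCache lookup so the task reads the extracted sources
        # straight from tmpdir (populated above by IndianaJones.fetch_artifact)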
        flexmock(EPVCache).should_receive(
            'get_extracted_source_tarball').and_return(str(tmpdir))
        results = self.m.execute(arguments=args)

        assert isinstance(results, dict) and results
        details = results['details'][0]
        assert set(details.keys()) >= required  # check at least the required keys are there
        assert all(details[key] for key in required)  # required values are not None
        assert details['name'] == name

    def test_execute(self, tmpdir):
        """Start the LinguistTask worker task and check its results."""
        # TODO: reduce cyclomatic complexity
        IndianaJones.fetch_artifact(
            ecosystem=ECOSYSTEM, artifact=MODULE_NAME,
            version=MODULE_VERSION, target_dir=str(tmpdir))

        args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
        flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
        task = LinguistTask.create_test_instance(task_name='languages')
        results = task.execute(args)

        assert results is not None
        assert isinstance(results, dict)
        assert set(results.keys()) == {'details', 'status', 'summary'}
        details = results['details']
        assert len(details) > 3  # tarball, setup.py, LICENSE, README, etc.
        for f in details:
            if f.get('path') and f['path'].endswith('six.py'):
                # {'output': {'language': 'Python',
                #             'lines': '869',
                #             'mime': 'application/x-python',
                #             'sloc': '869',
                #             'type': 'Text'},
                #  'path': 'six-1.10.0/six.py',
                #  'type': ['Python script, ASCII text executable']},
                assert set(f.keys()) == {'output', 'path', 'type'}
                assert set(f['output'].keys()) == {'language', 'lines', 'mime', 'sloc', 'type'}
                assert f['output']['language'] == 'Python'
                assert f['type'].pop().startswith('Python')
        assert results['status'] == 'success'
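
The module-level constants used above (ECOSYSTEM, MODULE_NAME, MODULE_VERSION) are outside this excerpt; judging by the sample output path 'six-1.10.0/six.py' in the comment, they presumably look something like this hypothetical sketch:

ECOSYSTEM = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
MODULE_NAME = 'six'
MODULE_VERSION = '1.10.0'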
 def test_fetch_pypi_nonexistent(self, tmpdir, pypi):
     """Test fetching of a non-existent pypi artifact."""
     tmpdir = Path(str(tmpdir))
     with pytest.raises(NotABugTaskError):
         IndianaJones.fetch_artifact(pypi,
                                     artifact='not-in-pypi',
                                     version='1294839',
                                     target_dir=str(tmpdir))
    def test_execute(self, tmpdir):
        artifact_digest, artifact_path = IndianaJones.fetch_artifact(
            Ecosystem(name='pypi', backend=EcosystemBackend.pypi),
            artifact=PYPI_MODULE_NAME,
            version=PYPI_MODULE_VERSION,
            target_dir=str(tmpdir))

        args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
        # flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
        flexmock(EPVCache).should_receive('get_source_tarball').and_return(
            artifact_path)
        task = DigesterTask.create_test_instance(task_name='digests')
        results = task.execute(arguments=args)

        assert results is not None
        assert isinstance(results, dict)
        assert set(results.keys()) == {'details', 'status', 'summary'}
        artifact_details = None
        for details in results['details']:
            assert {'sha256', 'sha1', 'md5', 'ssdeep',
                    'path'}.issubset(set(details.keys()))
            if details.get('artifact'):
                artifact_details = details
        # there are artifact details
        assert artifact_details is not None
        # the artifact digest which Indy returns is the same as the one from DigesterTask
        assert artifact_digest == artifact_details['sha256'] == compute_digest(
            artifact_path)
        assert artifact_details['path'] == 'six-1.0.0.tar.gz'
 def test_fetch_rubygems_specific(self, tmpdir, rubygems, name, version,
                                  expected_digest):
     digest, path = IndianaJones.fetch_artifact(rubygems,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     assert path.endswith("{}-{}.gem".format(name, version))
     assert osp.exists(path)
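
In this and the other test_fetch_*_specific snippets, (name, version, expected_digest) are presumably supplied by a pytest.mark.parametrize decorator that the excerpts omit; a minimal sketch with hypothetical values:

@pytest.mark.parametrize('name, version, expected_digest', [
    ('permutation', '0.1.7', '<known sha256 hex digest>'),
])
def test_fetch_rubygems_specific(self, tmpdir, rubygems, name, version,
                                 expected_digest):
    ...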
 def test_fetch_go_specific(self, tmpdir, go, name, version,
                            expected_digest):
     digest, path = IndianaJones.fetch_artifact(go,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     assert path.endswith('{}.tar.gz'.format(version))
     assert osp.exists(path)
 def test_fetch_nuget_specific(self, tmpdir, nuget, name, version,
                               expected_digest):
     digest, path = IndianaJones.fetch_artifact(nuget,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     assert path.endswith('{}.{}.nupkg'.format(name.lower(), version))
     assert osp.exists(path)
Example #8
 def test_fetch_version_range_npm_specific(self, tmpdir, npm, name, version,
                                           expected_digest):
     """Test fetching of npm artifact with version range."""
     with pytest.raises(NotABugTaskError) as excinfo:
         cache_path = Path(
             subprocess.check_output(["npm", "config", "get", "cache"],
                                     universal_newlines=True).strip())
         package_digest, path = IndianaJones.fetch_artifact(
             npm, artifact=name, version=version, target_dir=str(tmpdir))
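     # a version range such as '>1.0.0' (hypothetical parametrized value) is
     # expected to surface as NotABugTaskError rather than being resolved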
 def test_fetch_maven_specific(self, tmpdir, maven, name, version,
                               expected_digest):
     digest, path = IndianaJones.fetch_artifact(maven,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     _, artifactId = name.split(':', 1)
     assert digest == expected_digest
     assert path.endswith('{}-{}.jar'.format(artifactId, version))
     assert osp.exists(path)
 def test_fetch_rubygems_specific(self, tmpdir, rubygems, name, version,
                                  expected_digest):
     """Test fetching of rubygems artifact."""
     digest, path = IndianaJones.fetch_artifact(rubygems,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     path = Path(path)
     assert path.name == "{}-{}.gem".format(name, version)
     assert path.exists()
 def test_fetch_go_specific(self, tmpdir, go, name, version,
                            expected_digest):
     """Test fetching of go artifact."""
     digest, path = IndianaJones.fetch_artifact(go,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     path = Path(path)
     assert path.name == '{}.tar.gz'.format(version)
     assert path.exists()
 def test_fetch_nuget_specific(self, tmpdir, nuget, name, version,
                               expected_digest):
     """Test fetching of nuget artifact."""
     digest, path = IndianaJones.fetch_artifact(nuget,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     path = Path(path)
     assert path.name == '{}.{}.nupkg'.format(name.lower(), version)
     assert path.exists()
 def test_fetch_pypi_specific(self, tmpdir, pypi, name, version,
                              expected_digest):
     digest, path = IndianaJones.fetch_artifact(pypi,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     assert len(os.listdir(str(tmpdir))) > 1
     glob_whl_path = glob.glob(
         osp.join(str(tmpdir), "{}-{}*".format(name, version))).pop()
     assert osp.exists(glob_whl_path)
    def _download_pom_xml(target, ecosystem, arguments):
        artifact_coords = MavenCoordinates.from_str(arguments['name'])
        artifact_coords.packaging = 'pom'
        artifact_coords.classifier = ''  # pom.xml files have no classifiers

        IndianaJones.fetch_artifact(
            ecosystem=ecosystem,
            artifact=artifact_coords.to_str(omit_version=True),
            version=arguments['version'],
            target_dir=target)

        # the pom has to be named precisely pom.xml, otherwise mercator's Java
        # handler, which runs maven as a subprocess, won't see it
        pom_xml_path = os.path.join(target, 'pom.xml')
        os.rename(
            os.path.join(
                target, '{}-{}.pom'.format(artifact_coords.artifactId,
                                           arguments['version'])),
            pom_xml_path)
        return pom_xml_path
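
For context, the InitAnalysisFlow.execute() snippets later in this listing invoke this helper as:

    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)

where arguments carries the Maven 'name' (a 'groupId:artifactId' string parsed by MavenCoordinates.from_str) and 'version'.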
 def test_fetch_pypi_specific(self, tmpdir, pypi, name, version,
                              expected_digest):
     """Test fetching of pypi artifact."""
     tmpdir = Path(str(tmpdir))
     digest, path = IndianaJones.fetch_artifact(pypi,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     assert digest == expected_digest
     assert len(list(tmpdir.iterdir())) > 1
     glob_whl_path = next(tmpdir.glob("{}-{}*".format(name, version)))
     assert glob_whl_path.exists()
 def test_fetch_maven_specific(self, tmpdir, maven, name, version,
                               expected_digest):
     """Test fetching of maven artifact."""
     digest, path = IndianaJones.fetch_artifact(maven,
                                                artifact=name,
                                                version=version,
                                                target_dir=str(tmpdir))
     _, artifactId = name.split(':', 1)
     assert digest == expected_digest
     path = Path(path)
     assert path.name == '{}-{}.jar'.format(artifactId, version)
     assert path.exists()
 def test_fetch_npm_specific(self, tmpdir, npm, name, version,
                             expected_digest):
     cache_path = subprocess.check_output(["npm", "config", "get", "cache"],
                                          universal_newlines=True).strip()
     assert ".npm" in cache_path
     package_digest, path = IndianaJones.fetch_artifact(
         npm, artifact=name, version=version, target_dir=str(tmpdir))
     assert len(glob.glob(osp.join(cache_path, name, "*"))) == 1,\
         "there should be just one version of the artifact in the NPM cache"
     assert package_digest == expected_digest
     assert osp.exists(path)
     assert osp.exists(osp.join(cache_path, name, version))
     assert osp.exists(osp.join(str(tmpdir), "package.tgz"))
 def test_fetch_npm_specific(self, tmpdir, npm, name, version,
                             expected_digest):
     """Test fetching of npm artifact."""
     cache_path = Path(
         subprocess.check_output(["npm", "config", "get", "cache"],
                                 universal_newlines=True).strip())
     assert cache_path.name == ".npm"
     package_digest, path = IndianaJones.fetch_artifact(
         npm, artifact=name, version=version, target_dir=str(tmpdir))
     assert len(list((cache_path / name).glob('*'))) == 1,\
         "there should be just one version of the artifact in the NPM cache"
     assert package_digest == expected_digest
     assert Path(path).exists()
     assert (cache_path / name / version).exists()
     assert Path(str(tmpdir / "package.tgz")).exists()
Example #19
 def _transform_licenses(self):
     if self._raw_data.get('LicenseUrl'):
         from f8a_worker.process import IndianaJones  # download_file
         # It's here due to circular dependencies
         from f8a_worker.workers import LicenseCheckTask  # run_scancode
         self._data['declared_licenses'] = [self._raw_data['LicenseUrl']]
         with TemporaryDirectory() as tmpdir:
             try:
                 # Get file from 'LicenseUrl' and let LicenseCheckTask decide what license it is
                 if IndianaJones.download_file(self._raw_data['LicenseUrl'],
                                               tmpdir):
                     scancode_results = LicenseCheckTask.run_scancode(
                         tmpdir)
                     if scancode_results.get('summary',
                                             {}).get('sure_licenses'):
                         self._data['declared_licenses'] = \
                             scancode_results['summary']['sure_licenses']
             except Exception:
                 # Don't raise if IndianaJones or LicenseCheckTask fail
                 pass
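
The shape of LicenseCheckTask.run_scancode's return value is only partly visible here; the lookups above assume roughly this hypothetical structure:

scancode_results = {
    'summary': {
        'sure_licenses': ['MIT'],  # hypothetical: licenses detected with high confidence
    },
}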
    def _download_source_jar(target, ecosystem, arguments):
        artifact_coords = MavenCoordinates.from_str(arguments['name'])
        artifact_coords.packaging = 'jar'  # source is always jar even for war/aar etc.
        sources_classifiers = ['sources', 'src']

        if artifact_coords.classifier not in sources_classifiers:
            for sources_classifier in sources_classifiers:
                artifact_coords.classifier = sources_classifier
                try:
                    _, source_jar_path = IndianaJones.fetch_artifact(
                        ecosystem=ecosystem,
                        artifact=artifact_coords.to_str(omit_version=True),
                        version=arguments['version'],
                        target_dir=target)
                except Exception:
                    if sources_classifier == sources_classifiers[-1]:
                        # fetching of all variants failed
                        raise
                else:
                    return source_jar_path
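
As with _download_pom_xml, the InitAnalysisFlow.execute() snippets below call this helper as:

    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)

The loop tries the 'sources' classifier first, falls back to 'src', and re-raises only when the last variant also fails.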
    def _handle_dotnet_solution(self, data):
        """Handle nuget package metadata."""
        # TODO: reduce cyclomatic complexity
        if not data.get('Metadata'):
            return {}
        data = data['Metadata']
        key_map = (
            ('Id', 'name'),
            ('Description', ),
            ('ProjectUrl', 'homepage'),
            # ('Summary',), ('Copyright',),
            # ('RequireLicenseAcceptance', 'require_license_acceptance'),
        )
        transformed = self.transform_keys(data, key_map)

        if data.get('Authors'):
            transformed['author'] = ','.join(data['Authors'])

        if data.get('LicenseUrl'):
            from f8a_worker.process import IndianaJones  # download_file
            # It's here due to circular dependencies
            from f8a_worker.workers import LicenseCheckTask  # run_scancode
            transformed['declared_licenses'] = [data['LicenseUrl']]
            with TemporaryDirectory() as tmpdir:
                try:
                    # Get file from 'LicenseUrl' and let LicenseCheckTask decide what license it is
                    if IndianaJones.download_file(data['LicenseUrl'], tmpdir):
                        scancode_results = LicenseCheckTask.run_scancode(
                            tmpdir)
                        if scancode_results.get('summary',
                                                {}).get('sure_licenses'):
                            transformed['declared_licenses'] = \
                                scancode_results['summary']['sure_licenses']
                except Exception:
                    # Don't raise if IndianaJones or LicenseCheckTask fail
                    pass

        # transform
        # "DependencyGroups": [
        #    {
        #        "Packages": [
        #            {
        #                "Id": "NETStandard.Library",
        #                "VersionRange": {"OriginalString": "1.6.0"}
        #            }
        #        ]
        #    }
        # ]
        # to ["NETStandard.Library 1.6.0"]
        deps = set()
        for dep_group in data.get('DependencyGroups', []):
            for package in dep_group.get('Packages', []):
                deps.add('{} {}'.format(
                    package.get('Id', ''),
                    package.get('VersionRange', {}).get('OriginalString', '')))
        if deps:
            transformed['dependencies'] = list(deps)

        repository = data.get('Repository')
        if isinstance(repository, dict) and repository:
            transformed['code_repository'] = {
                'type': repository.get('Type'),
                'url': repository.get('Url')
            }
        elif 'ProjectUrl' in data:
            transformed['code_repository'] = self._identify_gh_repo(
                data['ProjectUrl'])

        version = data.get('Version')
        if isinstance(version, dict) and version:
            transformed['version'] = '{}.{}.{}'.format(
                version.get('Major', ''), version.get('Minor', ''),
                version.get('Patch', ''))

        if data.get('Tags'):
            transformed['keywords'] = self._split_keywords(data['Tags'])

        return transformed
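
A rough input/output sketch of the transformations above, with hypothetical metadata and assuming transform_keys lower-cases the single-entry ('Description',) mapping:

data = {'Metadata': {
    'Id': 'Example.Package',
    'Description': 'An example package',
    'ProjectUrl': 'https://github.com/example/example-package',
    'Authors': ['Alice', 'Bob'],
    'Version': {'Major': 1, 'Minor': 2, 'Patch': 3},
    'Tags': 'json parser',
}}
# self._handle_dotnet_solution(data) would then return approximately:
# {'name': 'Example.Package',
#  'description': 'An example package',
#  'homepage': 'https://github.com/example/example-package',
#  'author': 'Alice,Bob',
#  'version': '1.2.3',
#  'code_repository': <result of self._identify_gh_repo(ProjectUrl)>,
#  'keywords': <result of self._split_keywords('json parser')>}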
Example #22
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        # make sure we store package name based on ecosystem package naming case sensitivity
        arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
        v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we scale and there are
            # 2+ workers running this task, they can potentially schedule two
            # flows of the same type at the same time
            if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
                # we need to propagate flags that were passed to the flow, but
                # not E/P/V - this way we are sure that e.g. graph import is
                # scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                               .format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path
                )
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                    arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                            format(n=arguments.get('name'),
                                   v=arguments.get('version'),
                                   err=str(e))
                        )

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # make sure we store package name in its normalized form
        arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                                   arguments['name'])

        if pattern_ignore.findall(arguments['version']):
            self.log.info("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))
            raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))

        # Don't try ingestion for private packages
        if is_pkg_public(arguments['ecosystem'], arguments['name']):
            self.log.info("Ingestion flow for {} {}".format(
                arguments['ecosystem'], arguments['name']))
        else:
            self.log.info("Private package ingestion ignored {} {}".format(
                arguments['ecosystem'], arguments['name']))
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        p = Package.get_or_create(db,
                                  ecosystem_id=ecosystem.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                arguments['analysis_already_exists'] = True
                self.log.debug(
                    "Arguments returned by initAnalysisFlow without force: {}".
                    format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)
        npm_dir = self.configuration.NPM_DATA_DIR

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                            .format(n=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(e)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)
            if arguments['ecosystem'] == "npm":
                shutil.rmtree(npm_dir, True)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug(
            "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments