def analyses(app):
    """Prepare the known set of data used by tests."""
    e1 = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    p1 = Package(ecosystem=e1, name='arrify')
    v1 = Version(package=p1, identifier='1.0.1')
    model1 = Analysis(version=v1, started_at=now, finished_at=later)
    app.rdb.session.add(model1)

    e2 = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    p2 = Package(ecosystem=e2, name='flexmock')
    v2 = Version(package=p2, identifier='0.10.1')
    model2 = Analysis(version=v2, started_at=later, access_count=1)
    app.rdb.session.add(model2)
    app.rdb.session.commit()

    worker_results2 = {
        'a': 'b', 'c': 'd', 'e': 'f', 'g': 'h', 'i': 'j',
        'digests': {'details': [{'artifact': True,
                                 'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]},
    }
    for worker, task_result in worker_results2.items():
        app.rdb.session.add(WorkerResult(analysis_id=model2.id, worker=worker,
                                         task_result=task_result))

    model3 = Analysis(version=v2, started_at=later, access_count=1,
                      audit={'audit': {'audit': 'audit', 'e': 'f', 'g': 'h'},
                             'a': 'b', 'c': 'd'})
    app.rdb.session.add(model3)
    app.rdb.session.commit()

    worker_results3 = {'digests': {'details': [
        {'artifact': True, 'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for worker, task_result in worker_results3.items():
        app.rdb.session.add(WorkerResult(analysis_id=model3.id, worker=worker,
                                         task_result=task_result))
    app.rdb.session.commit()

    return model1, model2, model3

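# A minimal usage sketch (hypothetical test, assuming `analyses` and `app` are
# registered as pytest fixtures elsewhere): the tuple unpacks into the three
# Analysis rows created above.
# def test_analyses_shape(app, analyses):
#     model1, model2, model3 = analyses
#     assert model1.finished_at == later
#     assert model2.access_count == model3.access_count == 1
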
def fill_analyses(app):
    """Prepare static data used by unit tests."""
    # TODO: cannot find any usage of this function
    ecosystems = [
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                  url='https://pypi.python.org/',
                  fetch_url='https://pypi.python.org/pypi'),
        Ecosystem(name='npm', backend=EcosystemBackend.npm,
                  url='https://www.npmjs.com/',
                  fetch_url='https://registry.npmjs.org/'),
        Ecosystem(name='go', backend=EcosystemBackend.scm),
    ]
    packages = [
        Package(name='flexmock', ecosystem=ecosystems[0]),
        Package(name='requests', ecosystem=ecosystems[0]),
        Package(name='sequence', ecosystem=ecosystems[1]),
        Package(name='arrify', ecosystem=ecosystems[1]),
        Package(name='serve-static', ecosystem=ecosystems[1]),
    ]
    versions = [
        Version(identifier='0.10.1', package=packages[0]),
        Version(identifier='0.9.1', package=packages[0]),
        Version(identifier='2.0.0', package=packages[1]),
        Version(identifier='2.2.1', package=packages[2]),
        Version(identifier='1.0.1', package=packages[3]),
        Version(identifier='1.7.1', package=packages[4]),
    ]
    analyses = [
        Analysis(version=versions[0], started_at=now),                    # pypi/flexmock/0.10.1
        Analysis(version=versions[0], started_at=later, access_count=1),  # pypi/flexmock/0.10.1
        Analysis(version=versions[1], started_at=even_later),             # pypi/flexmock/0.9.1
        Analysis(version=versions[2], started_at=now),                    # pypi/requests/2.0.0
        Analysis(version=versions[3], started_at=later),                  # npm/sequence/2.2.1
        Analysis(version=versions[4], started_at=now, finished_at=later),  # npm/arrify/1.0.1
        Analysis(version=versions[5], started_at=now, finished_at=later,
                 release='npm:serve-static:1.7.1'),                       # npm/serve-static/1.7.1
    ]
    # worker results that correspond to analyses above
    worker_results = [
        WorkerResult(worker='digests', analysis=analyses[1],
                     task_result={'details': [{'artifact': True,
                                               'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}),
        WorkerResult(worker='static_analysis', task_result={'details': []},
                     analysis=analyses[1]),
        WorkerResult(worker='source_licenses',
                     task_result={'schema': {'name': 'source_licenses',
                                             'version': '1-0-0'}},
                     analysis=analyses[1]),
    ]
    # TODO: just a placeholder, it won't work in real tests!!!
    # An empty list (instead of None) keeps the concatenation below from
    # raising TypeError.
    package_gh_usage = []
    for entity in (ecosystems + packages + versions + analyses +
                   worker_results + package_gh_usage):
        app.rdb.session.add(entity)
    app.rdb.session.commit()
    return ecosystems, packages, versions, analyses, worker_results, package_gh_usage

def db_results():
    """Mimic a SQLAlchemy query result."""
    ecosystem = Ecosystem()
    ecosystem.name = 'maven'
    package = Package()
    package.ecosystem = ecosystem
    package.name = 'net.iharder:base64'
    upstream = Upstream()
    upstream.url = 'https://github.com/omalley/base64'
    upstream.package = package
    return [upstream]

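# A hedged sketch of how db_results() might back a mocked query (flexmock is
# already used by the tests in this section; the exact call chain stubbed
# here is an assumption, not taken from a real test):
# flexmock(session).should_receive('query').and_return(
#     flexmock(all=lambda: db_results()))
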
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    ver = arguments['version']
    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        db_session = StoragePool.get_connected_storage('BayesianPostgres').session
        if not Ecosystem.by_name(db_session, eco).is_backed_by(EcosystemBackend.maven):
            self.log.error('Could not get sources for package {e}/{p}/{v}'.format(
                e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = self.run_scancode(cache_path)
    return result_data

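# A hedged invocation sketch (the EPV values are illustrative, not from a
# real run): for non-maven ecosystems a failure to fetch sources is fatal;
# for maven the task falls back to scanning the extracted binary jar.
# self.execute(arguments={'ecosystem': 'maven',
#                         'name': 'net.iharder:base64',
#                         'version': '2.3.9'})
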
def test_execute_with_mock_anitya(self, ecosystem, project, md5sum, dist_git):
    rdb()
    s = create_db_scoped_session()
    dummy_homepage = "http://project-homepage.com"
    dummy_response = Response()
    dummy_response.status_code = 200
    s.add(Ecosystem(name='npm', backend=EcosystemBackend.npm))
    s.commit()
    # fill in the key-value mapping in the cache
    DownstreamMapCache()[md5sum] = dist_git

    task = AnityaTask.create_test_instance(task_name='anitya')
    args = {'ecosystem': ecosystem, 'name': project}
    flexmock(task).should_receive("_get_project_homepage").once().and_return(dummy_homepage)
    flexmock(task).should_receive("_get_artifact_hash").once().and_return(md5sum)
    flexmock(task).should_receive("_create_anitya_project").once().and_return(dummy_response)
    flexmock(task).should_receive("_add_downstream_mapping").once().and_return(dummy_response)

    results = task.execute(arguments=args)
    assert results is None

def execute(self, arguments):
    """Task to mark vulnerable packages in graph.

    :param arguments: dictionary with task arguments
    :return: None
    """
    self._strict_assert(arguments.get('ecosystem'))

    wanted_cves = set(arguments.get('cve_filter', []))
    victims_cls = VictimsDB if not wanted_cves else FilteredVictimsDB

    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

    with victims_cls.build_from_git(wanted=wanted_cves) as db:
        self.log.info('Storing the VictimsDB zip on S3')
        db.store_on_s3()

        vulnerable_packages = self.get_vulnerable_packages(db, ecosystem)
        self.create_in_graph(vulnerable_packages, ecosystem)
        self.mark_in_graph(vulnerable_packages, ecosystem)
        self.notify_gemini(vulnerable_packages, ecosystem)

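# A hedged invocation sketch: with no 'cve_filter' the full VictimsDB is
# used, while a non-empty filter selects FilteredVictimsDB restricted to the
# listed CVE ids (the id below is illustrative only):
# self.execute(arguments={'ecosystem': 'maven',
#                         'cve_filter': ['CVE-2017-5638']})
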
def retrieve_bookkeeping_for_epv(self, ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)
    v = self.db.query(Version).join(Package).join(Ecosystem).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p).\
        filter(Version.identifier == version).one()
    stat = self.db.query(WorkerResult).\
        join(Analysis).join(Version).\
        filter(Analysis.version == v)

    worker_stats = []
    for worker_result in stat.all():
        entry = {"worker_name": worker_result.worker,
                 "has_error": worker_result.error,
                 "task_result": worker_result.task_result,
                 "started_at": worker_result.started_at,
                 "ended_at": worker_result.ended_at}
        worker_stats.append(entry)

    return {"ecosystem": e.name,
            "package": p.name,
            "version": v.identifier,
            "workers": worker_stats}

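# A hedged sketch of the returned shape (values are illustrative, using an
# EPV that appears in the fixtures above):
# retrieve_bookkeeping_for_epv('npm', 'arrify', '1.0.1') ->
# {'ecosystem': 'npm', 'package': 'arrify', 'version': '1.0.1',
#  'workers': [{'worker_name': 'digests', 'has_error': False,
#               'task_result': {...}, 'started_at': ..., 'ended_at': ...}]}
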
def test_execute(self, tmpdir):
    artifact_digest, artifact_path = IndianaJones.fetch_artifact(
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi),
        artifact=PYPI_MODULE_NAME,
        version=PYPI_MODULE_VERSION,
        target_dir=str(tmpdir))

    args = dict.fromkeys(('ecosystem', 'name', 'version'), 'some-value')
    # flexmock(EPVCache).should_receive('get_extracted_source_tarball').and_return(str(tmpdir))
    flexmock(EPVCache).should_receive('get_source_tarball').and_return(artifact_path)
    task = DigesterTask.create_test_instance(task_name='digests')
    results = task.execute(arguments=args)

    assert results is not None
    assert isinstance(results, dict)
    assert set(results.keys()) == {'details', 'status', 'summary'}
    artifact_details = None
    for details in results['details']:
        assert {'sha256', 'sha1', 'md5', 'ssdeep', 'path'}.issubset(set(details.keys()))
        if details.get('artifact'):
            artifact_details = details
    # there are artifact details
    assert artifact_details is not None
    # the artifact digest which Indy returns is the same as the one from DigesterTask
    assert artifact_digest == artifact_details['sha256'] == compute_digest(artifact_path)
    assert artifact_details['path'] == 'six-1.0.0.tar.gz'

def rubygems(rdb):
    """Prepare database with Ruby gems ecosystem."""
    rubygems = Ecosystem(name='rubygems', backend=EcosystemBackend.rubygems,
                         fetch_url='https://rubygems.org/api/v1')
    rdb.add(rubygems)
    rdb.commit()
    return rubygems


def nuget(rdb):
    """Prepare database with Nuget ecosystem."""
    nuget = Ecosystem(name='nuget', backend=EcosystemBackend.nuget,
                      fetch_url='https://api.nuget.org/packages/')
    rdb.add(nuget)
    rdb.commit()
    return nuget


def pypi(rdb):
    """Prepare database with Pypi ecosystem."""
    pypi = Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                     fetch_url='https://pypi.python.org/pypi')
    rdb.add(pypi)
    rdb.commit()
    return pypi


def npm(rdb):
    """Prepare database with NPM ecosystem."""
    npm = Ecosystem(name='npm', backend=EcosystemBackend.npm,
                    fetch_url='https://registry.npmjs.org/')
    rdb.add(npm)
    rdb.commit()
    return npm


def maven(rdb):
    """Prepare database with Maven ecosystem."""
    maven = Ecosystem(name='maven', backend=EcosystemBackend.maven, fetch_url='')
    rdb.add(maven)
    rdb.commit()
    return maven

def fill_packages_for_paging(app, request):
    """Create and store set of packages used by unit tests."""
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))
    app.rdb.session.commit()

def maven(rdb):
    """Prepare database with Maven ecosystem."""
    maven = Ecosystem(name='maven', backend=EcosystemBackend.maven, fetch_url='')
    rdb.add(maven)
    rdb.commit()
    return maven


def npm(rdb):
    """Prepare database with NPM ecosystem."""
    npm = Ecosystem(name='npm', backend=EcosystemBackend.npm,
                    fetch_url='https://registry.npmjs.org/')
    rdb.add(npm)
    rdb.commit()
    return npm

def iter_unknown_dependencies(storage_pool, node_args):
    """Collect unknown dependencies."""
    # Be safe here as fatal errors will cause errors in Dispatcher
    try:
        aggregated = storage_pool.get('UnknownDependencyFetcherTask')
        arguments = []
        for element in aggregated["result"]:
            epv = element.split(':')
            ecosystem = epv[0]
            if Ecosystem.by_name(
                    StoragePool.get_connected_storage('BayesianPostgres').session,
                    ecosystem).is_backed_by(EcosystemBackend.maven):
                # maven names are groupId:artifactId, so the EPV has four parts
                name = '{}:{}'.format(epv[1], epv[2])
                version = epv[3]
            else:
                name = epv[1]
                version = epv[2]
            analysis_arguments = _create_analysis_arguments(ecosystem, name, version)
            # TODO: Remove force=True once data-importer is smart enough
            # to ingest missing packages from s3.
            analysis_arguments.update({"recursive_limit": 0, "force": True})
            arguments.append(analysis_arguments)
        logger.info("Arguments for next flows: %s", arguments)
        return arguments
    except Exception as e:
        logger.exception("Failed to collect unknown dependencies: %s", e)
        return []

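# A hedged sketch of the EPV string format this loop expects (the examples
# are illustrative): maven entries carry four colon-separated parts because
# the name itself contains a colon, while other ecosystems carry three.
# 'maven:org.apache.commons:commons-lang3:3.4'
#     -> ecosystem='maven', name='org.apache.commons:commons-lang3', version='3.4'
# 'npm:arrify:1.0.1'
#     -> ecosystem='npm', name='arrify', version='1.0.1'
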
def retrieve_bookkeeping_for_ecosystem(ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        package_count = _count(db, db.query(Package).filter(Package.ecosystem == e))
        pv_count = _count(db, db.query(Version).join(Package).filter(Package.ecosystem == e))
        result = {
            "summary": {
                "ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
        }
    except NoResultFound:
        result = {"error": "No such ecosystem: %s" % ecosystem}
    except SQLAlchemyError:
        result = {"error": "Error encountered while fetching data. Please check logs."}
    return result

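# A hedged sketch of the two possible result shapes (counts illustrative):
# retrieve_bookkeeping_for_ecosystem('npm')
#     -> {'summary': {'ecosystem': 'npm', 'package_count': 3,
#                     'package_version_count': 3}}
# retrieve_bookkeeping_for_ecosystem('no-such-ecosystem')
#     -> {'error': 'No such ecosystem: no-such-ecosystem'}
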
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))

    rdb_session = StoragePool.get_connected_storage('BayesianPostgres').session
    name = arguments['name']
    ecosystem = arguments['ecosystem']
    if ecosystem == 'go':
        name = quote(name, safe='')
    project_url = self.configuration.libraries_io_project_url(
        Ecosystem.by_name(rdb_session, ecosystem), name)
    project = get_response(project_url)
    versions = project['versions']
    details = {
        'dependent_repositories': {'count': project['dependent_repos_count']},
        'dependents': {'count': project['dependents_count']},
        'releases': {'count': len(versions),
                     'recent': self.recent_releases(versions)}
    }
    return {'status': 'success', 'summary': [], 'details': details}

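# A hedged sketch of the returned payload (the counts are illustrative, not
# real libraries.io data):
# {'status': 'success', 'summary': [],
#  'details': {'dependent_repositories': {'count': 120},
#              'dependents': {'count': 48},
#              'releases': {'count': 7, 'recent': [...]}}}
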
def rubygems(rdb):
    """Prepare database with Ruby gems ecosystem."""
    rubygems = Ecosystem(name='rubygems', backend=EcosystemBackend.rubygems,
                         fetch_url='https://rubygems.org/api/v1')
    rdb.add(rubygems)
    rdb.commit()
    return rubygems


def nuget(rdb):
    """Prepare database with Nuget ecosystem."""
    nuget = Ecosystem(name='nuget', backend=EcosystemBackend.nuget,
                      fetch_url='https://api.nuget.org/packages/')
    rdb.add(nuget)
    rdb.commit()
    return nuget


def pypi(rdb):
    """Prepare database with Pypi ecosystem."""
    pypi = Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                     fetch_url='https://pypi.python.org/pypi')
    rdb.add(pypi)
    rdb.commit()
    return pypi

def retrieve_bookkeeping_for_ecosystem_package(self, ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)
    stat = self.db.query(PackageWorkerResult).\
        join(PackageAnalysis).\
        filter(PackageAnalysis.package == p)

    worker_stats = []
    for package_worker_result in stat.all():
        entry = {"worker_name": package_worker_result.worker,
                 "has_error": package_worker_result.error,
                 "task_result": package_worker_result.task_result,
                 "started_at": package_worker_result.started_at,
                 "ended_at": package_worker_result.ended_at}
        worker_stats.append(entry)

    version_count = self.db.query(Version).join(Package).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p).count()
    p_versions = self.db.query(Version).join(Package).join(Ecosystem).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p)

    return {"ecosystem": e.name,
            "package": p.name,
            "package_version_count": version_count,
            "package_level_workers": worker_stats,
            "analysed_versions": [v.identifier for v in p_versions]}

def fill_packages_for_paging(app, request):
    """Create and store set of packages used by unit tests."""
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))
    app.rdb.session.commit()

def maven(rdb):
    """Prepare database with Maven ecosystem."""
    maven = Ecosystem(name='maven', backend=EcosystemBackend.maven,
                      fetch_url='https://repo.maven.apache.org/maven2/')
    rdb.add(maven)
    rdb.commit()
    return maven

def _create_analysis_arguments(ecosystem, name, version):
    """Create arguments for analysis."""
    session = StoragePool.get_connected_storage('BayesianPostgres').session
    is_maven = Ecosystem.by_name(session, ecosystem).is_backed_by(EcosystemBackend.maven)
    return {
        'ecosystem': ecosystem,
        'name': MavenCoordinates.normalize_str(name) if is_maven else name,
        'version': version
    }

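# A hedged example (the version is illustrative): for maven packages the
# groupId:artifactId string goes through MavenCoordinates.normalize_str();
# names in other ecosystems pass through unchanged.
# _create_analysis_arguments('maven', 'net.iharder:base64', '2.3.9')
#     -> {'ecosystem': 'maven', 'name': 'net.iharder:base64', 'version': '2.3.9'}
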
def normalize_package_name(ecosystem, name):
    """Normalize package name based on ecosystem."""
    normalized_name = name
    if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(EcosystemBackend.pypi):
        # assign the transformed name; the original code discarded the result
        normalized_name = case_sensitivity_transform(ecosystem, name)
    elif ecosystem == 'go':
        # a go package name is the host+path part of a URL, thus it can be URL encoded
        normalized_name = unquote(name)
    return normalized_name

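# A hedged example of both branches (package names are illustrative; the
# exact pypi output depends on case_sensitivity_transform):
# normalize_package_name('pypi', 'Flask') -> case-normalized per PyPI rules
# normalize_package_name('go', 'github.com%2Fgorilla%2Fmux')
#     -> 'github.com/gorilla/mux'
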
def _normalize_package_name(self, node_args):
    """Normalize package name in node arguments."""
    if not node_args:
        return

    if 'name' in node_args and 'ecosystem' in node_args:
        ecosystem = Ecosystem.by_name(self.postgres.session, node_args['ecosystem'])
        node_args['name'] = normalize_package_name(
            ecosystem_backend=ecosystem.backend.name,
            name=node_args['name'])

def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)
    arguments.pop('document_id', None)

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    package = Package.get_or_create(db, ecosystem_id=ecosystem.id,
                                    name=arguments['name'])
    url = self.get_upstream_url(arguments)
    upstream = self.get_upstream_entry(package, url)
    if upstream is None:
        upstream = self.add_or_update_upstream(package, url)
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # we can potentially schedule two flows of the same type at the same
        # time as there is no lock, but let's say that's OK
        if upstream.updated_at is not None \
                and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info('Skipping upstream package check as data are considered recent - '
                          'last update %s.', upstream.updated_at)
            # keep track of the start, but do not schedule anything more;
            # discard changes such as updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(
        package_id=package.id,
        started_at=datetime.datetime.utcnow(),
        finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.utcnow()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments

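# A hedged flow sketch (argument values are illustrative): a package not
# checked within _UPDATE_INTERVAL gets a new PackageAnalysis and a
# 'document_id'; a recently updated one short-circuits and returns the
# arguments without 'document_id', unless 'force' is set.
# self.execute(arguments={'ecosystem': 'npm', 'name': 'arrify'})
#     -> {'ecosystem': 'npm', 'name': 'arrify', 'url': ..., 'document_id': ...}
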
def retrieve_bookkeeping_for_ecosystem(self, ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    package_count = self.db.query(Package).filter(Package.ecosystem == e).count()
    pv_count = self.db.query(Version).join(Package).filter(Package.ecosystem == e).count()
    return {"ecosystem": e.name,
            "package_count": package_count,
            "package_version_count": pv_count}