def analyses(app):
    """Prepare the known set of data used by tests."""
    e1 = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    p1 = Package(ecosystem=e1, name='arrify')
    v1 = Version(package=p1, identifier='1.0.1')
    model1 = Analysis(version=v1, started_at=now, finished_at=later)
    app.rdb.session.add(model1)

    e2 = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    p2 = Package(ecosystem=e2, name='flexmock')
    v2 = Version(package=p2, identifier='0.10.1')
    model2 = Analysis(version=v2, started_at=later, access_count=1)
    app.rdb.session.add(model2)
    app.rdb.session.commit()

    worker_results2 = {'a': 'b', 'c': 'd', 'e': 'f', 'g': 'h', 'i': 'j',
                       'digests': {'details': [{'artifact': True,
                                                'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results2.items():
        app.rdb.session.add(WorkerResult(analysis_id=model2.id, worker=w, task_result=tr))

    model3 = Analysis(version=v2, started_at=later, access_count=1,
                      audit={'audit': {'audit': 'audit', 'e': 'f', 'g': 'h'}, 'a': 'b', 'c': 'd'})
    app.rdb.session.add(model3)
    app.rdb.session.commit()

    worker_results3 = {'digests': {'details': [{'artifact': True,
                                                'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results3.items():
        app.rdb.session.add(WorkerResult(analysis_id=model3.id, worker=w, task_result=tr))
    app.rdb.session.commit()

    return (model1, model2, model3)
def fill_analyses(app):
    """Prepare static data used by unit tests."""
    # TODO: can not find any usage of this function
    ecosystems = [
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi,
                  url='https://pypi.python.org/', fetch_url='https://pypi.python.org/pypi'),
        Ecosystem(name='npm', backend=EcosystemBackend.npm,
                  url='https://www.npmjs.com/', fetch_url='https://registry.npmjs.org/'),
        Ecosystem(name='go', backend=EcosystemBackend.scm),
    ]
    packages = [
        Package(name='flexmock', ecosystem=ecosystems[0]),
        Package(name='requests', ecosystem=ecosystems[0]),
        Package(name='sequence', ecosystem=ecosystems[1]),
        Package(name='arrify', ecosystem=ecosystems[1]),
        Package(name='serve-static', ecosystem=ecosystems[1]),
    ]
    versions = [
        Version(identifier='0.10.1', package=packages[0]),
        Version(identifier='0.9.1', package=packages[0]),
        Version(identifier='2.0.0', package=packages[1]),
        Version(identifier='2.2.1', package=packages[2]),
        Version(identifier='1.0.1', package=packages[3]),
        Version(identifier='1.7.1', package=packages[4]),
    ]
    analyses = [
        Analysis(version=versions[0], started_at=now),                     # pypi/flexmock/0.10.1
        Analysis(version=versions[0], started_at=later, access_count=1),   # pypi/flexmock/0.10.1
        Analysis(version=versions[1], started_at=even_later),              # pypi/flexmock/0.9.1
        Analysis(version=versions[2], started_at=now),                     # pypi/requests/2.0.0
        Analysis(version=versions[3], started_at=later),                   # npm/sequence/2.2.1
        Analysis(version=versions[4], started_at=now, finished_at=later),  # npm/arrify/1.0.1
        Analysis(version=versions[5], started_at=now, finished_at=later,
                 release='npm:serve-static:1.7.1'),                        # npm/serve-static/1.7.1
    ]
    # worker results that correspond to analyses above
    worker_results = [
        WorkerResult(worker='digests', analysis=analyses[1],
                     task_result={'details': [{'artifact': True,
                                               'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}),
        WorkerResult(worker='static_analysis', task_result={'details': []}, analysis=analyses[1]),
        WorkerResult(worker='source_licenses',
                     task_result={'schema': {'name': 'source_licenses', 'version': '1-0-0'}},
                     analysis=analyses[1]),
    ]
    # TODO: just a placeholder, it won't work in real tests!!!
    package_gh_usage = None

    # guard against the None placeholder so list concatenation does not raise
    for a in ecosystems + packages + versions + analyses + worker_results + (package_gh_usage or []):
        app.rdb.session.add(a)
    app.rdb.session.commit()

    return (ecosystems, packages, versions, analyses, worker_results, package_gh_usage)
def db_results():
    """Mimic SQLAlchemy query result."""
    ecosystem = Ecosystem()
    ecosystem.name = 'maven'

    package = Package()
    package.ecosystem = ecosystem
    package.name = 'net.iharder:base64'

    upstream = Upstream()
    upstream.url = 'https://github.com/omalley/base64'
    upstream.package = package

    return [upstream]
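# A minimal sketch of how the stub above could be consumed. Assumption (not from
# the source): db_results is registered as a pytest fixture in conftest.py; the
# test below is illustrative and only asserts fields set by the stub itself.
def test_upstream_stub(db_results):
    upstream = db_results[0]
    assert upstream.package.ecosystem.name == 'maven'
    assert upstream.package.name == 'net.iharder:base64'
    assert upstream.url == 'https://github.com/omalley/base64'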
def retrieve_bookkeeping_for_ecosystem_package(self, ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)

    stat = self.db.query(PackageWorkerResult).\
        join(PackageAnalysis).\
        filter(PackageAnalysis.package == p)

    worker_stats = []
    for package_worker_result in stat.all():
        entry = {"worker_name": package_worker_result.worker,
                 "has_error": package_worker_result.error,
                 "task_result": package_worker_result.task_result,
                 "started_at": package_worker_result.started_at,
                 "ended_at": package_worker_result.ended_at}
        worker_stats.append(entry)

    version_count = self.db.query(Version).join(Package).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p).count()

    p_versions = self.db.query(Version).join(Package).join(Ecosystem).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p)

    return {"ecosystem": e.name,
            "package": p.name,
            "package_version_count": version_count,
            "package_level_workers": worker_stats,
            "analysed_versions": [v.identifier for v in p_versions]}
def retrieve_bookkeeping_for_epv(self, ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)
    v = self.db.query(Version).join(Package).join(Ecosystem). \
        filter(Package.ecosystem == e). \
        filter(Version.package == p). \
        filter(Version.identifier == version).one()

    stat = self.db.query(WorkerResult).\
        join(Analysis).join(Version).\
        filter(Analysis.version == v)

    worker_stats = []
    for worker_result in stat.all():
        entry = {"worker_name": worker_result.worker,
                 "has_error": worker_result.error,
                 "task_result": worker_result.task_result,
                 "started_at": worker_result.started_at,
                 "ended_at": worker_result.ended_at}
        worker_stats.append(entry)

    return {"ecosystem": e.name,
            "package": p.name,
            "version": v.identifier,
            "workers": worker_stats}
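# A minimal usage sketch for the two bookkeeping methods above. Assumptions (not
# from the source): they live on a helper class, here called `BookKeeping`, that
# holds an open SQLAlchemy session as `self.db`; the session factory name reuses
# create_db_scoped_session from the test setup further below. The dict keys
# asserted here come from the methods themselves.
bk = BookKeeping(db=create_db_scoped_session())
pkg_info = bk.retrieve_bookkeeping_for_ecosystem_package('npm', 'arrify')
epv_info = bk.retrieve_bookkeeping_for_epv('npm', 'arrify', '1.0.1')
assert 'analysed_versions' in pkg_info
assert 'workers' in epv_info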
def fill_packages_for_paging(app, request):
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))
    app.rdb.session.commit()
def fill_packages_for_paging(app, request): """Create and store set of packages used by unit tests.""" e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi) app.rdb.session.add(e) for p in range(0, 11): app.rdb.session.add(Package(ecosystem=e, name=str(p))) app.rdb.session.commit()
def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('ecosystem')) # get rid of version if scheduled from the core analyses arguments.pop('version', None) arguments.pop('document_id', None) db = self.storage.session try: ecosystem = Ecosystem.by_name(db, arguments['ecosystem']) except NoResultFound: raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem']) package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name']) url = self.get_upstream_url(arguments) upstream = self.get_upstream_entry(package, url) if upstream is None: upstream = self.add_or_update_upstream(package, url) arguments['url'] = upstream.url if not arguments.get('force'): # can potentially schedule two flows of a same type at the same # time as there is no lock, but let's say it's OK if upstream.updated_at is not None \ and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL: self.log.info( 'Skipping upstream package check as data are considered as recent - ' 'last update %s.', upstream.updated_at) # keep track of start, but do not schedule nothing more # discard changes like updates db.rollback() return arguments # if this fails, it's actually OK, as there could be concurrency package_analysis = PackageAnalysis( package_id=package.id, started_at=datetime.datetime.utcnow(), finished_at=None) db.add(package_analysis) # keep track of updates upstream.updated_at = datetime.datetime.utcnow() db.commit() arguments['document_id'] = package_analysis.id return arguments
def retrieve_bookkeeping_for_ecosystem_package(ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        version_count = _count(
            db, db.query(Version).join(Package).filter(
                Package.ecosystem == e).filter(Version.package == p))
        stat = db.query(PackageWorkerResult.worker,
                        PackageWorkerResult.error,
                        PackageWorkerResult.task_result).join(PackageAnalysis). \
            filter(PackageAnalysis.package == p). \
            all()
        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)
        p_versions = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]
            }
        }
    except NoResultFound:
        result = {"error": "No such package: %s/%s" % (ecosystem, package)}
    except SQLAlchemyError:
        result = {
            "error": "Error encountered while fetching data. Please check logs."
        }

    return result
def retrieve_bookkeeping_for_epv(ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        v = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()
        stat = db.query(WorkerResult.worker,
                        WorkerResult.error,
                        WorkerResult.task_result). \
            join(Analysis).join(Version). \
            filter(Analysis.version == v).all()
        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats
            }
        }
    except NoResultFound:
        return {
            "error": "No such version: %s/%s/%s" % (ecosystem, package, version)
        }
    except SQLAlchemyError:
        result = {
            "error": "Error encountered while fetching data. Please check logs."
        }

    return result
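# A minimal usage sketch for the module-level bookkeeping helpers above: the
# returned dict carries either a "summary" or an "error" key, so callers can
# branch on that. The EPV values here are illustrative only.
result = retrieve_bookkeeping_for_epv('npm', 'arrify', '1.0.1')
if 'error' in result:
    print(result['error'])
else:
    for worker in result['summary']['workers']:
        print(worker['worker_name'], worker['has_error'])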
def setup_method(self, method):
    """Prepare the test database: one ecosystem/package/version with two analyses."""
    rdb()
    self.s = create_db_scoped_session()
    self.en = 'foo'
    self.pn = 'bar'
    self.vi = '1.1.1'
    self.e = Ecosystem(name=self.en, backend=EcosystemBackend.maven)
    self.p = Package(ecosystem=self.e, name=self.pn)
    self.v = Version(package=self.p, identifier=self.vi)
    self.a = Analysis(version=self.v, finished_at=datetime.datetime.now())
    self.a2 = Analysis(version=self.v,
                       finished_at=datetime.datetime.now() + datetime.timedelta(seconds=10))
    self.s.add(self.a)
    self.s.add(self.a2)
    self.s.commit()
    self.bp = BayesianPostgres(
        connection_string=configuration.POSTGRES_CONNECTION)
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)

    db = self.storage.session
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    package = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    upstream = self.get_upstream_entry(db, package, self.get_upstream_url(arguments))
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # can potentially schedule two flows of a same type at the same
        # time as there is no lock, but let's say it's OK
        # compare the age of the record against the interval (not the other way around)
        if upstream.updated_at is not None \
                and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info(
                'Skipping upstream package check as data are considered as recent - '
                'last update %s.', upstream.updated_at)
            # keep track of start, but do not schedule anything more
            # discard changes like updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(package_id=package.id,
                                       started_at=datetime.datetime.now(),
                                       finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.now()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def test_f8a_fetcher(self, rdb, npm):
    """Test F8aReleasesFetcher."""
    # create initial dataset
    package = Package(ecosystem=npm, name='f8a')
    rdb.add(package)
    rdb.commit()
    versions = {'0.5.0', '0.5.1', '0.6.0', '0.6.4', '0.7.0',
                '0.8.0', '0.9.0', '1.0.0', '1.0.5'}
    for v in versions:
        version = Version(package=package, identifier=v)
        rdb.add(version)
        rdb.commit()
        analysis = Analysis(version=version)
        # Fetcher only selects finished analyses
        analysis.finished_at = datetime.datetime.utcnow()
        rdb.add(analysis)
        rdb.commit()

    f = F8aReleasesFetcher(npm, rdb)

    r = f.fetch_releases('f8a')[1]

    # make sure we fetched the same stuff we inserted
    assert set(r) == versions

    # the latest version should come last
    assert r.pop() == '1.0.5'

    # try different dependency specs
    s = get_ecosystem_solver(npm, with_fetcher=f)
    assert s.solve(['f8a ^0.5.0'])['f8a'] == '0.5.1'
    assert s.solve(['f8a 0.x.x'])['f8a'] == '0.9.0'
    assert s.solve(['f8a >1.0.0'])['f8a'] == '1.0.5'
    assert s.solve(['f8a ~>0.6.0'])['f8a'] == '0.6.4'

    # check that with `all_versions` we return all the relevant ones
    assert set(s.solve(['f8a >=0.6.0'], all_versions=True)['f8a']) == \
        (versions - {'0.5.0', '0.5.1'})
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    # make sure we store package name based on ecosystem package naming case sensitivity
    arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we will scale and there will be
        # 2+ workers running this task they can potentially schedule two
        # flows of a same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to flow, but not
            # E/P/V - this way we are sure that for example graph import is
            # scheduled (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path
            )
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.format(
                            n=arguments.get('name'),
                            v=arguments.get('version'),
                            err=str(e))
                    )

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # make sure we store package name in its normalized form
    arguments['name'] = normalize_package_name(ecosystem.backend.name, arguments['name'])

    if len(pattern_ignore.findall(arguments['version'])) > 0:
        self.log.info("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))
        raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))

    # Don't try ingestion for private packages
    if is_pkg_public(arguments['ecosystem'], arguments['name']):
        self.log.info("Ingestion flow for {} {}".format(
            arguments['ecosystem'], arguments['name']))
    else:
        self.log.info("Private package ingestion ignored {} {}".format(
            arguments['ecosystem'], arguments['name']))
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            arguments['analysis_already_exists'] = True
            self.log.debug(
                "Arguments returned by initAnalysisFlow without force: {}".format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    npm_dir = self.configuration.NPM_DATA_DIR

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.format(
                            n=arguments.get('name'),
                            v=arguments.get('version'),
                            err=str(e)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(
                    cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)
        if arguments['ecosystem'] == "npm":
            shutil.rmtree(npm_dir, True)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id

    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug(
        "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments