def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    ver = arguments['version']

    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        if not Ecosystem.by_name(
                StoragePool.get_connected_storage('BayesianPostgres').session,
                eco).is_backed_by(EcosystemBackend.maven):
            self.log.error(
                'Could not get sources for package {e}/{p}/{v}'.format(
                    e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v},'
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

    result_data = self.run_scancode(cache_path)
    return result_data
def retrieve_bookkeeping_for_ecosystem_package(self, ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)

    stat = self.db.query(PackageWorkerResult).\
        join(PackageAnalysis).\
        filter(PackageAnalysis.package == p)
    worker_stats = []
    for package_worker_result in stat.all():
        entry = {"worker_name": package_worker_result.worker,
                 "has_error": package_worker_result.error,
                 "task_result": package_worker_result.task_result,
                 "started_at": package_worker_result.started_at,
                 "ended_at": package_worker_result.ended_at}
        worker_stats.append(entry)

    version_count = self.db.query(Version).join(Package).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p).count()
    p_versions = self.db.query(Version).join(Package).join(Ecosystem).\
        filter(Package.ecosystem == e).\
        filter(Version.package == p)

    return {"ecosystem": e.name,
            "package": p.name,
            "package_version_count": version_count,
            "package_level_workers": worker_stats,
            "analysed_versions": [v.identifier for v in p_versions]}
def retrieve_bookkeeping_for_epv(self, ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    p = Package.by_name(self.db, package)
    v = self.db.query(Version).join(Package).join(Ecosystem). \
        filter(Package.ecosystem == e). \
        filter(Version.package == p). \
        filter(Version.identifier == version).one()

    stat = self.db.query(WorkerResult).\
        join(Analysis).join(Version).\
        filter(Analysis.version == v)
    worker_stats = []
    for worker_result in stat.all():
        entry = {"worker_name": worker_result.worker,
                 "has_error": worker_result.error,
                 "task_result": worker_result.task_result,
                 "started_at": worker_result.started_at,
                 "ended_at": worker_result.ended_at}
        worker_stats.append(entry)

    return {"ecosystem": e.name,
            "package": p.name,
            "version": v.identifier,
            "workers": worker_stats}
def iter_unknown_dependencies(storage_pool, node_args):
    """Collect unknown dependencies."""
    # Be safe here as fatal errors will cause errors in Dispatcher
    try:
        aggregated = storage_pool.get('UnknownDependencyFetcherTask')
        arguments = []
        for element in aggregated["result"]:
            epv = element.split(':')
            ecosystem = epv[0]
            if Ecosystem.by_name(
                    StoragePool.get_connected_storage(
                        'BayesianPostgres').session,
                    ecosystem).is_backed_by(EcosystemBackend.maven):
                name = '{}:{}'.format(epv[1], epv[2])
                version = epv[3]
            else:
                name = epv[1]
                version = epv[2]
            analysis_arguments = _create_analysis_arguments(
                ecosystem, name, version)
            # TODO: Remove force=True once data-importer is smart enough
            # to ingest missing packages from s3.
            analysis_arguments.update({"recursive_limit": 0, "force": True})
            arguments.append(analysis_arguments)
        print('Arguments appended: %s' %
              ', '.join(str(item) for item in arguments))
        logger.info("Arguments for next flows: %s" % str(arguments))
        return arguments
    except Exception as e:
        logger.exception(
            "Failed to collect unknown dependencies due to {}".format(e))
        return []
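# Illustrative sketch, not part of the original module: shows how the
# colon-separated EPV strings aggregated by UnknownDependencyFetcherTask are
# expected to split before the analysis arguments are built. The sample EPV
# strings and the helper name below are assumptions for demonstration only.
def _split_epv_example():
    """Demonstrate the EPV splitting convention used by iter_unknown_dependencies."""
    for element, backed_by_maven in [("npm:lodash:4.17.11", False),
                                     ("maven:io.vertx:vertx-core:3.5.1", True)]:
        epv = element.split(':')
        ecosystem = epv[0]
        if backed_by_maven:
            # maven EPVs carry four fields: ecosystem:groupId:artifactId:version,
            # so group and artifact are re-joined into a single name
            name, version = '{}:{}'.format(epv[1], epv[2]), epv[3]
        else:
            name, version = epv[1], epv[2]
        print(ecosystem, name, version)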
def retrieve_bookkeeping_for_ecosystem(ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        package_count = _count(
            db, db.query(Package).filter(Package.ecosystem == e))
        pv_count = _count(
            db, db.query(Version).join(Package).filter(Package.ecosystem == e))
        result = {
            "summary": {
                "ecosystem": e.name,
                "package_count": package_count,
                "package_version_count": pv_count
            }
        }
    except NoResultFound as e:
        result = {"error": "No such ecosystem: %s" % ecosystem}
    except SQLAlchemyError as e:
        result = {
            "error": "Error encountered while fetching data. Please check logs."
        }
    return result
def execute(self, arguments):
    """Task to mark vulnerable packages in graph.

    :param arguments: dictionary with task arguments
    :return: None
    """
    self._strict_assert(arguments.get('ecosystem'))

    wanted_cves = set(arguments.get('cve_filter', []))
    victims_cls = VictimsDB if not wanted_cves else FilteredVictimsDB

    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

    with victims_cls.build_from_git(wanted=wanted_cves) as db:
        self.log.info('Storing the VictimsDB zip on S3')
        db.store_on_s3()

        vulnerable_packages = self.get_vulnerable_packages(db, ecosystem)

        self.create_in_graph(vulnerable_packages, ecosystem)
        self.mark_in_graph(vulnerable_packages, ecosystem)
        self.notify_gemini(vulnerable_packages, ecosystem)
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))

    rdb_session = StoragePool.get_connected_storage('BayesianPostgres').session
    name = arguments['name']
    ecosystem = arguments['ecosystem']
    if ecosystem == 'go':
        name = quote(name, safe='')

    project_url = self.configuration.libraries_io_project_url(
        Ecosystem.by_name(rdb_session, ecosystem), name)
    project = get_response(project_url)
    versions = project['versions']
    details = {
        'dependent_repositories': {
            'count': project['dependent_repos_count']
        },
        'dependents': {
            'count': project['dependents_count']
        },
        'releases': {
            'count': len(versions),
            'recent': self.recent_releases(versions)
        }
    }
    return {'status': 'success', 'summary': [], 'details': details}
def _create_analysis_arguments(ecosystem, name, version):
    """Create arguments for analysis."""
    return {
        'ecosystem': ecosystem,
        'name': MavenCoordinates.normalize_str(name) if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(EcosystemBackend.maven) else name,
        'version': version
    }
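# Illustrative sketch, not part of the original module: expected shape of the
# dictionary returned by _create_analysis_arguments. The EPVs below are
# assumptions for demonstration; maven names go through
# MavenCoordinates.normalize_str(), other ecosystems keep the name untouched.
#
#   _create_analysis_arguments('npm', 'serve-static', '1.7.1')
#       -> {'ecosystem': 'npm', 'name': 'serve-static', 'version': '1.7.1'}
#   _create_analysis_arguments('maven', 'io.vertx:vertx-core', '3.5.1')
#       -> {'ecosystem': 'maven', 'name': 'io.vertx:vertx-core', 'version': '3.5.1'}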
def retrieve_bookkeeping_for_ecosystem(self, ecosystem):
    """Retrieve BookKeeping data for given Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    """
    e = Ecosystem.by_name(self.db, ecosystem)
    package_count = self.db.query(Package).\
        filter(Package.ecosystem == e).count()
    pv_count = self.db.query(Version).join(Package).\
        filter(Package.ecosystem == e).count()
    return {"ecosystem": e.name,
            "package_count": package_count,
            "package_version_count": pv_count}
def normalize_package_name(ecosystem, name):
    """Normalize package name based on ecosystem."""
    normalized_name = name

    if Ecosystem.by_name(
            StoragePool.get_connected_storage('BayesianPostgres').session,
            ecosystem).is_backed_by(EcosystemBackend.pypi):
        normalized_name = case_sensitivity_transform(ecosystem, name)
    elif ecosystem == 'go':
        # go package name is the host+path part of a URL, thus it can be URL encoded
        normalized_name = unquote(name)

    return normalized_name
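# Illustrative sketch, not part of the original module: expected normalization
# behaviour, assuming the 'pypi' ecosystem is backed by EcosystemBackend.pypi.
# The package names are examples only.
#
#   normalize_package_name('pypi', 'SQLAlchemy')               -> 'sqlalchemy'
#   normalize_package_name('go', 'github.com%2Fgorilla%2Fmux') -> 'github.com/gorilla/mux'
#   normalize_package_name('npm', 'lodash')                    -> 'lodash' (unchanged)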
def _normalize_package_name(self, node_args):
    """Normalize package name in node arguments."""
    if not node_args:
        return

    if 'name' in node_args and 'ecosystem' in node_args:
        ecosystem = Ecosystem.by_name(self.postgres.session,
                                      node_args['ecosystem'])
        node_args['name'] = normalize_package_name(
            ecosystem_backend=ecosystem.backend.name,
            name=node_args['name'])
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)
    arguments.pop('document_id', None)

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    package = Package.get_or_create(db,
                                    ecosystem_id=ecosystem.id,
                                    name=arguments['name'])
    url = self.get_upstream_url(arguments)
    upstream = self.get_upstream_entry(package, url)
    if upstream is None:
        upstream = self.add_or_update_upstream(package, url)
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # can potentially schedule two flows of a same type at the same
        # time as there is no lock, but let's say it's OK
        if upstream.updated_at is not None \
                and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info(
                'Skipping upstream package check as data are considered as recent - '
                'last update %s.', upstream.updated_at)
            # keep track of start, but do not schedule anything more
            # discard changes like updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(
        package_id=package.id,
        started_at=datetime.datetime.utcnow(),
        finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.utcnow()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def retrieve_bookkeeping_for_ecosystem_package(ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        version_count = _count(
            db, db.query(Version).join(Package).filter(
                Package.ecosystem == e).filter(Version.package == p))
        stat = db.query(PackageWorkerResult.worker,
                        PackageWorkerResult.error,
                        PackageWorkerResult.task_result).join(PackageAnalysis). \
            filter(PackageAnalysis.package == p). \
            all()
        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)
        p_versions = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p)
        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]
            }
        }
    except NoResultFound as e:
        result = {"error": "No such package: %s/%s" % (ecosystem, package)}
    except SQLAlchemyError as e:
        result = {
            "error": "Error encountered while fetching data. Please check logs."
        }
    return result
def get_sources(self):
    """Get path to source files.

    :return: path to source files
    """
    if not self._eco_obj:
        self._eco_obj = Ecosystem.by_name(self._postgres.session,
                                          self.ecosystem)

    if self._eco_obj.is_backed_by(EcosystemBackend.maven):
        return self.get_extracted_source_jar()
    else:
        return self.get_extracted_source_tarball()
def case_sensitivity_transform(ecosystem, name):
    """Transform package name to lowercase for ecosystems that are not case sensitive.

    :param ecosystem: name of the ecosystem the package sits in
    :param name: name of the package
    :return: transformed package name based on ecosystem package case sensitivity
    """
    if Ecosystem.by_name(StoragePool.get_connected_storage('BayesianPostgres').session,
                         ecosystem).is_backed_by(EcosystemBackend.pypi):
        return name.lower()

    return name
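# Illustrative usage, not part of the original module: PyPI is treated as
# case-insensitive, so names are lowercased; other ecosystems pass through.
#
#   case_sensitivity_transform('pypi', 'SQLAlchemy') -> 'sqlalchemy'
#   case_sensitivity_transform('npm', 'JSONStream')  -> 'JSONStream'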
def has_sources(self):
    """Test if the given EPV has available sources.

    :return: true if the given EPV has available sources
    """
    if not self._eco_obj:
        self._eco_obj = Ecosystem.by_name(self._postgres.session,
                                          self.ecosystem)

    if self._eco_obj.is_backed_by(EcosystemBackend.maven):
        return self._s3.object_exists(self._source_jar_object_key)
    else:
        self._construct_source_tarball_names()
        return self._s3.object_exists(self._source_tarball_object_key)
def retrieve_bookkeeping_for_epv(ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        v = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()
        stat = db.query(WorkerResult.worker,
                        WorkerResult.error,
                        WorkerResult.task_result). \
            join(Analysis).join(Version). \
            filter(Analysis.version == v).all()
        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)
        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats
            }
        }
    except NoResultFound as e:
        return {
            "error": "No such version: %s/%s/%s" % (ecosystem, package, version)
        }
    except SQLAlchemyError as e:
        result = {
            "error": "Error encountered while fetching data. Please check logs."
        }
    return result
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    if not arguments.get('force_graph_sync'):
        self._strict_assert(arguments.get('document_id'))

    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    ecosystem_backend = Ecosystem.by_name(
        rdb.session, arguments.get('ecosystem')).backend.name
    package_list = [{
        'ecosystem': ecosystem_backend,
        'name': arguments['name'],
        'version': arguments.get('version'),
        'source_repo': arguments.get('ecosystem')
    }]

    # If we force graph sync, sync all task results, otherwise only those
    # finished in this analysis run
    if not arguments.get('force_graph_sync'):
        # Tasks that need sync to graph start lowercase.
        param = {
            'select_ingest': [
                task_name
                for task_name in self.storage.get_finished_task_names(
                    arguments['document_id'])
                if task_name[0].islower()
            ],
            'package_list': package_list
        }
        endpoint = self._SELECTIVE_API_URL
    else:
        param = package_list
        endpoint = self._INGEST_API_URL

    self.log.info("Invoke graph importer at url: '%s' for %s", endpoint, param)
    response = requests.post(endpoint, json=param)

    if response.status_code != 200:
        raise RuntimeError("Failed to invoke graph import at '%s' for %s" %
                           (endpoint, param))

    self.log.info("Graph import succeeded with response: %s", response.text)
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

    if arguments['ecosystem'] in ('maven', 'pypi', 'npm'):
        return self._victims_scan(arguments, ecosystem)
    elif arguments['ecosystem'] == 'nuget':
        return self._nuget_scan(arguments)
    else:
        raise RequestError('Unsupported ecosystem')
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('ecosystem'))

    # get rid of version if scheduled from the core analyses
    arguments.pop('version', None)

    db = self.storage.session
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    package = Package.get_or_create(db,
                                    ecosystem_id=ecosystem.id,
                                    name=arguments['name'])
    upstream = self.get_upstream_entry(db, package,
                                       self.get_upstream_url(arguments))
    arguments['url'] = upstream.url

    if not arguments.get('force'):
        # can potentially schedule two flows of a same type at the same
        # time as there is no lock, but let's say it's OK
        if upstream.updated_at is not None \
                and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
            self.log.info(
                'Skipping upstream package check as data are considered as recent - '
                'last update %s.', upstream.updated_at)
            # keep track of start, but do not schedule anything more
            # discard changes like updates
            db.rollback()
            return arguments

    # if this fails, it's actually OK, as there could be concurrency
    package_analysis = PackageAnalysis(package_id=package.id,
                                       started_at=datetime.datetime.now(),
                                       finished_at=None)
    db.add(package_analysis)

    # keep track of updates
    upstream.updated_at = datetime.datetime.now()

    db.commit()
    arguments['document_id'] = package_analysis.id
    return arguments
def get_analysis_count(ecosystem, package):
    """Get count of previously scheduled analyses for given ecosystem-package.

    :param ecosystem: str, Ecosystem name
    :param package: str, Package name
    :return: analysis count
    """
    if Ecosystem.by_name(PostgresBase.session,
                         ecosystem).is_backed_by(EcosystemBackend.maven):
        package = MavenCoordinates.normalize_str(package)

    try:
        count = PostgresBase.session.query(PackageAnalysis).\
            join(Package).join(Ecosystem).\
            filter(Ecosystem.name == ecosystem).\
            filter(Package.name == package).\
            count()
    except SQLAlchemyError:
        PostgresBase.session.rollback()
        raise

    return count
def post():
    """Handle the POST REST API call."""
    input_json = request.get_json()

    if not request.json:
        raise HTTPError(400, error="Expected JSON request")

    stack_id = input_json.get('stack_id')
    recommendation_type = input_json.get('recommendation_type')
    package_name = input_json.get('package_name')
    feedback_type = input_json.get('feedback_type')
    ecosystem_name = input_json.get('ecosystem')
    conditions = [
        is_valid(stack_id),
        is_valid(recommendation_type),
        is_valid(package_name),
        is_valid(feedback_type),
        is_valid(ecosystem_name)
    ]
    if not all(conditions):
        raise HTTPError(400, error="Expected parameters missing")

    # Insert in a single commit. Gains - a) performance, b) avoid insert
    # inconsistencies for a single request
    try:
        ecosystem_obj = Ecosystem.by_name(rdb.session, name=ecosystem_name)
        req = RecommendationFeedback(
            stack_id=stack_id,
            package_name=package_name,
            recommendation_type=recommendation_type,
            feedback_type=feedback_type,
            ecosystem_id=ecosystem_obj.id)
        rdb.session.add(req)
        rdb.session.commit()
        return {'status': 'success'}
    except SQLAlchemyError as e:
        # TODO: please log the actual error too here
        logger.exception('Failed to create new analysis request')
        raise HTTPError(
            500, "Error inserting log for request {t}".format(
                t=stack_id)) from e
def get_ecosystem(self, name):
    """Get ecosystem by name."""
    if not self.is_connected():
        self.connect()

    return Ecosystem.by_name(PostgresBase.session, name)
def execute(self, arguments, db, manifests, source=None):
    """Dependency finder logic."""
    # TODO: reduce cyclomatic complexity
    # If we receive a manifest file we need to save it first
    result = []
    for manifest in manifests:
        content_hash = None
        if source == 'osio':
            content_hash = generate_content_hash(manifest['content'])
            current_app.logger.info("{} file digest is {}".format(
                manifest['filename'], content_hash))

            s3 = AmazonS3(bucket_name='boosters-manifest')
            try:
                s3.connect()
                manifest['content'] = s3.retrieve_blob(content_hash).decode('utf-8')
            except ClientError as e:
                current_app.logger.error(
                    "Unexpected error while retrieving S3 data: %s" % e)
                raise

        with TemporaryDirectory() as temp_path:
            with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # Create instance manually since stack analysis is not handled by dispatcher
            subtask = MercatorTask.create_test_instance(task_name='metadata')
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path, resolve_poms=False)

        if not out["details"]:
            raise FatalTaskError("No metadata found processing manifest file '{}'"
                                 .format(manifest['filename']))

        if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
            raise FatalTaskError("Dependencies could not be resolved from manifest file '{}'"
                                 .format(manifest['filename']))

        out["details"][0]['manifest_file'] = manifest['filename']
        out["details"][0]['ecosystem'] = manifest['ecosystem']
        out["details"][0]['manifest_file_path'] = manifest.get('filepath',
                                                               'File path not available')

        # If we're handling an external request we need to convert dependency specifications
        # to concrete versions that we can query later on in the `AggregatorTask`
        manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
        if 'external_request_id' in arguments:
            manifest_dependencies = []
            if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                    if 'dependencies' in out['details'][0]["_dependency_tree_lock"]:
                        manifest_dependencies = out["details"][0]["_dependency_tree_lock"].get(
                            "dependencies", [])
                else:  # pom.xml
                    if 'dependencies' in out['details'][0]:
                        manifest_dependencies = out["details"][0].get("dependencies", [])

                if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                    def _flatten(deps, collect):
                        for dep in deps:
                            collect.append({'package': dep['name'],
                                            'version': dep['version']})
                            _flatten(dep['dependencies'], collect)

                    resolved_deps = []
                    _flatten(manifest_dependencies, resolved_deps)
                else:  # pom.xml
                    resolved_deps = \
                        [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                         for x in manifest_dependencies]
            else:  # package.json, requirements.txt
                try:
                    resolved_deps = self._handle_external_deps(
                        Ecosystem.by_name(db, arguments['ecosystem']),
                        out["details"][0]["dependencies"])
                except Exception:
                    raise

            out["details"][0]['_resolved'] = resolved_deps
        result.append(out)

    return {'result': result}
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))
    self._strict_assert(arguments.get('ecosystem'))

    # make sure we store package name based on ecosystem package naming case sensitivity
    arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
    v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we will scale and there will be
        # 2+ workers running this task they can potentially schedule two
        # flows of a same type at the same time
        if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
            # we need to propagate flags that were passed to flow, but not
            # E/P/V - this way we are sure that for example graph import is
            # scheduled (arguments['force_graph_sync'] == True)
            arguments.pop('name')
            arguments.pop('version')
            arguments.pop('ecosystem')
            self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                           .format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path
            )
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                        format(n=arguments.get('name'),
                               v=arguments.get('version'),
                               err=str(e))
                    )

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)

    a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id
    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self.log.debug("Input Arguments: {}".format(arguments))
    self._strict_assert(isinstance(arguments.get('ecosystem'), str))
    self._strict_assert(isinstance(arguments.get('name'), str))
    self._strict_assert(isinstance(arguments.get('version'), str))

    db = self.storage.session
    try:
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
    except NoResultFound:
        raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

    # make sure we store package name in its normalized form
    arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                               arguments['name'])

    if len(pattern_ignore.findall(arguments['version'])) > 0:
        self.log.info("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))
        raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
            arguments['name'], arguments['version']))

    # Don't try ingestion for private packages
    if is_pkg_public(arguments['ecosystem'], arguments['name']):
        self.log.info("Ingestion flow for {} {}".format(
            arguments['ecosystem'], arguments['name']))
    else:
        self.log.info("Private package ingestion ignored {} {}".format(
            arguments['ecosystem'], arguments['name']))
        raise NotABugFatalTaskError("Private package alert {} {}".format(
            arguments['ecosystem'], arguments['name']))

    p = Package.get_or_create(db,
                              ecosystem_id=ecosystem.id,
                              name=arguments['name'])
    v = Version.get_or_create(db,
                              package_id=p.id,
                              identifier=arguments['version'])

    if not arguments.get('force'):
        if db.query(Analysis).filter(
                Analysis.version_id == v.id).count() > 0:
            arguments['analysis_already_exists'] = True
            self.log.debug(
                "Arguments returned by initAnalysisFlow without force: {}".
                format(arguments))
            return arguments

    cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
    epv_cache = ObjectCache.get_from_dict(arguments)
    npm_dir = self.configuration.NPM_DATA_DIR

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                try:
                    source_jar_path = self._download_source_jar(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as e:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                        .format(n=arguments.get('name'),
                                v=arguments.get('version'),
                                err=str(e)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(
                    cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)
        if arguments['ecosystem'] == "npm":
            shutil.rmtree(npm_dir, True)

    a = Analysis(version=v,
                 access_count=1,
                 started_at=datetime.datetime.utcnow())
    db.add(a)
    db.commit()

    arguments['document_id'] = a.id
    # export ecosystem backend so we can use it to easily control flow later
    arguments['ecosystem_backend'] = ecosystem.backend.name

    self.log.debug(
        "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
    return arguments
def execute(self, arguments):
    """Task code.

    :param arguments: dictionary with task arguments
    :return: {}, results
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    tool_responses = {}
    result_summary = {
        'package_names': [],
        'registered_srpms': [],
        'all_rhn_channels': [],
        'all_rhsm_content_sets': [],
        'all_rhsm_product_names': []
    }
    result_data = {'status': 'error',
                   'summary': result_summary,
                   'details': tool_responses
                   }

    # bail out early; we need access to internal services or the package is
    # from Maven ecosystem, otherwise we can't comment on downstream usage
    is_maven = Ecosystem.by_name(self.storage.session,
                                 eco).is_backed_by(EcosystemBackend.maven)
    if not self._is_inside_rh() and not is_maven:
        return result_data

    self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
    res = self._fetch_anitya_project(eco, pkg)
    anitya_rpm_names = []
    anitya_mvn_names = []
    if res is None:
        result_data['status'] = 'error'
    elif res.status_code == 200:
        self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
        anitya_response = res.json()
        tool_responses['redhat_anitya'] = anitya_response
        # For now, we assume all downstreams are ones we care about
        for entry in anitya_response['packages']:
            if entry['distro'] == RH_RPM_DISTRO_NAME:
                anitya_rpm_names.append(entry['package_name'])
            elif entry['distro'] == RH_MVN_DISTRO_NAME:
                anitya_mvn_names.append(entry['package_name'])
            else:
                self.log.warning(
                    'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                    format(d=entry['distro'], o=entry['package_name'], p=pkg)
                )
        self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
        self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
        # TODO: Report 'partial' here and switch to 'success' at the end
        result_data['status'] = 'success'
    else:
        msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
        self.log.error(msg.format(e=eco, p=pkg, r=res.text))
        result_data['status'] = 'error'

    if self._is_inside_rh():
        # we have candidate downstream name mappings, check them against Brew
        seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
        self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

        args = ['brew-utils-cli', '--version', arguments['version']]
        artifact_hash = self._get_artifact_hash(algorithm='sha256')
        if artifact_hash:
            args += ['--digest', artifact_hash]
        args += seed_names

        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args))
        tc = TimedCommand(args)
        status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
        self.log.debug("status = %s, error = %s", status, error)
        output = ''.join(output)
        self.log.debug("output = %s", output)
        if not output:
            raise TaskError("Error running command %s" % args)
        brew = json.loads(output)

        result_summary['package_names'] = brew['packages']
        result_summary['registered_srpms'] = brew['response']['registered_srpms']
        tool_responses['brew'] = brew['response']['brew']

        # we have SRPM details, fetch details on where the RPMs are shipped
        tool_responses['pulp_cdn'] = pulp_responses = []
        rhn_channels = set()
        rhsm_content_sets = set()
        rhsm_product_names = set()

        for srpm_summary in result_summary['registered_srpms']:
            srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                         v=srpm_summary['version'],
                                                         r=srpm_summary['release'])
            cdn_metadata = self._get_cdn_metadata(srpm_filename)
            if cdn_metadata is None:
                msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                continue
            pulp_responses.append(cdn_metadata)

            srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
            rhn_channels.update(cdn_metadata['rhn_channels'])
            rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
            rhsm_product_names.update(cdn_metadata['rhsm_product_names'])

        result_summary['all_rhn_channels'] = sorted(rhn_channels)
        result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
        result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

    self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

    return result_data
def components_to_scan(self, previous_sync_timestamp, only_already_scanned):
    """Get EPVs that were recently updated in OSS Index, so they can contain new vulnerabilities.

    Get components (e:p:v) that were recently (since previous_sync_timestamp)
    updated in OSS Index, which means that they can contain new vulnerabilities.

    :param previous_sync_timestamp: timestamp of previous check
    :param only_already_scanned: include already scanned components only
    :return: list of e:p:v dicts to scan
    """
    # TODO: reduce cyclomatic complexity
    to_scan = []
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    for ecosystem in ['nuget']:
        ecosystem_solver = get_ecosystem_solver(
            self.storage.get_ecosystem(ecosystem),
            with_parser=OSSIndexDependencyParser())
        self.log.debug("Retrieving new %s vulnerabilities from OSS Index", ecosystem)
        ossindex_updated_packages = CVEcheckerTask.\
            query_ossindex_vulnerability_fromtill(ecosystem=ecosystem,
                                                  from_time=previous_sync_timestamp)
        for ossindex_updated_package in ossindex_updated_packages:
            if Ecosystem.by_name(rdb.session, ecosystem).is_backed_by(
                    EcosystemBackend.maven):
                package_name = "{g}:{n}".format(
                    g=ossindex_updated_package['group'],
                    n=ossindex_updated_package['name'])
            else:
                package_name = ossindex_updated_package['name']

            package_affected_versions = set()
            for vulnerability in ossindex_updated_package.get('vulnerabilities', []):
                for version_string in vulnerability.get('versions', []):
                    try:
                        resolved_versions = ecosystem_solver.\
                            solve(["{} {}".format(package_name, version_string)],
                                  all_versions=True)
                    except Exception:
                        self.log.exception("Failed to resolve %r for %s:%s",
                                           version_string, ecosystem, package_name)
                        continue
                    resolved_versions = resolved_versions.get(package_name, [])
                    if only_already_scanned:
                        already_scanned_versions = \
                            [ver for ver in resolved_versions
                             if self.storage.get_analysis_count(ecosystem,
                                                                package_name,
                                                                ver) > 0]
                        package_affected_versions.update(already_scanned_versions)
                    else:
                        package_affected_versions.update(resolved_versions)

            for version in package_affected_versions:
                to_scan.append({
                    'ecosystem': ecosystem,
                    'name': package_name,
                    'version': version
                })

    msg = "Components to be {prefix}scanned for vulnerabilities: {components}".\
        format(prefix="re-" if only_already_scanned else "",
               components=to_scan)
    self.log.info(msg)
    return to_scan