def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) eco = arguments['ecosystem'] pkg = arguments['name'] ver = arguments['version'] try: cache_path = ObjectCache.get_from_dict(arguments).get_sources() except Exception: if not Ecosystem.by_name( StoragePool.get_connected_storage('BayesianPostgres'). session, eco).is_backed_by(EcosystemBackend.maven): self.log.error( 'Could not get sources for package {e}/{p}/{v}'.format( e=eco, p=pkg, v=ver)) raise self.log.info('Could not get sources for maven package {p}/{v},' 'will try to run on binary jar'.format(p=pkg, v=ver)) cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() result_data = self.run_scancode(cache_path) return result_data
def execute(self, arguments): """Execute mercator and convert it's output to JSON object.""" self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by( EcosystemBackend.maven): # cache_path now points directly to the pom cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml() else: cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() return self.run_mercator(arguments, cache_path)
def execute(self, arguments): """Run oscryptocatcher tool for matching crypto algorithms.""" self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() results = {'status': 'unknown', 'summary': {}, 'details': []} try: oscc = TimedCommand.get_command_output( ['oscryptocatcher', '--subdir-in-result', cache_path], graceful=False, is_json=True) self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc) results['details'] = oscc['details'] results['summary'] = oscc['summary'] results['status'] = 'success' except Exception: raise FatalTaskError('oscryptocatcher failed') return results
def _python_scan(self, arguments):
    """Run OWASP dependency-check experimental analyzer for Python artifacts.

    https://jeremylong.github.io/DependencyCheck/analyzers/python.html
    """
    extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    # depcheck needs to be pointed to a specific file, we can't just scan whole directory
    egg_info = pkg_info = metadata = None
    for root, _, files in os.walk(extracted_tarball):
        if root.endswith('.egg-info') or root.endswith('.dist-info'):
            egg_info = root
        if 'PKG-INFO' in files:
            pkg_info = os.path.join(root, 'PKG-INFO')
        if 'METADATA' in files:
            metadata = os.path.join(root, 'METADATA')

    scan_path = egg_info or pkg_info or metadata

    if pkg_info and not egg_info:
        # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
        # https://github.com/jeremylong/DependencyCheck/issues/896
        egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
        try:
            os.mkdir(egg_info_dir)
            copy(pkg_info, egg_info_dir)
            scan_path = egg_info_dir
        except os.error:
            self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

    if not scan_path:
        raise FatalTaskError('File types not supported by OWASP dependency-check')

    return self._run_owasp_dep_check(scan_path, experimental=True)
def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball() results = [] for path in get_all_files_from(cache_path, path_filter=skip_git_files): self.log.debug("path = %s", path) bw = TimedCommand(['binwalk', '-B', path]) status, output, error = bw.run(timeout=60) self.log.debug("status = %s, error = %s", status, error) self.log.debug("output = %s", output) parsed_binwalk = self.parse_binwalk(output) results.append({ "path": os.path.relpath(path, cache_path), "output": parsed_binwalk, }) return {'summary': [], 'status': 'success', 'details': results}
def execute(self, arguments):
    """Detect file types and run linguist language analysis on extracted sources."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    results = []
    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    def worker(path):
        mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
        self.log.debug("%s mime = %s", path, mime)

        typ = TimedCommand.get_command_output(['file', path, '-b'])
        self.log.debug("%s filetype = %s", path, typ)

        linguist = None
        if 'charset=binary' not in mime:
            linguist = self._parse_linguist(
                TimedCommand.get_command_output(['linguist', path])
            )
            self.log.debug("%s linguist output = %s", path, linguist)

        results.append({
            "type": typ,
            "output": linguist,
            "path": os.path.relpath(path, cache_path),
        })

    with ThreadPool(target=worker) as tp:
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            tp.add_task(path)

    return {'summary': [], 'status': 'success', 'details': results}
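# get_all_files_from() and its skip_git_files path filter are defined elsewhere.
# A minimal sketch of a compatible filter, under the assumption that the filter
# receives a file path and returns True for paths that should be kept; the exact
# contract is inferred from how the filter is used above.
import os


def skip_git_files(path):
    """Keep every path that does not live under a .git directory."""
    return '.git' not in path.split(os.sep)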
def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) source_path = ObjectCache.get_from_dict(arguments).get_sources() header, language_stats = self._get_generic_result(source_path) for language in language_stats.keys(): for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []): metrics_data = handler(self, source_path) if not metrics_data: continue if 'metrics' not in language_stats[language]: language_stats[language]['metrics'] = {} language_stats[language]['metrics'].update(metrics_data) # we don't want to have possibly unique keys and we want to avoid # enumerating all languages that are supported by cloc - convert a dict # to a list of language-specific entries result = {'languages': []} for language in language_stats.keys(): record = language_stats.get(language) record['language'] = language result['languages'].append(record) return {'summary': header, 'status': 'success', 'details': result}
def run(self, node_args): """To be transparently called by Selinon. Selinon transparently calls run(), which takes care of task audit and some additional checks and calls execute(). """ # SQS guarantees 'deliver at least once', so there could be multiple # messages of a type, give up immediately if self.storage and isinstance(self.storage, (BayesianPostgres, PackagePostgres)): if self.storage.get_worker_id_count(self.task_id) > 0: raise TaskAlreadyExistsError("Task with ID '%s'" " was already processed" % self.task_id) start = datetime.utcnow() try: result = self.execute(node_args) finally: # remove all files that were downloaded for this task ObjectCache.wipe() end = datetime.utcnow() if result: # Ensure result complies with the defined schema (if any) before saving self.validate_result(result) if result is None: # Keep track of None results and add _audit and _release keys result = {} if self.add_audit_info: # `_audit` key is added to every analysis info submitted result['_audit'] = { 'started_at': json_serial(start), 'ended_at': json_serial(end), 'version': 'v1' } ecosystem_name = node_args.get('ecosystem') result['_release'] = '{}:{}:{}'.format(ecosystem_name, node_args.get('name'), node_args.get('version')) return result
def execute(self, arguments): """Execute mercator and convert it's output to JSON object.""" self._strict_assert(arguments.get('ecosystem')) if 'url' in arguments: # run mercator on a git repo return self.run_mercator_on_git_repo(arguments) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) # TODO: make this even uglier; looks like we didn't get the abstraction quite right # when we were adding support for Java/Maven. if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by( EcosystemBackend.maven): # cache_path now points directly to the pom cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml() else: cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() return self.run_mercator(arguments, cache_path)
def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    eco = arguments['ecosystem']
    pkg = arguments['name']
    ver = arguments['version']

    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        if arguments['ecosystem'] != 'maven':
            self.log.error('Could not get sources for package {e}/{p}/{v}'.format(
                e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = self.run_scancode(cache_path)
    return result_data
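# run_scancode() invoked above is not shown. A minimal sketch of how the
# ScanCode CLI could be driven through TimedCommand; the chosen flags and the
# reliance on is_json=True parsing stdout are this sketch's assumptions, not
# the project's confirmed invocation.
def run_scancode(self, scan_path):
    """Run the scancode CLI on scan_path and return its parsed JSON output."""
    # --copyright and --license are standard scancode-toolkit options;
    # '-' asks scancode to write the JSON report to stdout
    return TimedCommand.get_command_output(
        ['scancode', '--copyright', '--license', '--json-pp', '-', scan_path],
        graceful=False, is_json=True)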
def _maven_scan(self, arguments):
    """Run OWASP dependency-check & Victims CVE DB CLI."""
    jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

    results = self._run_owasp_dep_check(jar_path, experimental=False)
    if results.get('status') != 'success':
        return results

    # merge with Victims CVE DB results
    victims_cve_db_results = self._run_victims_cve_db_cli(arguments)
    for vulnerability in victims_cve_db_results:
        vulnerability = self._filter_victims_db_entry(vulnerability)
        if not vulnerability:
            continue
        if vulnerability['id'] not in results['summary']:
            results['summary'].append(vulnerability['id'])
            results['details'].append(vulnerability)

    return results
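# _filter_victims_db_entry() used above is not shown. A hypothetical sketch of
# the kind of normalization it could perform, assuming a Victims CVE DB record
# carries a CVE identifier plus metadata; every field name below is
# illustrative only, not the real database schema.
def _filter_victims_db_entry(self, entry):
    """Return a trimmed vulnerability record, or None if the entry lacks a CVE id."""
    if not entry.get('cve'):
        return None
    return {
        'id': 'CVE-' + entry['cve'],
        'description': entry.get('description', ''),
        'references': entry.get('references', []),
    }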
def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    result_data = {'status': 'unknown', 'summary': [], 'details': {}}
    if self._is_valid_ecosystem(arguments['ecosystem']):
        hub = self._get_hub()

        # BlackDuck has no notion of ecosystem, so we need to namespace the
        # project names ourselves; for package `crumb` in the npm ecosystem
        # we end up with the project name `npm-crumb`.
        project = self._get_project_name(arguments)
        version = arguments['version']

        # check whether the given project has already been scanned
        data = self._release_data(hub, project, version)

        if not data and self._allow_cli_scan:
            self.log.debug("No data available for project {p} {v}".format(p=project, v=version))
            # no data available, issue a new scan and re-query the release data
            source_tarball_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
            command = self._prepare_command(project, version, source_tarball_path)
            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))

            bd = TimedCommand(command)
            status, output, error = bd.run(
                timeout=self._BLACKDUCK_CLI_TIMEOUT,
                update_env={'BD_HUB_PASSWORD': self.configuration.BLACKDUCK_PASSWORD})
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            data = self._release_data(hub, project, version)

        self.log.debug("Release data for project {p} {v}: {d}".format(p=project, v=version, d=data))
        result_data['details'] = data
        result_data['status'] = 'success' if data else 'error'
    else:
        result_data['status'] = 'error'

    return result_data
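# _get_project_name() used above is not shown. Based on the namespacing comment
# (`crumb` in the npm ecosystem becomes `npm-crumb`), a minimal sketch could
# simply join ecosystem and package name; treat the exact format as an assumption.
def _get_project_name(self, arguments):
    """Namespace the BlackDuck project name with the ecosystem, e.g. 'npm-crumb'."""
    return '{}-{}'.format(arguments['ecosystem'], arguments['name'])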
def execute(self, arguments): """ task code :param arguments: dictionary with task arguments :return: {}, results """ self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) result_data = {'status': 'unknown', 'summary': [], 'details': []} source_tarball_path = ObjectCache.get_from_dict( arguments).get_source_tarball() sa = StaticAnalysis(source_tarball_path) try: analysis_result = sa.analyze() # make output reproducible - scanning the same # input multiple times should always produce # the same output del analysis_result["scan"]["time-created"] del analysis_result["scan"]["time-finished"] del analysis_result["scan"]["host"] del analysis_result["scan"]["store-results-to"] stats = {} for defect in analysis_result["defects"]: stats.setdefault(defect["checker"], {"count": 0}) stats[defect["checker"]]["count"] += 1 try: stats[defect["checker"]]["cwe"] = defect["cwe"] except KeyError: pass result_data['summary'] = stats result_data['status'] = 'success' result_data['details'] = analysis_result except Exception as ex: self.log.error("static analysis was not successful: %r", ex) result_data['status'] = 'error' return result_data
def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    epv_cache = ObjectCache.get_from_dict(arguments)
    # cache_path = epv_cache.get_extracted_source_tarball()

    results = []
    # We don't compute digests of files in the extracted tarball, only of the tarball itself
    # for f in get_all_files_from(cache_path, path_filter=skip_git_files):
    #     results.append(self.compute_digests(cache_path, f))

    source_tarball_path = epv_cache.get_source_tarball()
    # compute digests of the tarball and mark it as an artifact
    results.append(self.compute_digests(None, source_tarball_path, artifact=True))

    return {'summary': [], 'status': 'success', 'details': results}
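# compute_digests() used above is defined elsewhere. A minimal sketch of a
# compatible implementation based on hashlib, shown as a standalone function
# matching the (base_path, path, artifact) call above; the chosen algorithms
# and result keys are assumptions, not the project's confirmed output schema.
import hashlib
import os


def compute_digests(base_path, path, artifact=False):
    """Return md5/sha1/sha256 digests of a file, flagging package artifacts."""
    hashers = {name: hashlib.new(name) for name in ('md5', 'sha1', 'sha256')}
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            for h in hashers.values():
                h.update(chunk)
    result = {name: h.hexdigest() for name, h in hashers.items()}
    result['path'] = path if base_path is None else os.path.relpath(path, base_path)
    result['artifact'] = artifact
    return result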
def run(self, node_args): """To be transparently called by Selinon. Selinon transparently calls run(), which takes care of task audit and some additional checks and calls execute(). """ # SQS guarantees 'deliver at least once', so there could be multiple # messages of a type, give up immediately if self.storage and isinstance(self.storage, (BayesianPostgres, PackagePostgres)): if self.storage.get_worker_id_count(self.task_id) > 0: raise TaskAlreadyExistsError("Task with ID '%s'" " was already processed" % self.task_id) start = datetime.utcnow() try: result = self.execute(node_args) except Exception as exc: if self.add_audit_info: # `_audit` key is added to every analysis info submitted end = datetime.utcnow() result = dict() self._add_audit_info( task_result=result, task_start=start, task_end=end, node_args=node_args, ) # write the audit info to the storage self.storage.store_error( node_args=node_args, flow_name=self.flow_name, task_name=self.task_name, task_id=self.task_id, exc_info=sys.exc_info(), result=result ) raise exc finally: # remove all files that were downloaded for this task ObjectCache.wipe() end = datetime.utcnow() if result: # Ensure result complies with the defined schema (if any) before saving self.validate_result(result) if result is None: # Keep track of None results and add _audit and _release keys result = {} if self.add_audit_info: # `_audit` key is added to every analysis info submitted self._add_audit_info( task_result=result, task_start=start, task_end=end, node_args=node_args, ) return result
def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self.log.debug("Input Arguments: {}".format(arguments)) self._strict_assert(isinstance(arguments.get('ecosystem'), str)) self._strict_assert(isinstance(arguments.get('name'), str)) self._strict_assert(isinstance(arguments.get('version'), str)) db = self.storage.session try: ecosystem = Ecosystem.by_name(db, arguments['ecosystem']) except NoResultFound: raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem']) # make sure we store package name in its normalized form arguments['name'] = normalize_package_name(ecosystem.backend.name, arguments['name']) if len(pattern_ignore.findall(arguments['version'])) > 0: self.log.info("Incorrect version alert {} {}".format( arguments['name'], arguments['version'])) raise NotABugFatalTaskError("Incorrect version alert {} {}".format( arguments['name'], arguments['version'])) # Dont try ingestion for private packages if is_pkg_public(arguments['ecosystem'], arguments['name']): self.log.info("Ingestion flow for {} {}".format( arguments['ecosystem'], arguments['name'])) else: self.log.info("Private package ingestion ignored {} {}".format( arguments['ecosystem'], arguments['name'])) raise NotABugFatalTaskError("Private package alert {} {}".format( arguments['ecosystem'], arguments['name'])) p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name']) v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version']) if not arguments.get('force'): if db.query(Analysis).filter( Analysis.version_id == v.id).count() > 0: arguments['analysis_already_exists'] = True self.log.debug( "Arguments returned by initAnalysisFlow without force: {}". format(arguments)) return arguments cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR) epv_cache = ObjectCache.get_from_dict(arguments) npm_dir = self.configuration.NPM_DATA_DIR try: if not epv_cache.\ has_source_tarball(): _, source_tarball_path = IndianaJones.fetch_artifact( ecosystem=ecosystem, artifact=arguments['name'], version=arguments['version'], target_dir=cache_path) epv_cache.put_source_tarball(source_tarball_path) if ecosystem.is_backed_by(EcosystemBackend.maven): if not epv_cache.has_source_jar(): try: source_jar_path = self._download_source_jar( cache_path, ecosystem, arguments) epv_cache.put_source_jar(source_jar_path) except Exception as e: self.log.info( 'Failed to fetch source jar for maven artifact "{n}/{v}": {err}' .format(n=arguments.get('name'), v=arguments.get('version'), err=str(e))) if not epv_cache.has_pom_xml(): pom_xml_path = self._download_pom_xml( cache_path, ecosystem, arguments) epv_cache.put_pom_xml(pom_xml_path) finally: # always clean up cache shutil.rmtree(cache_path) if arguments['ecosystem'] == "npm": shutil.rmtree(npm_dir, True) a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow()) db.add(a) db.commit() arguments['document_id'] = a.id # export ecosystem backend so we can use it to easily control flow later arguments['ecosystem_backend'] = ecosystem.backend.name self.log.debug( "Arguments returned by InitAnalysisFlow are: {}".format(arguments)) return arguments
def execute(self, arguments): """Task code. :param arguments: dictionary with task arguments :return: {}, results """ self.log.debug("Input Arguments: {}".format(arguments)) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) self._strict_assert(arguments.get('ecosystem')) # make sure we store package name based on ecosystem package naming case sensitivity arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name']) db = self.storage.session try: ecosystem = Ecosystem.by_name(db, arguments['ecosystem']) except NoResultFound: raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem']) p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name']) v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version']) if not arguments.get('force'): # TODO: this is OK for now, but if we will scale and there will be # 2+ workers running this task they can potentially schedule two # flows of a same type at the same time if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0: # we need to propagate flags that were passed to flow, but not # E/P/V - this way we are sure that for example graph import is # scheduled (arguments['force_graph_sync'] == True) arguments.pop('name') arguments.pop('version') arguments.pop('ecosystem') self.log.debug("Arguments returned by initAnalysisFlow without force: {}" .format(arguments)) return arguments cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR) epv_cache = ObjectCache.get_from_dict(arguments) try: if not epv_cache.\ has_source_tarball(): _, source_tarball_path = IndianaJones.fetch_artifact( ecosystem=ecosystem, artifact=arguments['name'], version=arguments['version'], target_dir=cache_path ) epv_cache.put_source_tarball(source_tarball_path) if ecosystem.is_backed_by(EcosystemBackend.maven): if not epv_cache.has_source_jar(): try: source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments) epv_cache.put_source_jar(source_jar_path) except Exception as e: self.log.info( 'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'. format(n=arguments.get('name'), v=arguments.get('version'), err=str(e)) ) if not epv_cache.has_pom_xml(): pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments) epv_cache.put_pom_xml(pom_xml_path) finally: # always clean up cache shutil.rmtree(cache_path) a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow()) db.add(a) db.commit() arguments['document_id'] = a.id # export ecosystem backend so we can use it to easily control flow later arguments['ecosystem_backend'] = ecosystem.backend.name self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments)) return arguments