def _python_scan(self, arguments):
    """
    Run OWASP dependency-check experimental analyzer for Python artifacts

    https://jeremylong.github.io/DependencyCheck/analyzers/python-analyzer.html

    Artifacts whose path ends with 'zip' or '.whl' are scanned directly. For
    anything else the extracted sources are walked and one location is chosen,
    preferring an ``.egg-info`` directory, then a ``PKG-INFO`` file, then a
    ``METADATA`` file.

    :param arguments: dictionary with task arguments
    :return: result of the dependency-check run, or an error result when no
             scannable location was found
    """
    tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
    if tarball.endswith(('zip', '.whl')):
        return self._run_owasp_dep_check(tarball, experimental=True)

    # tar.gz seems to be not supported, fall back to the extracted content
    extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    # depcheck needs to be pointed to a specific file, we can't just scan whole directory
    egg_info_dir = pkg_info_file = metadata_file = None
    for current_dir, _subdirs, filenames in os.walk(extracted_tarball):
        if current_dir.endswith('.egg-info'):
            egg_info_dir = current_dir
        if 'PKG-INFO' in filenames:
            pkg_info_file = os.path.join(current_dir, 'PKG-INFO')
        if 'METADATA' in filenames:
            metadata_file = os.path.join(current_dir, 'METADATA')

    scan_path = egg_info_dir or pkg_info_file or metadata_file
    if scan_path:
        return self._run_owasp_dep_check(scan_path, experimental=True)

    return {'summary': ['File types not supported by OWASP dependency-check'],
            'status': 'error',
            'details': []}
def execute(self, arguments):
    """
    Collect per-language statistics for the package sources.

    The generic result is computed first; every handler registered in
    ``_LANGUAGE_ANALYZER_HANDLERS`` for a reported language then gets a chance
    to contribute entries to that language's ``metrics`` dictionary.

    :param arguments: dictionary with task arguments
    :return: dict with 'summary', 'status' and 'details' keys
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    source_path = ObjectCache.get_from_dict(arguments).get_sources()
    header, language_stats = self._get_generic_result(source_path)

    for language, stats in language_stats.items():
        for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
            metrics_data = handler(self, source_path)
            if metrics_data:
                # the 'metrics' entry is created lazily, only once a handler reports something
                stats.setdefault('metrics', {}).update(metrics_data)

    # we don't want to have possibly unique keys and we want to avoid enumerating
    # all languages that are supported by cloc - convert a dict to a list of
    # language-specific entries
    languages = []
    for language, record in language_stats.items():
        record['language'] = language
        languages.append(record)

    return {'summary': header, 'status': 'success', 'details': {'languages': languages}}
def execute(self, arguments):
    """
    Record the file type and, for non-binary files, the linguist output of
    every file in the extracted source tarball.

    :param arguments: dictionary with task arguments
    :return: dict with 'summary', 'status' and 'details' keys; 'details' holds
             one entry per inspected file
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    results = []
    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    def worker(path):
        """Inspect a single file and append its record to the shared results list."""
        mime_output = TimedCommand.get_command_output(['file', path, '-b', '-i'])
        mime = mime_output.pop()
        self.log.debug("%s mime = %s", path, mime)

        typ = TimedCommand.get_command_output(['file', path, '-b'])
        self.log.debug("%s filetype = %s", path, typ)

        if 'charset=binary' in mime:
            # linguist is only asked about files that are not reported as binary
            linguist = None
        else:
            linguist_output = TimedCommand.get_command_output(['linguist', path])
            linguist = self._parse_linguist(linguist_output)
            self.log.debug("%s linguist output = %s", path, linguist)

        record = {
            "type": typ,
            "output": linguist,
            "path": os.path.relpath(path, cache_path),
        }
        results.append(record)

    with ThreadPool(target=worker) as tp:
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            tp.add_task(path)

    return {'summary': [], 'status': 'success', 'details': results}
def execute(self, arguments):
    """
    Execute mercator and convert its output to JSON object

    :param arguments: dictionary with task arguments
    :return: whatever ``run_mercator`` returns for the selected path
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    # TODO: the abstraction was not quite right when support for Java/Maven
    # was added, hence the special case below.
    ecosystem = self.storage.get_ecosystem(arguments['ecosystem'])
    is_maven = ecosystem.is_backed_by(EcosystemBackend.maven)

    epv_cache = ObjectCache.get_from_dict(arguments)
    # for Maven the path points directly to the pom, otherwise to the extracted sources
    cache_path = epv_cache.get_pom_xml() if is_maven else epv_cache.get_extracted_source_tarball()

    return self.run_mercator(arguments, cache_path)
def execute(self, arguments):
    """
    Run ``license_check.py`` over the package sources.

    When the sources cannot be obtained, Maven packages fall back to the
    extracted source tarball; for every other ecosystem the original
    exception is logged and re-raised.

    :param arguments: dictionary with arguments
    :return: dict with 'status', 'summary' and 'details' keys; 'status' is
             'error' when the scan itself failed
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        eco = arguments.get('ecosystem')
        pkg = arguments.get('name')
        ver = arguments.get('version')
        if eco != 'maven':
            self.log.error(
                'Could not get sources for package {e}/{p}/{v}'.format(e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
    try:
        result_data['details'] = TimedCommand.get_command_output(
            ['license_check.py', cache_path], graceful=False, is_json=True)
        # status and summary are reported inside the tool output; lift them to the top level
        result_data['status'] = result_data['details'].pop('status')
        result_data['summary'] = result_data['details'].pop('summary')
    except Exception:
        # only ordinary errors are turned into an 'error' result; SystemExit and
        # KeyboardInterrupt must keep propagating
        self.log.exception("License scan failed")
        result_data['status'] = 'error'

    return result_data
def run(self, node_args):
    """
    Execute the task once and decorate its result with bookkeeping keys.

    :param node_args: dictionary with task arguments
    :return: the task result; ``None`` results are replaced by an empty dict
    :raises FatalTaskError: when storage reports the task ID as already processed
    """
    # SQS guarantees 'deliver at least once', so there could be multiple
    # messages of a type, give up immediately
    tracked_storage = self.storage and isinstance(self.storage, BayesianPostgres)
    if tracked_storage and self.storage.get_worker_id_count(self.task_id) > 0:
        raise FatalTaskError("Task with ID '%s' was already processed" % self.task_id)

    start = datetime.now()
    try:
        result = self.execute(node_args)
    finally:
        # remove all files that were downloaded for this task
        ObjectCache.wipe()
    end = datetime.now()

    if result is None:
        # Keep track of None results and add _audit and _release keys
        result = {}
    elif result:
        # Ensure result complies with the defined schema (if any) before saving
        self.validate_result(result)

    if self.add_audit_info:
        # `_audit` key is added to every analysis info submitted
        result['_audit'] = {
            'started_at': json_serial(start),
            'ended_at': json_serial(end),
            'version': 'v1'
        }
        epv = [node_args.get(key) for key in ('ecosystem', 'name', 'version')]
        result['_release'] = '{}:{}:{}'.format(*epv)

    return result
def execute(self, arguments):
    """
    Fetch BlackDuck release data for the package, running a CLI scan first
    when no data exists yet and CLI scans are allowed.

    :param arguments: dictionary with task arguments
    :return: dict with 'status', 'summary' and 'details' keys; 'status' is
             'error' for unsupported ecosystems or when no data was obtained
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    result_data = {'status': 'unknown', 'summary': [], 'details': {}}

    if not self._is_valid_ecosystem(arguments['ecosystem']):
        result_data['status'] = 'error'
        return result_data

    hub = self._get_hub()

    # BlackDuck project doesn't have a notion of ecosystem, so we need to
    # namespace the project names ourselves, so for package `crumb` in the NPM
    # ecosystem we'll end up with the name `npm-crumb`
    project = self._get_project_name(arguments)
    version = arguments['version']

    # Check if the given project had already been scanned
    data = self._release_data(hub, project, version)

    if not data and self._allow_cli_scan:
        self.log.debug("No data available for project {p} {v}".format(p=project, v=version))

        # No data available, issue a new scan and re-query release data
        source_tarball_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        command = self._prepare_command(project, version, source_tarball_path)
        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))

        scan = TimedCommand(command)
        # the password is handed over through the environment, not the command line
        status, output, error = scan.run(
            timeout=self._BLACKDUCK_CLI_TIMEOUT,
            update_env={'BD_HUB_PASSWORD': config.blackduck_password})
        self.log.debug("status = %s, error = %s", status, error)
        self.log.debug("output = %s", output)

        data = self._release_data(hub, project, version)

    self.log.debug("Release data for project {p} {v}: {d}".format(p=project, v=version, d=data))
    result_data['details'] = data
    if data:
        result_data['status'] = 'success'
    else:
        result_data['status'] = 'error'

    return result_data
def execute(self, arguments):
    """
    Run static analysis on the source tarball and summarise defects per checker.

    :param arguments: dictionary with task arguments
    :return: dict with 'status', 'summary' and 'details' keys; on failure
             'status' is 'error' and the other keys keep their empty defaults
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    result_data = {'status': 'unknown', 'summary': [], 'details': []}

    source_tarball_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
    sa = StaticAnalysis(source_tarball_path)

    try:
        analysis_result = sa.analyze()

        # make output reproducible - scanning the same input multiple times
        # should always produce the same output
        scan_info = analysis_result["scan"]
        for volatile_key in ("time-created", "time-finished", "host", "store-results-to"):
            del scan_info[volatile_key]

        stats = {}
        for defect in analysis_result["defects"]:
            checker_stats = stats.setdefault(defect["checker"], {"count": 0})
            checker_stats["count"] += 1
            # not every defect carries a CWE; the last one seen for a checker wins
            if "cwe" in defect:
                checker_stats["cwe"] = defect["cwe"]

        result_data['summary'] = stats
        result_data['status'] = 'success'
        result_data['details'] = analysis_result
    except Exception as ex:
        self.log.error("static analysis was not successful: %r", ex)
        result_data['status'] = 'error'

    return result_data
def execute(self, arguments):
    """
    Run ``binwalk -B`` on every file found under the cached source path and
    collect the parsed output per file.

    :param arguments: dictionary with task arguments
    :return: dict with 'summary', 'status' and 'details' keys; 'details' holds
             one entry per scanned file
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    # NOTE(review): the walk starts from the source tarball path rather than an
    # extracted directory - confirm get_all_files_from handles that as intended
    cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

    results = []
    for path in get_all_files_from(cache_path, path_filter=skip_git_files):
        self.log.debug("path = %s", path)

        status, output, error = TimedCommand(['binwalk', '-B', path]).run(timeout=60)
        self.log.debug("status = %s, error = %s", status, error)
        self.log.debug("output = %s", output)

        parsed_binwalk = self.parse_binwalk(output)
        relative_path = os.path.relpath(path, cache_path)
        results.append({"path": relative_path, "output": parsed_binwalk})

    return {'summary': [], 'status': 'success', 'details': results}
def execute(self, arguments):
    """
    Run ``oscryptocatcher`` on the extracted source tarball.

    :param arguments: dictionary with task arguments
    :return: dict with 'status', 'summary' and 'details' keys; 'status' is
             'error' when the command failed or its output lacked the
             expected keys
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    results = {'status': 'unknown', 'summary': {}, 'details': []}

    try:
        oscc = TimedCommand.get_command_output(
            ['oscryptocatcher', '--subdir-in-result', cache_path],
            graceful=False, is_json=True)

        self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
        results['details'] = oscc['details']
        results['summary'] = oscc['summary']
        results['status'] = 'success'
    except Exception:
        # leave a trace of the failure instead of swallowing it silently, and
        # let SystemExit / KeyboardInterrupt propagate
        self.log.exception("oscryptocatcher scan failed")
        results['status'] = 'error'

    return results
def execute(self, arguments):
    """
    Compute digests of every file in the extracted sources and of the source
    tarball itself.

    :param arguments: dictionary with task arguments
    :return: dict with 'summary', 'status' and 'details' keys; the last entry
             of 'details' describes the tarball (computed with artifact=True)
    """
    for required in ('ecosystem', 'name', 'version'):
        self._strict_assert(arguments.get(required))

    epv_cache = ObjectCache.get_from_dict(arguments)
    cache_path = epv_cache.get_extracted_source_tarball()

    results = [
        self.compute_digests(cache_path, file_path)
        for file_path in get_all_files_from(cache_path, path_filter=skip_git_files)
    ]

    # In case of nodejs, prior to npm-2.x.x (Fedora 24)
    # npm client was repackaging modules on download.
    # It modified file permissions inside package.tgz so they matched UID/GID
    # of a user running npm command. Therefore its digest was different
    # than that of a tarball downloaded directly from registry.npmjs.org.
    source_tarball_path = epv_cache.get_source_tarball()
    artifact_digests = self.compute_digests(
        source_tarball_path, source_tarball_path, artifact=True)
    results.append(artifact_digests)

    return {'summary': [], 'status': 'success', 'details': results}
def execute(self, arguments):
    """
    Prepare an analysis run: make sure the E/P/V database rows exist, fetch
    the artifacts into the object cache and create the ``Analysis`` record.

    Unless ``force`` is set, an already analysed version short-circuits the
    flow: the E/P/V keys are removed from ``arguments`` and the rest is
    returned unchanged.

    :param arguments: dictionary with task arguments; modified in place
    :return: the ``arguments`` dict, extended with 'document_id' when a new
             analysis was created
    """
    for required in ('name', 'version', 'ecosystem'):
        self._strict_assert(arguments.get(required))

    db = self.storage.session
    ecosystem_row = Ecosystem.by_name(db, arguments['ecosystem'])
    package = Package.get_or_create(db, ecosystem_id=ecosystem_row.id, name=arguments['name'])
    version = Version.get_or_create(db, package_id=package.id, identifier=arguments['version'])

    if not arguments.get('force'):
        # TODO: this is OK for now, but if we will scale and there will be 2+
        # workers running this task they can potentially schedule two flows of
        # a same type at the same time
        existing_analyses = db.query(Analysis).filter(Analysis.version_id == version.id).count()
        if existing_analyses > 0:
            # we need to propagate flags that were passed to flow, but not
            # E/P/V - this way we are sure that for example graph import is
            # scheduled (arguments['force_graph_sync'] == True)
            for epv_key in ('name', 'version', 'ecosystem'):
                arguments.pop(epv_key)
            return arguments

    cache_path = mkdtemp(dir=self.configuration.worker_data_dir)
    epv_cache = ObjectCache.get_from_dict(arguments)
    ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])

    try:
        if not epv_cache.has_source_tarball():
            _, source_tarball_path = IndianaJones.fetch_artifact(
                ecosystem=ecosystem,
                artifact=arguments['name'],
                version=arguments['version'],
                target_dir=cache_path)
            epv_cache.put_source_tarball(source_tarball_path)

        if ecosystem.is_backed_by(EcosystemBackend.maven):
            if not epv_cache.has_source_jar():
                # a missing source jar is tolerated, it is only logged
                try:
                    source_jar_path = self._download_source_jar(cache_path, ecosystem, arguments)
                    epv_cache.put_source_jar(source_jar_path)
                except Exception as err:
                    self.log.info(
                        'Failed to fetch source jar for maven artifact "{e}/{p}/{v}": {err}'
                        .format(e=arguments.get('ecosystem'),
                                p=arguments.get('name'),
                                v=arguments.get('version'),
                                err=str(err)))

            if not epv_cache.has_pom_xml():
                pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                epv_cache.put_pom_xml(pom_xml_path)
    finally:
        # always clean up cache
        shutil.rmtree(cache_path)

    analysis = Analysis(version=version, access_count=1, started_at=datetime.datetime.now())
    db.add(analysis)
    db.commit()

    arguments['document_id'] = analysis.id
    return arguments
def _maven_scan(self, arguments):
    """
    Run OWASP dependency-check

    The cached source tarball path is passed to the scanner with the
    experimental analyzers switched off.

    :param arguments: dictionary with task arguments
    :return: result of the dependency-check run
    """
    epv_cache = ObjectCache.get_from_dict(arguments)
    return self._run_owasp_dep_check(epv_cache.get_source_tarball(), experimental=False)
def execute(self, arguments):
    """
    Run the scancode license scan over the package sources.

    When the sources cannot be obtained, Maven packages fall back to the
    extracted source tarball; for every other ecosystem the original
    exception is logged and re-raised.

    :param arguments: dictionary with task arguments
    :return: dict with 'status', 'summary' and 'details' keys; 'status' is
             'error' when the scan or the processing of its output failed
    """
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        eco = arguments.get('ecosystem')
        pkg = arguments.get('name')
        ver = arguments.get('version')
        if eco != 'maven':
            self.log.error(
                'Could not get sources for package {e}/{p}/{v}'.format(e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
    try:
        command = [
            os.path.join(os.getenv('SCANCODE_PATH', '/opt/scancode-toolkit/'), 'scancode'),
            # Scan for licenses
            '--license',
            # Do not return license matches with scores lower than this score
            '--license-score', SCANCODE_LICENSE_SCORE,
            # Files without findings are omitted
            '--only-findings',
            # Use n parallel processes
            '--processes', SCANCODE_PROCESSES,
            # Do not print summary or progress messages
            '--quiet',
            # Strip the root directory segment of all paths
            '--strip-root',
            # Stop scanning a file if scanning takes longer than a timeout in seconds
            '--timeout', SCANCODE_TIMEOUT,
            cache_path
        ]
        output = TimedCommand.get_command_output(command,
                                                 graceful=False,
                                                 is_json=True,
                                                 timeout=600)
        details = self.process_output(output)
        result_data['details'] = details
        result_data['status'] = 'success'
        result_data['summary'] = {'sure_licenses': list(details['licenses'].keys())}
    except Exception:
        # only ordinary errors are turned into an 'error' result; SystemExit and
        # KeyboardInterrupt must keep propagating
        self.log.exception("License scan failed")
        result_data['status'] = 'error'

    return result_data