class BlackDuckRelease(object):
    """
    Release object consisting of a version string, a unique identifier and
    `datetime.datetime` information about when this particular version was released
    """

    @schema.input(SchemaRef("blackduck-release", "1-0-0"))
    def __init__(self, json_data, project):
        self._version = json_data['version']
        self._id = json_data['versionId']
        self._released_at = datetime.strptime(json_data['releasedOn'],
                                              "%Y-%m-%dT%H:%M:%S.%fZ")
        self._project = project

    @property
    def project(self):
        """ Project this release belongs to """
        return self._project

    @property
    def version(self):
        """ Release version """
        return self._version

    @property
    def id(self):
        """ Unique identifier """
        return self._id

    @property
    def released_at(self):
        """ Release date time """
        return self._released_at
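# --- illustrative sketch, not part of the original module ---
# A minimal example (with made-up field values) of the JSON shape BlackDuckRelease
# expects and how the "releasedOn" timestamp is parsed: ISO-8601 with milliseconds
# and a literal trailing "Z", which matches the "%Y-%m-%dT%H:%M:%S.%fZ" format above.
from datetime import datetime

_sample_release = {"version": "1.2.3", "versionId": "rel-abc-123",
                   "releasedOn": "2017-03-01T12:34:56.789Z"}
_released_at = datetime.strptime(_sample_release["releasedOn"], "%Y-%m-%dT%H:%M:%S.%fZ")
assert _released_at.year == 2017 and _released_at.microsecond == 789000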
class OSCryptoCatcherTask(BaseTask):
    _analysis_name = 'crypto_algorithms'
    description = "Runs oscryptocatcher tool for matching crypto algorithms"
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(['oscryptocatcher', '--subdir-in-result', cache_path],
                                                   graceful=False, is_json=True)
            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            results['status'] = 'error'

        return results
class ComponentAnalyses(ResourceWithSchema):
    method_decorators = [login_required]

    schema_ref = SchemaRef('analyses_graphdb', '1-2-0')

    @staticmethod
    def get(ecosystem, package, version):
        if ecosystem == 'maven':
            package = MavenCoordinates.normalize_str(package)
        package = case_sensitivity_transform(ecosystem, package)
        result = get_analyses_from_graph(ecosystem, package, version)
        current_app.logger.warn("%r" % result)

        if result is not None:
            # Known component for Bayesian
            return result

        if os.environ.get("INVOKE_API_WORKERS", "") == "1":
            # Enter the unknown path
            server_create_analysis(ecosystem, package, version, api_flow=True,
                                   force=False, force_graph_sync=True)
            msg = "Package {ecosystem}/{package}/{version} is unavailable. The package will be available shortly,"\
                  " please retry after some time.".format(ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(202, msg)
        else:
            server_create_analysis(ecosystem, package, version, api_flow=False,
                                   force=False, force_graph_sync=True)
            msg = "No data found for {ecosystem} Package {package}/{version}".format(
                ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(404, msg)
class ComponentsInRange(ResourceWithSchema):
    schema_ref = SchemaRef('version_range_resolver', '1-0-0')

    def get(self, ecosystem):
        query = request.args.get('q')
        eco = Ecosystem.by_name(rdb.session, ecosystem)
        fetcher = CucosReleasesFetcher(eco, rdb.session)
        now = datetime.datetime.now()

        # Instantiate two different solvers, one using a custom fetcher to fetch
        # matching releases from Bayesian DB and the other one fetching from
        # upstream repositories.
        # The data from these two solvers then provide information as to:
        #   1) Which packages in the range we have already analysed and have
        #      information about
        #   2) Other packages from upstream repositories which match the version
        #      specification
        cucos_solver, solver = get_ecosystem_solver(eco, with_fetcher=fetcher),\
                               get_ecosystem_solver(eco)

        ours = cucos_solver.solve([query], all_versions=True)
        upstream = solver.solve([query], all_versions=True)

        ours_nums = set() if not ours else set(next(iter(ours.values())))
        upstreams_nums = set() if not upstream else set(next(iter(upstream.values())))

        return {
            'query': query,
            'detail': {
                'analysed': ours,
                'upstream': upstream,
                'difference': list(upstreams_nums - ours_nums)
            },
            'resolved_at': str(now)
        }
class AnalysisBase(ResourceWithSchema): """Base class for different endpoints returning analyses.""" schema_ref = SchemaRef('component_analyses', '1-1-3') def add_schema(self, response, status_code, method): """Overrides add_schema to be able to add component analyses schemas.""" super().add_schema(response, status_code, method) if status_code == 200 and method == 'GET': for analysis_name, analysis in response.get('analyses', {}).items(): if analysis is not None and 'schema' in analysis: analysis['schema'][ 'url'] = PublishedSchemas.get_component_analysis_schema_url( name=analysis['schema']['name'], version=analysis['schema']['version']) return response def _parse_args(self): args = ['fields', 'debuginfo'] arg_parser = reqparse.RequestParser() for arg in args: arg_parser.add_argument(arg, default='') parsed_args = arg_parser.parse_args() result = {k: parsed_args[k] for k in args} result['debuginfo'] = result['debuginfo'].lower() == 'true' return result def _get_projection(self, fields): projection = {} if fields: for f in fields.split(','): projection[f] = 1 return projection or None def _do_analysis_projection(self, analysis, fields): pass def _inc_access_counter(self, analysis): analysis.access_count += 1 rdb.session.commit() def _sanitize_result(self, result, debuginfo=False): result['_release'] = result.pop('release', None) if debuginfo: result['_audit'] = result.pop('audit', None) else: result.pop('id', None) result.pop('audit', None) result.pop('subtasks', None) # Do not show init task result.get('analyses', {}).pop('InitAnalysisFlow', None) for analysis in result.get('analyses', {}): if result['analyses'][analysis]: result['analyses'][analysis].pop('_audit', None) return result
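# --- illustrative sketch, not part of the original module ---
# What the "fields" query parameter handled by _parse_args/_get_projection above
# evaluates to: a comma-separated field list becomes a MongoDB-style projection
# dict, and an empty string yields None (no projection). Field names are examples.
def _projection_sketch(fields):
    projection = {}
    if fields:
        for f in fields.split(','):
            projection[f] = 1
    return projection or None

assert _projection_sketch("package,version") == {"package": 1, "version": 1}
assert _projection_sketch("") is None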
class LinguistTask(BaseTask):
    _analysis_name = 'languages'
    description = "GitHub's tool to figure out what language is used in code"
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _parse_linguist(self, output):
        if not output:
            return None

        def extract_value(line):
            """ `language: Python` -> `Python` """
            return line.split(':', 1)[1].strip()

        lines_matcher = re.compile(r'(\d+) lines \((\d+) sloc\)')
        m = lines_matcher.search(output[0])

        lines, sloc = 0, 0
        if m:
            lines, sloc = int(m.group(1)), int(m.group(2))

        tml = zip(['type', 'mime', 'language'],
                  [extract_value(l) for l in output[1:4]])
        data = dict(tml, lines=lines, sloc=sloc)
        return data

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        results = []
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        def worker(path):
            mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
            self.log.debug("%s mime = %s", path, mime)
            typ = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, typ)

            linguist = None
            if 'charset=binary' not in mime:
                linguist = self._parse_linguist(
                    TimedCommand.get_command_output(['linguist', path]))
                self.log.debug("%s linguist output = %s", path, linguist)

            results.append({
                "type": typ,
                "output": linguist,
                "path": os.path.relpath(path, cache_path),
            })

        with ThreadPool(target=worker) as tp:
            for path in get_all_files_from(cache_path, path_filter=skip_git_files):
                tp.add_task(path)

        return {'summary': [], 'status': 'success', 'details': results}
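# --- illustrative sketch, not part of the original module ---
# The linguist CLI output that _parse_linguist above expects looks roughly like the
# sample below (exact wording may differ between linguist versions); the same
# parsing rules are repeated here standalone.
import re

_sample_linguist_output = [
    "42 lines (35 sloc)  1.1 KB",
    "type:      Text",
    "mime type: application/x-python",
    "language:  Python",
]
_m = re.search(r'(\d+) lines \((\d+) sloc\)', _sample_linguist_output[0])
_lines, _sloc = (int(_m.group(1)), int(_m.group(2))) if _m else (0, 0)
_values = [line.split(':', 1)[1].strip() for line in _sample_linguist_output[1:4]]
_data = dict(zip(['type', 'mime', 'language'], _values), lines=_lines, sloc=_sloc)
assert _data == {'type': 'Text', 'mime': 'application/x-python',
                 'language': 'Python', 'lines': 42, 'sloc': 35}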
class DigesterTask(BaseTask):
    _analysis_name = 'digests'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    description = 'Computes various digests of all files found in target cache path'

    def compute_ssdeep(self, target):
        """ Compute SSdeep piece-wise linear hash of target """
        # 0 : ssdeep header
        # 1 : hash,filename
        data = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
        try:
            return data[1].split(',')[0].strip()
        except IndexError:
            self.log.error("unable to compute ssdeep of %r", target)
            raise RuntimeError("can't compute digest of %r" % target)

    def compute_digests(self, cache_path, f, artifact=False):
        f_digests = {
            'sha256': compute_digest(f, 'sha256'),
            'sha1': compute_digest(f, 'sha1'),
            'md5': compute_digest(f, 'md5'),
            'ssdeep': self.compute_ssdeep(f)
        }

        if artifact:
            f_digests['artifact'] = True
            f_digests['path'] = os.path.basename(f)
        else:
            f_digests['path'] = os.path.relpath(f, cache_path)

        return f_digests

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        epv_cache = ObjectCache.get_from_dict(arguments)
        cache_path = epv_cache.get_extracted_source_tarball()

        results = []
        for f in get_all_files_from(cache_path, path_filter=skip_git_files):
            results.append(self.compute_digests(cache_path, f))

        # In case of nodejs, prior to npm-2.x.x (Fedora 24) the npm client was
        # repackaging modules on download. It modified file permissions inside
        # package.tgz so they matched the UID/GID of the user running the npm
        # command. Therefore its digest was different from that of a tarball
        # downloaded directly from registry.npmjs.org.
        source_tarball_path = epv_cache.get_source_tarball()
        results.append(self.compute_digests(source_tarball_path, source_tarball_path,
                                            artifact=True))

        return {'summary': [], 'status': 'success', 'details': results}
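# --- illustrative sketch, not part of the original module ---
# compute_digest used above is a project helper that is not shown here; a minimal
# hashlib-based equivalent (an assumption about its behaviour) could look like this.
import hashlib

def compute_digest_sketch(path, algorithm='sha256', chunk_size=65536):
    """Hash a file in chunks and return the hex digest."""
    h = hashlib.new(algorithm)
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()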
def test_schema_lookup(self, tmpdir):
    library = SchemaLibrary(str(tmpdir))
    requested_schema = SchemaRef("example", "1-0-0")
    with pytest.raises(SchemaLookupError):
        library.load_schema(requested_schema)

    schema_path = tmpdir.join("example-v1-0-0.schema.json")
    dummy_schema = {"dummy-schema": "example"}
    serialized_schema = json.dumps(dummy_schema).encode('utf-8')
    schema_path.write_binary(serialized_schema)

    assert library.read_binary_schema(requested_schema) == serialized_schema
    assert library.load_schema(requested_schema) == dummy_schema
class StackAnalysesByGraphGET(ResourceWithSchema): method_decorators = [login_required] schema_ref = SchemaRef('stack_analyses', '2-1-4') @staticmethod def get(external_request_id): try: results = rdb.session.query(WorkerResult)\ .filter(WorkerResult.external_request_id == external_request_id, or_(WorkerResult.worker == "stack_aggregator", WorkerResult.worker == "recommendation")) if results.count() <= 0: raise HTTPError( 202, "Analysis for request ID '{t}' is in progress".format( t=external_request_id)) except SQLAlchemyError: raise HTTPError( 500, "Worker result for request ID '{t}' doesn't exist yet".format( t=external_request_id)) try: recommendation_result = {} audit = "" external_request_id = "" manifest_response = [] for row in results: result = row.to_dict() if result["worker"] == "stack_aggregator": audit = result["task_result"]["_audit"] external_request_id = result["external_request_id"] manifest_response.append(result["task_result"]) else: recommendation_result = { "recommendations": result["task_result"]["recommendations"] } response = { "started_at": audit["started_at"], "finished_at": audit["ended_at"], "request_id": external_request_id, "result": manifest_response, "recommendation": recommendation_result } return response except: raise HTTPError( 500, "Error creating response for request {t}".format( t=external_request_id))
def test_bundled_schema_lookup(self, tmpdir):
    pkgdir = tmpdir.mkdir(tmpdir.basename)
    pkgdir.ensure("__init__.py")
    schemadir = pkgdir.mkdir("schemas")
    module = pkgdir.pyimport()
    library = BundledSchemaLibrary("schemas", module.__name__)
    requested_schema = SchemaRef("example", "1-0-0")
    with pytest.raises(SchemaLookupError):
        library.load_schema(requested_schema)

    schema_path = schemadir.join("example-v1-0-0.schema.json")
    dummy_schema = {"dummy-schema": "example"}
    serialized_schema = json.dumps(dummy_schema).encode('utf-8')
    schema_path.write_binary(serialized_schema)

    assert library.read_binary_schema(requested_schema) == serialized_schema
    assert library.load_schema(requested_schema) == dummy_schema
class LicenseCheckTask(BaseTask):
    _analysis_name = 'source_licenses'
    description = "Check licences of all files of a package"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    def execute(self, arguments):
        """ task code

        :param arguments: dictionary with arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception as e:
            eco = arguments.get('ecosystem')
            pkg = arguments.get('name')
            ver = arguments.get('version')
            if arguments['ecosystem'] != 'maven':
                self.log.error('Could not get sources for package {e}/{p}/{v}'.format(
                    e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        try:
            result_data['details'] = TimedCommand.get_command_output(
                ['license_check.py', cache_path], graceful=False, is_json=True)
            result_data['status'] = result_data['details'].pop('status')
            result_data['summary'] = result_data['details'].pop('summary')
        except Exception:
            self.log.exception("License scan failed")
            result_data['status'] = 'error'

        return result_data
class BinwalkTask(BaseTask):
    _analysis_name = 'binary_data'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    description = "Find and extract interesting files / data from binary images"

    def parse_binwalk(self, output):
        if not output:
            return None
        import re
        matcher = re.compile(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$')
        matched = []
        for line in output:
            match = matcher.match(line)
            if match:
                matched.append(match.group(1))
        return matched

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
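# --- illustrative sketch, not part of the original module ---
# Typical binwalk output and what parse_binwalk above keeps: header and separator
# lines do not match the offset regex and are dropped, only descriptions survive.
import re

_sample_binwalk_output = [
    "DECIMAL       HEXADECIMAL     DESCRIPTION",
    "--------------------------------------------------------------",
    "0             0x0             gzip compressed data, from Unix",
]
_binwalk_matcher = re.compile(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$')
_matched = []
for _line in _sample_binwalk_output:
    _match = _binwalk_matcher.match(_line)
    if _match:
        _matched.append(_match.group(1))
assert _matched == ["gzip compressed data, from Unix"]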
class StackAnalysesByGraphGET(ResourceWithSchema):
    method_decorators = [login_required]
    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def get(external_request_id):
        stack_result = retrieve_worker_result(rdb, external_request_id, "stack_aggregator")
        reco_result = retrieve_worker_result(rdb, external_request_id, "recommendation")

        if stack_result is None and reco_result is None:
            raise HTTPError(202, "Analysis for request ID '{t}' is in progress".format(
                t=external_request_id))

        if stack_result == -1 and reco_result == -1:
            raise HTTPError(404, "Worker result for request ID '{t}' doesn't exist yet".format(
                t=external_request_id))

        started_at = None
        finished_at = None
        manifest_response = []
        recommendation = {}

        if stack_result is not None and 'task_result' in stack_result:
            if stack_result["task_result"] is not None:
                started_at = stack_result["task_result"]["_audit"]["started_at"]
                finished_at = stack_result["task_result"]["_audit"]["ended_at"]
                manifest_response.append(stack_result["task_result"])

        if reco_result is not None and 'task_result' in reco_result:
            if reco_result["task_result"] is not None:
                recommendation = reco_result['task_result']['recommendations']

        return {
            "started_at": started_at,
            "finished_at": finished_at,
            "request_id": external_request_id,
            "result": manifest_response,
            "recommendation": recommendation
        }
class BlackDuckProject(object):
    """
    Project containing information about a specific {ecosystem}-{package} pair
    """

    @schema.input(SchemaRef("blackduck-project", "1-0-0"))
    def __init__(self, json_data):
        self._source = json_data
        self._name = json_data['name']
        self._id = json_data['id']
        self._canonical_release_id = json_data['canonicalReleaseId']
        self._urls = {k: v for k, v in json_data.items() if k.endswith('Url')}

    @property
    def name(self):
        """ Name of the project """
        return self._name

    @property
    def id(self):
        """ Unique identifier of the project """
        return self._id

    @property
    def urls(self):
        """ Mapping of additional URLs for this project, keyed by their JSON field name """
        return self._urls

    @property
    def canonical_release_id(self):
        """ Latest release for the given project (in terms of version number) """
        return self._canonical_release_id

    @property
    def source(self):
        """ Source JSON from which this object was parsed """
        return self._source
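# --- illustrative sketch, not part of the original module ---
# The minimal JSON shape the BlackDuckProject constructor above expects (values are
# made up) and how every key ending in "Url" ends up in the `urls` mapping.
_sample_project = {
    "name": "npm-crumb",
    "id": "proj-1",
    "canonicalReleaseId": "rel-9",
    "projectUrl": "https://hub.example.com/projects/proj-1",
    "releasesUrl": "https://hub.example.com/projects/proj-1/releases",
}
_urls = {k: v for k, v in _sample_project.items() if k.endswith('Url')}
assert set(_urls) == {"projectUrl", "releasesUrl"}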
def test_bundled_dynamic_schema_lookup(self, tmpdir, monkeypatch):
    pkgdir = tmpdir.mkdir(tmpdir.basename)
    pkgdir.ensure("__init__.py")
    schemadir = pkgdir.mkdir("schemas")
    schemadir.ensure("__init__.py")
    library = BundledDynamicSchemaLibrary('.'.join([tmpdir.basename, "schemas"]))
    schema1 = SchemaRef("example", "1-0-0")
    schema2 = SchemaRef("example2", "1-0-0")
    schema3 = SchemaRef("example3", "1-0-0")
    schema4 = SchemaRef("example4", "1-0-0")
    schema5 = SchemaRef("example4", "2-0-0")  # intentionally example4
    schema6 = SchemaRef("example6", "2-0-0")

    with pytest.raises(SchemaImportError):
        library.load_schema_class_and_role(schema1)

    # sch2 doesn't have the ROLE_v1_0_0 variable
    sch2 = "import jsl;\nclass Schema(jsl.Document):\n x = jsl.StringField()\n"
    # sch3 doesn't have the THE_SCHEMA variable
    sch3 = sch2 + "\nROLE_v1_0_0 = 'v1-0-0'\n"
    # sch4 is ok
    sch4 = sch3 + "\nTHE_SCHEMA = Schema\n"
    # no sch5; sch6 is ok and has two roles
    sch6 = sch4 + "\nROLE_v2_0_0 = 'v2-0-0'\n"

    schemadir.join("example2.py").write(sch2)
    schemadir.join("example3.py").write(sch3)
    schemadir.join("example4.py").write(sch4)
    schemadir.join("example6.py").write(sch6)
    monkeypatch.syspath_prepend(pkgdir.dirname)

    with pytest.raises(SchemaModuleAttributeError):
        library.load_schema_class_and_role(schema2)
    with pytest.raises(SchemaModuleAttributeError):
        library.load_schema_class_and_role(schema3)

    klass, role = library.load_schema_class_and_role(schema4)
    assert "x" in dir(klass)
    assert role == "v1-0-0"

    with pytest.raises(SchemaModuleAttributeError):
        # schema5 points at example4 as well, but requires version 2-0-0, which example4 lacks
        library.load_schema_class_and_role(schema5)

    klass6, role6 = library.load_schema_class_and_role(schema6)
    assert "x" in dir(klass6)
    assert role6 == "v2-0-0"
class DownstreamUsageTask(BaseTask): """Queries Red Hat's internal toolchain for downstream component usage - queries Anitya for downstream package names - uses the package name and component version to query: - Brew for internal SRPM and build details - the Pulp CDN for redistribution details """ _analysis_name = 'redhat_downstream' description = 'Queries Red Hat internal toolchain for downstream usage' schema_ref = SchemaRef(_analysis_name, '2-2-1') _backend_to_anitya_ecosystem = { EcosystemBackend.npm: 'npm', EcosystemBackend.maven: 'maven', EcosystemBackend.pypi: 'pypi', EcosystemBackend.rubygems: 'rubygems' } _ecosystem_to_prefix = { 'npm': 'nodejs', 'pypi': 'python', 'rubygems': 'rubygem' } # Give CLI 10 minutes to retrieve results _BREWUTILS_CLI_TIMEOUT = 600 def _get_artifact_hash(self, algorithm=None): wr = self.parent_task_result('digests') if wr: for details in wr['details']: if details.get('artifact'): return details[algorithm or 'md5'] return None @staticmethod def _prefix_package_name(name, ecosystem): prefix = DownstreamUsageTask._ecosystem_to_prefix.get(ecosystem, '') if prefix: return '{p}-{n}'.format(p=prefix, n=name) return name def _fetch_anitya_project(self, ecosystem, package): eco_model = self.storage.get_ecosystem(ecosystem) backend = self._backend_to_anitya_ecosystem.get(eco_model.backend, None) if backend is None: raise ValueError('Don\'t know how to add ecosystem {e} with backend {b} to Anitya'. format(e=ecosystem, b=eco_model.backend)) api_path = '/api/by_ecosystem/{e}/{p}/'.format(e=ecosystem, p=package) anitya_url = config.anitya_url try: return _query_anitya_url(anitya_url, api_path) except (requests.HTTPError, requests.ConnectionError): msg = 'Failed to contact Anitya server at {}' self.log.exception(msg.format(config.anitya_url)) return None def _get_cdn_metadata(self, srpm_filename): """Try to retrieve Pulp CDN metadata""" try: pulp = Pulp() except ValueError as e: self.log.error(e) return None try: metadata = pulp.get_cdn_metadata_for_srpm(srpm_filename) except Exception as e: self.log.exception(e) return None return metadata def _add_mvn_results(self, result_summary, anitya_mvn_names, version): def _compare_version(downstream, upstream): dv = downstream if 'redhat' in dv: # remove ".redhat-X" or "-redhat-X" suffix dv = dv[:dv.find('redhat')-1] if dv == upstream: return True else: return False downstream_rebuilds = [] for name in anitya_mvn_names: metadata_url = '{repo}/{pkg}/maven-metadata.xml'.format(repo=RH_MVN_GA_REPO, pkg=mvn_pkg_to_repo_path(name)) res = requests.get(metadata_url) if res.status_code != 200: self.log.info('Metadata for package {pkg} not found in {repo} (status {code})'. 
format(pkg=name, repo=RH_MVN_GA_REPO, code=res.status_code)) continue versions = anymarkup.parse(res.text)['metadata']['versioning']['versions']['version'] # make sure 'versions' is a list (it's a string if there is just one version) if not isinstance(versions, list): versions = [versions] self.log.info('Found versions {v} for package {p}'.format(v=versions, p=name)) for v in versions: if _compare_version(v, version): downstream_rebuilds.append(v) result_summary['rh_mvn_matched_versions'] = downstream_rebuilds if downstream_rebuilds: # For now, we don't distinguish products, we just use general "Middleware" # for all Maven artifacts result_summary['all_rhsm_product_names'].append('Middleware') @staticmethod def _is_inside_rh(): """Returns True if running on RH network, False otherwise.""" is_inside = False try: is_inside = int(os.environ.get("OPENSHIFT_DEPLOYMENT", 0)) == 0 except ValueError: pass return is_inside def execute(self, arguments): self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) eco = arguments['ecosystem'] pkg = arguments['name'] tool_responses = {} result_summary = { 'package_names': [], 'registered_srpms': [], 'all_rhn_channels': [], 'all_rhsm_content_sets': [], 'all_rhsm_product_names': [] } result_data = {'status': 'error', 'summary': result_summary, 'details': tool_responses } # bail out early; we need access to internal services or the package is from Maven ecosystem, # otherwise we can't comment on downstream usage is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven) if not self._is_inside_rh() and not is_maven: return result_data self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg)) res = self._fetch_anitya_project(eco, pkg) anitya_rpm_names = [] anitya_mvn_names = [] if res is None: result_data['status'] = 'error' elif res.status_code == 200: self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg)) anitya_response = res.json() tool_responses['redhat_anitya'] = anitya_response # For now, we assume all downstreams are ones we care about for entry in anitya_response['packages']: if entry['distro'] == RH_RPM_DISTRO_NAME: anitya_rpm_names.append(entry['package_name']) elif entry['distro'] == RH_MVN_DISTRO_NAME: anitya_mvn_names.append(entry['package_name']) else: self.log.warning( 'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'. format(d=entry['distro'], o=entry['package_name'], p=pkg) ) self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names)) self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names)) # TODO: Report 'partial' here and switch to 'success' at the end result_data['status'] = 'success' else: msg = 'Failed to find Anitya project {e}/{p}. 
Anitya response: {r}' self.log.error(msg.format(e=eco, p=pkg, r=res.text)) result_data['status'] = 'error' if self._is_inside_rh(): # we have candidate downstream name mappings, check them against Brew seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)] self.log.debug('Checking candidate names in Brew: {}'.format(seed_names)) args = ['brew-utils-cli', '--version', arguments['version']] artifact_hash = self._get_artifact_hash(algorithm='sha256') if artifact_hash: args += ['--digest', artifact_hash] args += seed_names self.log.debug("Executing command, timeout={timeout}: {cmd}".format(timeout=self._BREWUTILS_CLI_TIMEOUT, cmd=args)) tc = TimedCommand(args) status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT) self.log.debug("status = %s, error = %s", status, error) output = ''.join(output) self.log.debug("output = %s", output) if not output: raise TaskError("Error running command %s" % args) brew = json.loads(output) result_summary['package_names'] = brew['packages'] result_summary['registered_srpms'] = brew['response']['registered_srpms'] tool_responses['brew'] = brew['response']['brew'] # we have SRPM details, fetch details on where the RPMs are shipped tool_responses['pulp_cdn'] = pulp_responses = [] rhn_channels = set() rhsm_content_sets = set() rhsm_product_names = set() for srpm_summary in result_summary['registered_srpms']: srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'], v=srpm_summary['version'], r=srpm_summary['release']) cdn_metadata = self._get_cdn_metadata(srpm_filename) if cdn_metadata is None: msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}' self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename)) continue pulp_responses.append(cdn_metadata) srpm_summary['published_in'] = cdn_metadata['rhsm_product_names'] rhn_channels.update(cdn_metadata['rhn_channels']) rhsm_content_sets.update(cdn_metadata['rhsm_content_sets']) rhsm_product_names.update(cdn_metadata['rhsm_product_names']) result_summary['all_rhn_channels'] = sorted(rhn_channels) result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets) result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names) self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version']) return result_data
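# --- illustrative sketch, not part of the original module ---
# How the "redhat" suffix handling inside _add_mvn_results above behaves for typical
# downstream Maven version strings (the sample versions are made up).
def _compare_version_sketch(downstream, upstream):
    dv = downstream
    if 'redhat' in dv:
        # remove ".redhat-X" or "-redhat-X" suffix, including the separator character
        dv = dv[:dv.find('redhat') - 1]
    return dv == upstream

assert _compare_version_sketch("1.2.3.redhat-1", "1.2.3")
assert _compare_version_sketch("1.2.3-redhat-00002", "1.2.3")
assert not _compare_version_sketch("1.2.4", "1.2.3")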
class DependencySnapshotTask(BaseTask):
    _analysis_name = 'dependency_snapshot'
    description = 'Task that analyzes dependencies'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _collect_dependencies(self):
        """ Return all dependencies for current analysis flow (operates on parent mercator result)

        :return: List[str], list of dependencies
        """
        wr = self.parent_task_result('metadata')
        if not isinstance(wr, dict):
            raise TaskError('metadata task result has unexpected type: {}; expected dict'.
                            format(type(wr)))

        # there can be details about multiple manifests in the metadata, therefore we
        # collect dependency specifications from all of them and exclude obvious
        # duplicates along the way
        dependencies = list({dep for m in wr.get('details', []) if m.get('dependencies')
                             for dep in m.get('dependencies', [])})
        return dependencies

    def _resolve_dependency(self, ecosystem, dep):
        ret = {'ecosystem': ecosystem.name,
               'declaration': dep,
               'resolved_at': json_serial(datetime.datetime.now())}

        # first, if this is a GitHub dependency, return it right away (we don't resolve these yet)
        if ' ' in dep:
            # we have both package name and version (version can be an URL)
            name, spec = dep.split(' ', 1)
            if gh_dep.match(spec):
                ret['name'] = name
                ret['version'] = 'https://github.com/' + spec
            elif urllib.parse.urlparse(spec).scheme != '':
                ret['name'] = name
                ret['version'] = spec
        else:
            if gh_dep.match(dep):
                ret['name'] = 'https://github.com/' + dep
                ret['version'] = None
            elif urllib.parse.urlparse(dep).scheme != '':
                ret['name'] = dep
                ret['version'] = None
        if 'name' in ret:
            return ret

        # second, figure out what is the latest upstream version matching the spec and return it
        solver = get_ecosystem_solver(ecosystem)
        pkgspec = solver.solve([dep])

        if not pkgspec:
            raise TaskError("invalid dependency: {}".format(dep))

        package, version = pkgspec.popitem()
        if not version:
            raise TaskError("bad version resolved for {}".format(dep))

        ret['name'] = package
        ret['version'] = version
        return ret

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))

        result = {'summary': {'errors': [], 'dependency_counts': {}},
                  'status': 'success', 'details': {}}

        ecosystem = self.storage.get_ecosystem(arguments.get('ecosystem'))

        try:
            deps = self._collect_dependencies()
        except TaskError as e:
            self.log.error(str(e))
            result['summary']['errors'].append(str(e))
            result['status'] = 'error'
            return result

        resolved_deps = []
        for dep in deps:
            try:
                resolved = self._resolve_dependency(ecosystem, dep)
            except TaskError as e:
                self.log.error(str(e))
                result['summary']['errors'].append(str(e))
                result['status'] = 'error'
                continue  # nothing was resolved for this declaration
            self.log.info('resolved dependency %s as %s', dep, resolved)
            resolved_deps.append(resolved)

        # in future, we may want to provide also build/test dependencies, not just runtime
        result['details']['runtime'] = resolved_deps
        result['summary']['dependency_counts']['runtime'] = len(resolved_deps)

        return result
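# --- illustrative sketch, not part of the original module ---
# The shape _resolve_dependency above returns for a GitHub-style declaration.
# `gh_dep` is a module-level regex that is not shown in this excerpt; the pattern
# below is only a simplified stand-in for illustration.
import re

_gh_dep_sketch = re.compile(r'^[\w.-]+/[\w.-]+(#.*)?$')   # e.g. "owner/repo#ref"

_declaration = "left-pad owner/left-pad#1.1.3"
_name, _spec = _declaration.split(' ', 1)
if _gh_dep_sketch.match(_spec):
    _resolved = {'name': _name, 'version': 'https://github.com/' + _spec}
# -> {'name': 'left-pad', 'version': 'https://github.com/owner/left-pad#1.1.3'}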
class StackAnalysesById(ResourceWithSchema): schema_ref = SchemaRef('stack_analyses', '2-1-3') def get(self, external_request_id): submitted_at = "" manifest_appstackid_map = {} try: results = rdb.session.query(StackAnalysisRequest)\ .filter(StackAnalysisRequest.id == external_request_id) if results.count() <= 0: raise HTTPError( 404, "Invalid request ID '{id}' received".format( id=external_request_id)) row = results.first().to_dict() submitted_at = row["submitTime"] request_json = json.loads(row["requestJson"]) for manifest in request_json["manifest"]: if manifest.get('appstack_id', 0): manifest_appstackid_map[ manifest["filename"]] = manifest["appstack_id"] except SQLAlchemyError: raise HTTPError( 500, "Error fetching data for request ID '{id}'".format( id=external_request_id)) try: results = rdb.session.query(WorkerResult)\ .filter(WorkerResult.external_request_id == external_request_id, WorkerResult.worker == "dependency_aggregator") if results.count() <= 0: raise HTTPError( 202, "Analysis for request ID '{t}' is in progress".format( t=external_request_id)) except SQLAlchemyError: raise HTTPError( 500, "Worker result for request ID '{t}' doesn't exist yet".format( t=external_request_id)) try: if results.count() > 0: result = results.first().to_dict() audit = result["task_result"]["_audit"] manifest_response = [] # TODO: this will probably need some refactoring for manifest in result["task_result"]["result"]: for component in manifest["components"]: component["latest_version"] = safe_get_latest_version( component["ecosystem"], component["name"]) component["dependents_count"] = get_dependents_count( component["ecosystem"], component["name"], component["version"], rdb.session) component["relative_usage"] = usage_rank2str( get_component_percentile_rank( component["ecosystem"], component["name"], component["version"], rdb.session)) manifest_appstack_id = manifest_appstackid_map.get( manifest["manifest_name"], '') if manifest_appstack_id != '': url = current_app.config['BAYESIAN_ANALYTICS_URL'] endpoint = "{analytics_baseurl}/api/v1.0/recommendation/{appstack_id}".format( analytics_baseurl=url, appstack_id=manifest_appstack_id) resp = requests.get(endpoint) if resp.status_code == 200: recommendation = resp.json() # Adding URI of the stacks to the recommendation if recommendation.get("input_stack", {}).get( "appstack_id", "") != "": recommendation["input_stack"][ "uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}".format( analytics_baseurl=url, appstack_id=recommendation[ "input_stack"]["appstack_id"]) if recommendation.get("recommendations", {}).get( "similar_stacks", "") != "": for r in recommendation["recommendations"][ "similar_stacks"]: if r["stack_id"] != "": r["uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}".format( analytics_baseurl=url, appstack_id=r["stack_id"]) manifest["recommendation"] = recommendation else: current_app.logger.warn("{status}: {error}".format( status=resp.status_code, error=resp.content)) manifest_response.append(manifest) response = { "status": result["task_result"]["status"], "submitted_at": submitted_at, "started_at": audit["started_at"], "finished_at": audit["ended_at"], "request_id": result["external_request_id"], "result": manifest_response } return response except: raise HTTPError( 500, "Error creating response for request {t}".format( t=external_request_id))
class BlackDuckTask(BaseTask): _analysis_name = 'blackduck' description = 'Scan the package using Black Duck' _valid_ecosystems = ["npm", "maven", "pypi"] _allow_cli_scan = True schema_ref = SchemaRef(_analysis_name, '1-0-0') _BLACKDUCK_CLI_TIMEOUT = 600 def _format_hub_url(self): """ Format Hub connection string from supplied config :return: """ return "{scheme}://{host}:{port}/".format( scheme=config.blackduck_scheme, host=config.blackduck_host, port=config.blackduck_port) def _is_valid_ecosystem(self, ecosystem_id): """ Determine whether the given ecosystem is valid for Black Duck analysis :param ecosystem_id: int, the ID of the ecosystem :return: bool """ return ecosystem_id in self._valid_ecosystems def _find_blackduck_cli_root(self): """ Find the base directory where the BlackDuck CLI got extracted :return: str, path to the CLI root """ base = config.blackduck_path dirs = listdir(base) if not dirs: raise TaskError("Unable to find BlackDuck CLI directory") if len(dirs) > 1: raise TaskError("More than 1 BlackDuck CLI directory") return path.join(base, dirs.pop()) def _prepare_command(self, project, version, archive): """ Prepare the necessary CLI parameters :param project: str, name of the project :param version: str, version of the release :param archive: str, path to the archive with the sources :return: List[str], command list ready to be run """ binary = "{base}/{rel}".format(base=self._find_blackduck_cli_root(), rel="bin/scan.cli.sh") return [ binary, "--host", config.blackduck_host, "--port", str(int(config.blackduck_port)), "--scheme", config.blackduck_scheme, "--username", config.blackduck_username, "--project", project, "--release", version, archive ] def _get_release(self, hub, project, version): """ Get release ID for given project version :param hub: BlackDuckHub, hub object to use :param project: str, name of the project :param version: str, version :return: BlackDuckRelease object or None if not found """ # check that the specified project exists proj = hub.find_project(project) if not proj: return None # check that we have the proper version releases = hub.get_releases(proj) return releases.get(version, None) def _release_data(self, hub, project, version): """ Fetch release data for the given project and version :param hub: BlackDuckHub, hub object to use :param project: str, name of the project :param version: str, version :return: dict, BoM information about the release """ release = self._get_release(hub, project, version) if release is None: return None return hub.get_release_bom_json(release) def _get_hub(self): # connect to the Black Duck Hub hub_url = self._format_hub_url() self.log.debug("hub url: {url}".format(url=hub_url)) hub = BlackDuckHub(hub_url) hub.connect_session(config.blackduck_username, config.blackduck_password) return hub def _get_project_name(self, arguments): return "{ecosystem}-{package}".format(ecosystem=arguments['ecosystem'], package=arguments['name']) def execute(self, arguments): self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) result_data = {'status': 'unknown', 'summary': [], 'details': {}} if self._is_valid_ecosystem(arguments['ecosystem']): hub = self._get_hub() # BlackDuck project doesn't have a notion of ecosystem, so we need to # namespace the project names ourselves, so for package `crumb` in the NPM ecosystem # we'll end up with the name `npm-crumb` project = self._get_project_name(arguments) version = arguments['version'] # Check if the given 
project had already been scanned data = self._release_data(hub, project, version) if not data and self._allow_cli_scan: self.log.debug("No data available for project {p} {v}".format( p=project, v=version)) # No data available, issue a new scan and re-query release data source_tarball_path = ObjectCache.get_from_dict( arguments).get_source_tarball() command = self._prepare_command(project, version, source_tarball_path) self.log.debug( "Executing command, timeout={timeout}: {cmd}".format( timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command)) bd = TimedCommand(command) status, output, error = bd.run( timeout=self._BLACKDUCK_CLI_TIMEOUT, update_env={'BD_HUB_PASSWORD': config.blackduck_password}) self.log.debug("status = %s, error = %s", status, error) self.log.debug("output = %s", output) data = self._release_data(hub, project, version) self.log.debug("Release data for project {p} {v}: {d}".format( p=project, v=version, d=data)) result_data['details'] = data result_data['status'] = 'success' if data else 'error' else: result_data['status'] = 'error' return result_data
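# --- illustrative sketch, not part of the original module ---
# The ecosystem-namespaced project name used above and the rough shape of the
# scan.cli.sh invocation; all host, credential and path values are placeholders.
_arguments = {'ecosystem': 'npm', 'name': 'crumb', 'version': '1.0.0'}
_project = "{ecosystem}-{package}".format(ecosystem=_arguments['ecosystem'],
                                          package=_arguments['name'])
assert _project == "npm-crumb"

_command = [
    "/opt/blackduck/scan.cli-x.y.z/bin/scan.cli.sh",   # hypothetical CLI root
    "--host", "hub.example.com",
    "--port", "443",
    "--scheme", "https",
    "--username", "scanner",
    "--project", _project,
    "--release", _arguments['version'],
    "/tmp/crumb-1.0.0.tgz",                            # source archive to scan
]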
class StackAnalysesByGraph(ResourceWithSchema): schema_ref = SchemaRef('stack_analyses', '2-1-4') def post(self): session = FuturesSession() files = request.files.getlist('manifest[]') dt = datetime.datetime.now() origin = request.form.get('origin') # At least one manifest file should be present to analyse a stack if len(files) <= 0: return jsonify( error= "Error processing request. Please upload a valid manifest files." ) request_id = uuid.uuid4().hex manifests = [] stack_data = {} result = [] for f in files: filename = f.filename # check if manifest files with given name are supported manifest_descriptor = get_manifest_descriptor_by_filename(filename) if manifest_descriptor is None: return jsonify( error="Manifest file '{filename}' is not supported".format( filename=filename)) content = f.read().decode('utf-8') # In memory file to be passed as an API parameter to /appstack manifest_file = StringIO(content) # Check if the manifest is valid if not manifest_descriptor.validate(content): return jsonify( error= "Error processing request. Please upload a valid manifest file '{filename}'" .format(filename=filename)) # Limitation: Currently, appstack can support only package.json # Record the response details for this manifest file manifest = { 'filename': filename, 'content': content, 'ecosystem': manifest_descriptor.ecosystem } manifests.append(manifest) if 'package.json' in filename: substr = [] # Read package contents packagejson = json.loads(content) appstack_file = {'packagejson': manifest_file} url = current_app.config["BAYESIAN_ANALYTICS_URL"] analytics_url = "{analytics_baseurl}/api/v1.0/recommendation".format( analytics_baseurl=url) urls = [ analytics_url, current_app.config["GREMLIN_SERVER_URL_REST"] ] # call recommendation api asynchronously try: reco_req = session.post(urls[0], files=appstack_file, timeout=None) except Exception as exc: current_app.logger.warn("Analytics query: {}".format(exc)) # carry on with further processing for pkg, ver in packagejson['dependencies'].items(): substr.append("has('pecosystem','NPM').has('pname','" + pkg + "').has('version','" + ver + "')") substr1 = ",".join(substr) str_gremlin = "g.V().or(" + substr1 + ").valueMap(true);" payload = {'gremlin': str_gremlin} # call graph endpoint to fetch attributes asynchronously graph_req = session.post(urls[1], data=json.dumps(payload)) #wait for all request to process graph_resp = graph_req.result() stack_data = aggregate_stack_data(graph_resp.json(), filename, "npm") #Hardcoded to NPM #Get Recommendation API result reco_resp = reco_req.result() reco_json = reco_resp.json() stack_data['recommendation'] = reco_json result.append(stack_data) # Store the Request in DB try: req = StackAnalysisRequest(id=request_id, submitTime=str(dt), requestJson={'manifest': manifests}, origin=origin, result={'result': result}) rdb.session.add(req) rdb.session.commit() except SQLAlchemyError: current_app.logger.exception( 'Failed to create new analysis request') raise HTTPError( 500, "Error inserting log for request {t}".format(t=request_id)) response = { 'status': 'success', 'request_id': request_id, 'result': result } return (response)
class MercatorTask(BaseTask): _analysis_name = 'metadata' _dependency_tree_lock = '_dependency_tree_lock' description = 'Collects `Release` specific information from Mercator' schema_ref = SchemaRef(_analysis_name, '3-1-1') _data_normalizer = DataNormalizer() def _parse_requires_txt(self, path): requires = [] try: with open(path, 'r') as f: for l in f.readlines(): l = l.strip() if l.startswith('['): # the first named ini-like [section] ends the runtime requirements break elif l: requires.append(l) except Exception as e: self.log.warning('Failed to process "{p}": {e}'.format(p=path, e=str(e))) return requires def _merge_python_items(self, topdir, data): metadata_json = None pkg_info = None requirements_txt = None def get_depth(path): return path.rstrip('/').count('/') def is_deeper(item1, item2): """ Returns True if item1 is deeper in directory hierarchy than item2 """ if item1 is None: return True return get_depth(item1['path']) > get_depth(item2['path']) # find outermost PKG_INFO/metadata.json/requirements.txt - there can be # testing ones etc. for item in data['items']: if item['ecosystem'] == 'Python-Dist' and item['path'].endswith( '.json'): if is_deeper(metadata_json, item): metadata_json = item elif item['ecosystem'] == 'Python-Dist': # PKG-INFO # we prefer PKG_INFO files from .egg-info directories, # since these have the very useful `requires.txt` next to them if pkg_info is None: pkg_info = item else: pkg_info_in_egg = pkg_info['path'].endswith( '.egg-info/PKG-INFO') item_in_egg = item['path'].endswith('.egg-info/PKG-INFO') # rather than one insane condition, we use several less complex ones if pkg_info_in_egg and item_in_egg and is_deeper( pkg_info, item): # if both are in .egg-info, but current pkg_info is deeper pkg_info = item elif item_in_egg and not pkg_info_in_egg: # if item is in .egg-info and current pkg_info is not pkg_info = item elif not (item_in_egg or pkg_info_in_egg) and is_deeper( pkg_info, item): # if none of them are in .egg-info, but current pkg_info is deeer pkg_info = item elif item['ecosystem'] == 'Python-RequirementsTXT' and is_deeper( pkg_info, item): requirements_txt = item if pkg_info: self.log.info('Found PKG-INFO at {p}'.format(p=pkg_info['path'])) if metadata_json: self.log.info( 'Found metadata.json at {p}'.format(p=metadata_json['path'])) if requirements_txt: self.log.info('Found requirements.txt at {p}'.format( p=requirements_txt['path'])) ret = None # figure out if this was packaged as wheel => metadata.json would # have depth of topdir + 2 if metadata_json and get_depth( metadata_json['path']) == get_depth(topdir) + 2: self.log.info('Seems like this is wheel, using metadata.json ...') ret = metadata_json # figure out if this was packaged as sdist => PKG_INFO would # have depth of topdir + 2 or topdir + 3 # (and perhaps there are requires.txt or requirements.txt that we could use) # NOTE: for now, we always treat requirements.txt as requires_dist elif pkg_info and get_depth(pkg_info['path']) <= get_depth(topdir) + 3: self.log.info( 'Seems like this is sdist or egg, using PKG-INFO ...') requires_dist = [] # in well-made sdists, there are requires.txt next to PKG_INFO # (this is something different that requirements.txt) # TODO: maybe mercator could do this in future requires = os.path.join(os.path.dirname(pkg_info['path']), 'requires.txt') if os.path.exists(requires): self.log.info( 'Found a "requires.txt" file next to PKG-INFO, going to use it ...' 
) requires_dist = self._parse_requires_txt(requires) elif requirements_txt: self.log.info( 'No "requires.txt" file found next to PKG-INFO, but requirements.txt' ' found, going to use it') # if requires.txt can't be found, try requirements.txt requires_dist = requirements_txt['result']['dependencies'] else: self.log.info( 'Found no usable source of requirements for PKG-INFO :(') pkg_info['result']['requires_dist'] = requires_dist ret = pkg_info elif requirements_txt: self.log.info('Only requirements.txt found, going to use it ...') requirements_txt['result']['requires_dist'] = \ requirements_txt['result'].pop('dependencies') ret = requirements_txt return ret def execute(self, arguments): "Execute mercator and convert it's output to JSON object" self._strict_assert(arguments.get('ecosystem')) if 'url' in arguments: # run mercator on a git repo return self.run_mercator_on_git_repo(arguments) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) # TODO: make this even uglier; looks like we didn't get the abstraction quite right # when we were adding support for Java/Maven. if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by( EcosystemBackend.maven): # cache_path now points directly to the pom cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml() else: cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() return self.run_mercator(arguments, cache_path) def run_mercator_on_git_repo(self, arguments): self._strict_assert(arguments.get('url')) workdir = None try: workdir = tempfile.mkdtemp() repo_url = arguments.get('url') repo = Git.clone(repo_url, path=workdir, depth=str(1)) metadata = self.run_mercator(arguments, workdir, keep_path=True, outermost_only=False, timeout=900) if metadata.get('status', None) != 'success': self.log.error('Mercator failed on %s', repo_url) return None # add some auxiliary information so we can later find the manifest file head = repo.rev_parse(['HEAD'])[0] for detail in metadata['details']: path = detail['path'][len(workdir):] # path should look like this: # <git-sha1>/path/to/manifest.file detail['path'] = head + path return metadata finally: if workdir: shutil.rmtree(workdir) def run_mercator(self, arguments, cache_path, keep_path=False, outermost_only=True, timeout=300): result_data = {'status': 'unknown', 'summary': [], 'details': []} mercator_target = arguments.get('cache_sources_path', cache_path) tc = TimedCommand(['mercator', mercator_target]) status, data, err = tc.run( timeout=timeout, is_json=True, update_env={'MERCATOR_JAVA_RESOLVE_POMS': 'true'}) if status != 0: self.log.error(err) result_data['status'] = 'error' return result_data ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem']) if ecosystem_object.is_backed_by(EcosystemBackend.pypi): # TODO: attempt static setup.py parsing with mercator items = [self._merge_python_items(mercator_target, data)] else: if outermost_only: # process only root level manifests (or the ones closest to the root level) items = self._data_normalizer.get_outermost_items( data.get('items') or []) else: items = data.get('items') or [] self.log.debug('mercator found %i projects, outermost %i', len(data), len(items)) if ecosystem_object.is_backed_by(EcosystemBackend.maven): # for maven we download both Jar and POM, we consider POM to be *the* # source of information and don't want to duplicate info by including # data from pom included in artifact (assuming it's included) items = [ data for data in items if 
data['ecosystem'].lower() == 'java-pom' ] result_data['details'] = [ self._data_normalizer.handle_data(data, keep_path=keep_path) for data in items ] result_data['status'] = 'success' return result_data
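# --- illustrative sketch, not part of the original module ---
# What _parse_requires_txt above extracts from a typical egg-info requires.txt:
# everything before the first ini-like "[section]" counts as a runtime requirement.
import os
import tempfile

_requires_txt = "six>=1.9\nrequests\n\n[docs]\nsphinx\n"
_path = os.path.join(tempfile.mkdtemp(), "requires.txt")
with open(_path, "w") as _f:
    _f.write(_requires_txt)

_requires = []
with open(_path, "r") as _f:
    for _l in _f.readlines():
        _l = _l.strip()
        if _l.startswith('['):
            break
        elif _l:
            _requires.append(_l)
assert _requires == ["six>=1.9", "requests"]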
class LicenseCheckTask(BaseTask): _analysis_name = 'source_licenses' description = "Check licences of all files of a package" schema_ref = SchemaRef(_analysis_name, '3-0-0') @staticmethod def process_output(data): # not interested in these keys_to_remove = [ 'start_line', 'end_line', 'matched_rule', 'score', 'key' ] # 'files' is a list of file paths along with info about detected licenses. # If there's the same license text in most files, then almost the same license info # accompanies each file path. # Therefore transform it into dict of licenses (keys) along with info about the license plus # paths of files where the license has been detected. licenses = {} for file in data.pop('files'): for _license in file['licenses']: # short_name becomes key short_name = _license.pop('short_name') if short_name not in licenses.keys(): for key in keys_to_remove: del _license[key] _license['paths'] = {file['path']} licenses[short_name] = _license else: licenses[short_name]['paths'].add(file['path']) for l in licenses.values(): l['paths'] = list(l['paths']) # set -> list data['licenses'] = licenses del data['scancode_options'] return data def execute(self, arguments): self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) try: cache_path = ObjectCache.get_from_dict(arguments).get_sources() except Exception: eco = arguments.get('ecosystem') pkg = arguments.get('name') ver = arguments.get('version') if arguments['ecosystem'] != 'maven': self.log.error( 'Could not get sources for package {e}/{p}/{v}'.format( e=eco, p=pkg, v=ver)) raise self.log.info('Could not get sources for maven package {p}/{v},' 'will try to run on binary jar'.format(p=pkg, v=ver)) cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() result_data = {'status': 'unknown', 'summary': {}, 'details': {}} try: command = [ os.path.join( os.getenv('SCANCODE_PATH', '/opt/scancode-toolkit/'), 'scancode'), # Scan for licenses '--license', # Do not return license matches with scores lower than this score '--license-score', SCANCODE_LICENSE_SCORE, # Files without findings are omitted '--only-findings', # Use n parallel processes '--processes', SCANCODE_PROCESSES, # Do not print summary or progress messages '--quiet', # Strip the root directory segment of all paths '--strip-root', # Stop scanning a file if scanning takes longer than a timeout in seconds '--timeout', SCANCODE_TIMEOUT, cache_path ] output = TimedCommand.get_command_output(command, graceful=False, is_json=True, timeout=600) details = self.process_output(output) result_data['details'] = details result_data['status'] = 'success' result_data['summary'] = { 'sure_licenses': list(details['licenses'].keys()) } except: self.log.exception("License scan failed") result_data['status'] = 'error' return result_data
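# --- illustrative sketch, not part of the original module ---
# A trimmed-down scancode-like result and roughly what process_output above turns
# it into: per-file license hits folded into a license -> paths mapping, with the
# noisy per-match fields dropped. Field values here are made up.
_scancode_like = {
    "scancode_options": {"--license": True},
    "files": [
        {"path": "setup.py",
         "licenses": [{"short_name": "MIT", "category": "Permissive",
                       "start_line": 1, "end_line": 1, "matched_rule": {},
                       "score": 100.0, "key": "mit"}]},
        {"path": "README.rst",
         "licenses": [{"short_name": "MIT", "category": "Permissive",
                       "start_line": 3, "end_line": 3, "matched_rule": {},
                       "score": 99.0, "key": "mit"}]},
    ],
}
# LicenseCheckTask.process_output(_scancode_like) would yield approximately:
#   {'licenses': {'MIT': {'category': 'Permissive',
#                         'paths': ['setup.py', 'README.rst']}}}   # paths in no fixed order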
class CVEcheckerTask(BaseTask): name = 'cucoslib.workers.CVEchecker' _analysis_name = 'security_issues' description = "Security issues scanner. Uses Snyk vulndb for npm and OWASP Dep.Check for maven" schema_ref = SchemaRef(_analysis_name, '3-0-0') @staticmethod def _filter_vulndb_fields(entry): result = { 'cvss': { 'score': 0, 'vector': "" } } for field in ['description', 'severity']: result[field] = entry.get(field) id = entry.get('identifiers', {}).get('CVE') or entry.get('identifiers', {}).get('CWE') result['id'] = id[0] if id else '' # prefer CVSSv2, because CVSSv3 seems to contain only vector string, not score itself if entry.get('CVSSv2'): # "CVSSv2": "7.5 (HIGH) (AV:N/AC:L/Au:N/C:P/I:P/A:P)" try: score, severity, vector = entry.get('CVSSv2').split(' ') score = float(score) vector = vector.strip('()') except ValueError: pass else: result['cvss']['score'] = score result['cvss']['vector'] = vector elif entry.get('CVSSv3'): # "CVSSv3": "CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H" <- there's no score ?? result['cvss']['score'] = 0 # ? result['cvss']['vector'] = entry.get('CVSSv3') # Snyk vulndb doesn't contain references result['references'] = [] return result def _npm_scan(self, arguments): """ Query Snyk vulndb stored on S3 """ s3 = StoragePool.get_connected_storage('S3Snyk') try: self.log.debug('Retrieving Snyk vulndb from S3') vulndb = s3.retrieve_vulndb() except: self.log.error('Failed to obtain Snyk vulndb database') return {'summary': ['Failed to obtain Snyk vulndb database'], 'status': 'error', 'details': []} entries = [] solver = get_ecosystem_solver(self.storage.get_ecosystem('npm')) for entry in vulndb.get('npm', {}).get(arguments['name'], []): vulnerable_versions = entry['semver']['vulnerable'] affected_versions = solver.solve(["{} {}".format(arguments['name'], vulnerable_versions)], all_versions=True) if arguments['version'] in affected_versions.get(arguments['name'], []): entries.append(self._filter_vulndb_fields(entry)) return {'summary': [e['id'] for e in entries if e], 'status': 'success', 'details': entries} def _run_owasp_dep_check(self, scan_path, experimental=False): def _clean_dep_check_tmp(): for dcdir in glob.glob(os.path.join(gettempdir(), 'dctemp*')): rmtree(dcdir) s3 = StoragePool.get_connected_storage('S3OWASPDepCheck') depcheck = os.path.join(os.environ['OWASP_DEP_CHECK_PATH'], 'bin', 'dependency-check.sh') with tempdir() as temp_data_dir: retrieved = s3.retrieve_depcheck_db_if_exists(temp_data_dir) if not retrieved: self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...') command = [depcheck, '--updateonly', '--data', temp_data_dir] # give DependencyCheck 30 minutes to download the DB TimedCommand.get_command_output(command, graceful=False, timeout=1800) report_path = os.path.join(temp_data_dir, 'report.xml') command = [depcheck, '--noupdate', '--format', 'XML', '--project', 'test', '--data', temp_data_dir, '--scan', scan_path, '--out', report_path] if experimental: command.extend(['--enableExperimental']) output = [] try: self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities' % scan_path) output = TimedCommand.get_command_output(command, graceful=False, timeout=600) # 10 minutes with open(report_path) as r: report_dict = anymarkup.parse(r.read()) except (TaskError, FileNotFoundError) as e: _clean_dep_check_tmp() for line in output: self.log.warning(line) self.log.exception(str(e)) return {'summary': ['OWASP Dependency-Check scan failed'], 'status': 'error', 'details': []} # If the CVEDBSyncTask has never been 
run before, we just had to create the DB ourselves # Make the life easier for other workers and store it to S3 s3.store_depcheck_db_if_not_exists(temp_data_dir) _clean_dep_check_tmp() results = [] dependencies = report_dict.get('analysis', {}).get('dependencies', {}).get('dependency', []) if not isinstance(dependencies, list): dependencies = [dependencies] for dependency in dependencies: vulnerabilities = dependency.get('vulnerabilities', {}).get('vulnerability', []) if not isinstance(vulnerabilities, list): vulnerabilities = [vulnerabilities] for vulnerability in vulnerabilities: av = vulnerability.get('cvssAccessVector') av = av[0] if av else '?' ac = vulnerability.get('cvssAccessComplexity') ac = ac[0] if ac else '?' au = vulnerability.get('cvssAuthenticationr') au = au[0] if au else '?' c = vulnerability.get('cvssConfidentialImpact') c = c[0] if c else '?' i = vulnerability.get('cvssIntegrityImpact') i = i[0] if i else '?' a = vulnerability.get('cvssAvailabilityImpact') a = a[0] if a else '?' vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{I}/A:{A}".\ format(AV=av, AC=ac, Au=au, C=c, I=i, A=a) result = { 'cvss': { 'score': vulnerability.get('cvssScore'), 'vector': vector } } references = vulnerability.get('references', {}).get('reference', []) if not isinstance(references, list): references = [references] result['references'] = [r.get('url') for r in references] for field in ['severity', 'description']: result[field] = vulnerability.get(field) result['id'] = vulnerability.get('name') results.append(result) return {'summary': [r['id'] for r in results], 'status': 'success', 'details': results} def _maven_scan(self, arguments): """ Run OWASP dependency-check """ jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball() return self._run_owasp_dep_check(jar_path, experimental=False) def _python_scan(self, arguments): """ Run OWASP dependency-check experimental analyzer for Python artifacts https://jeremylong.github.io/DependencyCheck/analyzers/python-analyzer.html """ tarball = ObjectCache.get_from_dict(arguments).get_source_tarball() if tarball.endswith('zip') or tarball.endswith('.whl'): # tar.gz seems to be not supported scan_path = tarball else: extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball() # depcheck needs to be pointed to a specific file, we can't just scan whole directory egg_info, pkg_info, metadata = None, None, None for root, dirs, files in os.walk(extracted_tarball): if root.endswith('.egg-info'): egg_info = root if 'PKG-INFO' in files: pkg_info = os.path.join(root, 'PKG-INFO') if 'METADATA' in files: metadata = os.path.join(root, 'METADATA') scan_path = egg_info or pkg_info or metadata if not scan_path: return {'summary': ['File types not supported by OWASP dependency-check'], 'status': 'error', 'details': []} return self._run_owasp_dep_check(scan_path, experimental=True) def execute(self, arguments): self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) if arguments['ecosystem'] == 'maven': return self._maven_scan(arguments) elif arguments['ecosystem'] == 'npm': return self._npm_scan(arguments) elif arguments['ecosystem'] == 'pypi': return self._python_scan(arguments) else: return {'summary': ['Unsupported ecosystem'], 'status': 'error', 'details': []}
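# --- illustrative sketch, not part of the original module ---
# How the CVSSv2 string of a Snyk vulndb entry is split into a numeric score and a
# vector inside _filter_vulndb_fields above (the sample value mirrors the comment).
_cvss_v2 = "7.5 (HIGH) (AV:N/AC:L/Au:N/C:P/I:P/A:P)"
_score, _severity, _vector = _cvss_v2.split(' ')
_score = float(_score)
_vector = _vector.strip('()')
assert _score == 7.5 and _vector == "AV:N/AC:L/Au:N/C:P/I:P/A:P"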
class BlackDuckHub(object):
    """ Hub provides access around Black Duck Hub APIs """

    # The authentication token is returned in a cookie with this name
    COOKIE_NAME = 'JSESSIONID'

    def __init__(self, url):
        self._url = url
        self._session = None

    @property
    def url(self):
        """ URL of the Hub with trailing slash, example `https://hub.blackducksoftware.com/` """
        return self._url

    def _api(self, param):
        """ Format a new API call URL

        :param param: str, parameters to append to base url
        :return: str, formatted API call
        """
        return "{}{}".format(self.url, param)

    def _api_get(self, param):
        """ Perform a GET request against the API using local `_session`

        :param param: str, full request URL
        :return: requests.Response, a response object
        """
        return get(self._api(param),
                   cookies={self.COOKIE_NAME: self._session.api_token.token},
                   verify=False)

    def connect_session(self, username, password):
        """ Establish a new session with the Hub using the provided credentials

        :param username: str
        :param password: str
        :return: BlackDuckSession, a session object
        :raises: BlackDuckSessionException
        """
        req = post(self._api("j_spring_security_check"),
                   data={'j_username': username, 'j_password': password},
                   verify=False)
        if req.status_code != 204:
            raise BlackDuckSessionException("Black Duck authentication error")
        token = req.cookies.get(self.COOKIE_NAME)
        self._session = BlackDuckSession(BlackDuckApiToken(token))
        return self._session

    @needs_session
    def find_project(self, name):
        """ Find a project by name

        :param name: str, name of the project
        :return: BlackDuckProject, found project or `None`
        :raises: BlackDuckSessionException
        """
        preq = self._api_get('api/v1/projects?name=' + name)
        if preq.status_code == 200:
            pdata = preq.json()
            return BlackDuckProject(pdata)
        else:
            return None

    @needs_session
    @schema.result(SchemaRef("blackduck-project-list", "1-0-0"))
    def _list_projects_json(self):
        req = self._api_get('api/projects/')
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to list projects')

    def list_projects(self):
        """ List all projects valid for the current session

        :return: List[BlackDuckProject], list of projects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        names = [project['name'] for project in self._list_projects_json()]
        projects = []
        for name in names:
            projects.append(self.find_project(name))
        return projects

    @needs_session
    def get_releases(self, project_id):
        """ Get all releases of the given project

        :param project_id: BlackDuckProject or str, project reference or ID
        :return: Dict[str, BlackDuckRelease], a map of version strings to release objects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        if isinstance(project_id, BlackDuckProject):
            project_id = project_id.id
        req = self._api_get('api/v1/projects/{id}/version-summaries'.format(id=project_id))
        if req.status_code == 200:
            data = req.json()
            return {obj['version']: BlackDuckRelease(obj, project_id) for obj in data['items']}
        else:
            raise BlackDuckException('Unable to fetch releases for ' + project_id)

    @needs_session
    @schema.result(SchemaRef("blackduck-vulnerable-bom", "1-0-0"))
    def get_release_bom_json(self, release_id):
        """ Get the Bill of Materials for a specific release

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, the BOM JSON as a dictionary
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id
        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id
        req = self._api_get('api/projects/{p}/versions/{i}/vulnerable-bom-components'.format(
            i=release_id, p=release.project))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch release information ' +
                                     release_id + " " + release.project)

    @needs_session
    def get_release_code_locations(self, release_id):
        """ Get code locations for the given release

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, response JSON containing the retrieved code locations list
        :raises: BlackDuckException, BlackDuckSessionException
        """
        release = release_id
        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id
        req = self._api_get('api/projects/{p}/versions/{i}/codelocations'.format(
            i=release_id, p=release.project))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException(
                'Unable to fetch code locations for {relid} {relproj}'.format(
                    relid=release_id, relproj=release.project))

    @needs_session
    def get_code_location_scan_summary(self, location_id):
        """ Get scan summary for the given code location ID

        :param location_id: str
        :return: dict, the code location scan summary
        :raises: BlackDuckException, BlackDuckSessionException
        """
        req = self._api_get('api/codelocations/{locid}/scan-summaries'.format(locid=location_id))
        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException(
                'Unable to fetch scan summary for code location {locid}'.format(locid=location_id))
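The typical call sequence against the Hub is: open a session, look up a project, list its releases, then fetch the vulnerable BOM. A minimal usage sketch follows; the URL and credentials are placeholders, and it assumes `BlackDuckHub` and its companion classes are importable from the surrounding module.

# Hypothetical usage sketch of BlackDuckHub; URL and credentials are placeholders,
# and the calls simply chain the methods defined above.
hub = BlackDuckHub('https://hub.example.com/')   # trailing slash expected by _api()
hub.connect_session('scanner-bot', 'not-a-real-password')

project = hub.find_project('my-project')
if project is not None:
    releases = hub.get_releases(project)          # {'1.0.0': BlackDuckRelease, ...}
    release = releases.get('1.0.0')
    if release is not None:
        bom = hub.get_release_bom_json(release)   # vulnerable BOM components as a dict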
class GithubTask(BaseTask):
    description = 'Collects statistics using Github API'
    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '1-0-4')

    # used for testing
    _repo_name = None
    _repo_url = None

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        instance = super().create_test_instance()
        # set for testing as we are not querying DB for mercator results
        instance._repo_name = repo_name
        instance._repo_url = repo_url
        return instance

    @staticmethod
    def _get_last_years_commits(repo):
        activity = repo.get_stats_commit_activity()
        if not activity:
            return []
        return [x.total for x in activity]

    @staticmethod
    def _rate_limit_exceeded(gh):
        return gh.rate_limiting[0] == 0

    def _issues_or_prs_count(self, gh, query):
        # Check the rate-limit for Github API first. Apply retry if needed
        if self._rate_limit_exceeded(gh):
            retrytime = gh.rate_limiting_resettime - int(datetime.datetime.now().timestamp()) + 10
            self.log.info("Github rate-limit exceeded, retrying in %d seconds", retrytime)
            self.retry(countdown=retrytime)
        items = gh.search_issues(query=query)
        return getattr(items, 'totalCount', -1)

    @staticmethod
    def _get_repo_stats(repo):
        # len(list()) is a workaround for totalCount being None
        # https://github.com/PyGithub/PyGithub/issues/415
        d = {'contributors_count': len(list(repo.get_contributors()))}
        for prop in REPO_PROPS:
            d[prop] = repo.raw_data.get(prop, -1)
        return d

    def _query_repo_name(self):
        """Retrieve GitHub repo from a preceding Mercator scan"""
        # Fridolin: most of the checks can be removed since Dispatcher schedules this task iff we have github.com
        wr = self.parent_task_result('metadata')
        if wr is None:
            self.log.error("No repo_name provided, and no Mercator scan result")
            return None
        code_repos = [m.get("code_repository")
                      for m in wr.get('details', []) if m.get("code_repository")]
        repo_details = code_repos[0] if code_repos else None
        if repo_details is None:
            self.log.debug("No repo_name provided, and no repo metadata found")
            return None
        repo_name = repo_details.get("url")
        if repo_name is None:
            self.log.debug('No repo name extracted, nothing to do')
            return None
        parsed = parse_gh_repo(repo_name)
        if not parsed:
            self.log.debug('Could not parse Github repo URL %s', repo_name)
        else:
            self._repo_url = 'https://github.com/' + parsed
        return parsed

    def _get_topics(self):
        if not self._repo_url:
            return []
        pop = requests.get(self._repo_url)
        poppage = bs4.BeautifulSoup(pop.text, 'html.parser')
        topics = []
        for link in poppage.find_all("a", class_="topic-tag"):
            topics.append(link.text.strip())
        return topics

    def execute(self, arguments):
        result_data = {'status': 'unknown', 'summary': [], 'details': {}}
        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._query_repo_name()
            if self._repo_name is None:
                # Not a GitHub hosted project
                return result_data

        token = self.configuration.github_token
        if not token:
            if self._rate_limit_exceeded(github.Github()):
                self.log.error("No Github API token provided (GITHUB_TOKEN env variable), "
                               "and rate limit exceeded! "
                               "Ending now to not wait endlessly")
                result_data['status'] = 'error'
                return result_data
            else:
                self.log.warning("No Github API token provided (GITHUB_TOKEN env variable), "
                                 "requests will be unauthenticated, "
                                 "i.e. limited to 60 per hour")
        else:
            # there might be more comma-separated tokens, randomly select one
            token = random.choice(token.split(',')).strip()

        gh = github.Github(login_or_token=token)
        try:
            repo = gh.get_repo(full_name_or_id=self._repo_name, lazy=False)
        except github.GithubException as e:
            self.log.exception(str(e))
            result_data['status'] = 'error'
            return result_data
        result_data['status'] = 'success'

        # Get count of issues and PRs for the last year and the last month
        now = datetime.datetime.utcnow()
        month = (now - datetime.timedelta(days=MONTH_BACK)).strftime('%Y-%m-%dT%H:%M:%SZ')
        year = (now - datetime.timedelta(days=YEAR_BACK)).strftime('%Y-%m-%dT%H:%M:%SZ')
        now = now.strftime('%Y-%m-%dT%H:%M:%SZ')

        issues_closed_year = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' closed:' + year + '..' + now + ' type:issue')
        issues_closed_month = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' closed:' + month + '..' + now + ' type:issue')
        prs_closed_year = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' closed:' + year + '..' + now + ' type:pr')
        prs_closed_month = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' closed:' + month + '..' + now + ' type:pr')
        issues_opened_year = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' created:' + year + '..' + now + ' type:issue')
        issues_opened_month = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' created:' + month + '..' + now + ' type:issue')
        prs_opened_year = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' created:' + year + '..' + now + ' type:pr')
        prs_opened_month = self._issues_or_prs_count(
            gh, query='repo:' + repo.full_name + ' created:' + month + '..' + now + ' type:pr')

        issues = {
            'updated_issues': {
                'year': {'opened': issues_opened_year, 'closed': issues_closed_year},
                'month': {'opened': issues_opened_month, 'closed': issues_closed_month}
            },
            'updated_pull_requests': {
                'year': {'opened': prs_opened_year, 'closed': prs_closed_year},
                'month': {'opened': prs_opened_month, 'closed': prs_closed_month}
            }
        }

        # Get repo statistics
        notoriety = self._get_repo_stats(repo)
        if notoriety:
            issues.update(notoriety)
        issues['topics'] = self._get_topics()

        # Get commit statistics
        last_year_commits = self._get_last_years_commits(repo)
        commits = {'last_year_commits': {'sum': sum(last_year_commits),
                                         'weekly': last_year_commits}}
        issues.update(commits)
        result_data['details'] = issues
        return result_data
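The eight counters above are all produced by the same GitHub search-API query pattern: a `repo:` qualifier, a `closed:` or `created:` date window, and a `type:` qualifier. A small standalone sketch of the query strings `execute()` hands to `_issues_or_prs_count` is shown below; the repository name and the values of `MONTH_BACK`/`YEAR_BACK` are assumptions for illustration only.

# Illustrative only: the search strings execute() builds for _issues_or_prs_count,
# shown with a placeholder repository and assumed window sizes.
import datetime

MONTH_BACK, YEAR_BACK = 30, 365      # assumed values of the module-level constants
full_name = 'octocat/Hello-World'    # placeholder repository

now = datetime.datetime.utcnow()
month = (now - datetime.timedelta(days=MONTH_BACK)).strftime('%Y-%m-%dT%H:%M:%SZ')
now_s = now.strftime('%Y-%m-%dT%H:%M:%SZ')

query = 'repo:' + full_name + ' closed:' + month + '..' + now_s + ' type:issue'
print(query)
# e.g. repo:octocat/Hello-World closed:2017-05-01T00:00:00Z..2017-05-31T00:00:00Z type:issue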
def test_next_revision(self):
    schema_ref = SchemaRef("example", "1-0-0")
    assert schema_ref.next_revision() == SchemaRef("example", "1-1-0")
def test_next_model(self):
    schema_ref = SchemaRef("example", "1-0-0")
    assert schema_ref.next_model() == SchemaRef("example", "2-0-0")
class CodeMetricsTask(BaseTask):
    _analysis_name = 'code_metrics'
    description = 'Compute various code metrics for a project'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    _CLI_TIMEOUT = 300

    def _run_analyzer(self, command, json_output=True):
        """Run command (analyzer); if a JSON output is expected, parse it

        :param command: command to be run (command with argument vector as array)
        :param json_output: True if output should be parsed as JSON
        :return: status, output, error triplet
        """
        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._CLI_TIMEOUT, cmd=command))
        cmd = TimedCommand(command)
        status, output, error = cmd.run(timeout=self._CLI_TIMEOUT)
        self.log.debug("status: %d, output: %s, error: %s", status, output, error)

        if status != 0:
            self.log.warning("Executing command failed, return value: %d, stderr: '%s' ",
                             status, error)

        # Some tools such as complexity-report write zero bytes to output (they are propagated from sources like
        # for npm/glob/7.0.3). This caused failures when pushing results to Postgres as Postgres cannot store
        # null bytes in results. Let's be safe here.
        output = list(line.replace('\\u0000', '\\\\0') for line in output)

        if json_output:
            if output:
                output = "".join(output)
                output = json.loads(output)
            else:
                output = {}

        return status, output, error

    def _get_generic_result(self, source_path):
        """Get the core result of CodeMetricsTask, based on the cloc tool; this output is later
        enriched with the output of tools chosen based on the languages that cloc found

        :param source_path: path to sources where the analyzed artefact resides
        :return: tuple of generic information and a language-specific dict
        """
        command = ['cloc', '--json', source_path]
        status, output, error = self._run_analyzer(command)
        if status != 0:
            # Let the whole task fail
            raise RuntimeError("Running cloc command failed: '%s'" % error)

        # cloc places the generic summary here; we keep it at the top level, so remove the misleading key
        header = {'total_files': output['header'].pop('n_files'),
                  'total_lines': output['header'].pop('n_lines')}
        output.pop('header')

        if 'SUM' in output:
            header['blank_lines'] = output['SUM']['blank']
            header['comment_lines'] = output['SUM']['comment']
            header['code_lines'] = output['SUM']['code']
            output.pop('SUM', None)

        # rename to be more precise with naming
        wanted_keys = (('blank', 'blank_lines'),
                       ('code', 'code_lines'),
                       ('comment', 'comment_lines'),
                       ('nFiles', 'files_count'))
        for key in output.keys():
            # filter only language-specific results, leave statistics untouched
            if isinstance(output[key], dict):
                output[key] = DataNormalizer.transform_keys(output[key], wanted_keys)

        return header, output

    @staticmethod
    def _normalize_complexity_report_output(output, source_path):
        """Normalize complexity-report output

        For the meaning of the metrics see
        https://github.com/escomplex/escomplex/blob/master/README.md#metrics

        :param output: output dict to be normalized
        :param source_path: path to sources that was used
        :return: normalized output
        """
        wanted_keys = (('maintainability', 'project_maintainability'),
                       ('changeCost', 'cost_change'),
                       ('cyclomatic', 'average_cyclomatic_complexity'),
                       ('effort', 'average_halstead_effort'),
                       ('firstOrderDensity', 'first_order_density'),
                       ('loc', 'average_function_lines_of_code'),
                       ('params', 'average_function_parameters_count'),
                       ('reports', 'modules'))
        output = DataNormalizer.transform_keys(output, wanted_keys)

        wanted_module_keys = (('maintainability', 'module_maintainability'),
                              ('dependencies',),
                              ('loc', 'average_function_lines_of_code'),
                              ('path',),
                              ('params', 'average_function_parameters_count'),
                              ('functions',))
        for idx, module in enumerate(output.get('modules', [])):
            output['modules'][idx] = DataNormalizer.transform_keys(module, wanted_module_keys)

            source_path_len = len(source_path) + 1
            if 'path' in module:
                output['modules'][idx]['path'] = module['path'][source_path_len:]

            for fun_idx, function in enumerate(module.get('functions', [])):
                if 'cyclomaticDensity' in function:
                    function['cyclomatic_density'] = function.pop('cyclomaticDensity')

        return output

    @staticmethod
    def _normalize_javancss_output(output):
        """Parse and normalize JavaNCSS XML output

        :param output: output dict to be normalized
        :return: normalized output
        """
        output = output.get('javancss', {})
        result = {'functions': {}, 'objects': {}, 'packages': {}}

        # The output of JavaNCSS is an XML, which is parsed using anymarkup. This can introduce some pitfalls
        # if exactly one item of a type is found. E.g.:
        #
        #  <functions>
        #    <function>...<function/>
        #  <functions>
        #
        # is parsed as object 'functions' containing *one object* 'function', whereas:
        #
        #  <functions>
        #    <function>...<function/>
        #    <function>...<function/>
        #  <functions>
        #
        # is parsed as object 'functions' containing a *list of objects* 'function'. Thus the isinstance(.., list)
        # checks.

        # Parse functions section
        if 'functions' in output:
            functions = output['functions']
            wanted_function_keys = (('ccn', 'cyclomatic_complexity'),
                                    ('javadocs',),
                                    ('name',))
            result['functions']['function'] = []
            if 'function' in functions:
                if not isinstance(functions['function'], list):
                    functions['function'] = [functions['function']]
                for function in functions['function']:
                    result['functions']['function'].append(
                        DataNormalizer.transform_keys(function, wanted_function_keys))

            function_averages = functions.get('function_averages', {})
            result['functions']['average_cyclomatic_complexity'] = function_averages.get('ccn')
            result['functions']['average_javadocs'] = function_averages.get('javadocs')

        # Parse objects section
        if 'objects' in output:
            objects = output['objects']
            wanted_objects_keys = (('classes',),
                                   ('functions',),
                                   ('name',),
                                   ('javadocs',))
            result['objects']['object'] = []
            if 'object' in objects:
                if not isinstance(objects['object'], list):
                    objects['object'] = [objects['object']]
                for obj in objects['object']:
                    result['objects']['object'].append(
                        DataNormalizer.transform_keys(obj, wanted_objects_keys))

            object_averages = objects.get('averages', {})
            result['objects']['average_classes'] = object_averages.get('classes')
            result['objects']['average_functions'] = object_averages.get('functions')
            result['objects']['average_javadocs'] = object_averages.get('javadocs')

        # Parse packages section
        if 'packages' in output:
            packages = output['packages']
            packages_total = packages.get('total', {})
            result['packages']['classes'] = packages_total.get('classes')
            result['packages']['functions'] = packages_total.get('functions')
            result['packages']['javadoc_lines'] = packages_total.get('javadoc_lines')
            result['packages']['javadocs'] = packages_total.get('javadocs')
            result['packages']['multi_comment_lines'] = packages_total.get('multi_comment_lines')
            result['packages']['single_comment_lines'] = packages_total.get('single_comment_lines')

        return result

    def _normalize_mccabe_output(self, output):
        result = []
        for line in output:
            # NOTE: due to the way print works in python 2 vs python 3, mccabe under
            # python 2 returns `(<coords> <name> <complexity>)`, while the python 3
            # version returns the same without the brackets
            coords, func_name, complexity = line.split()
            result.append({'name': func_name.strip("'"),
                           'complexity': int(complexity.strip(')'))})
        return result

    def complexity_report(self, source_path):
        """Run the complexity-report tool

        https://www.npmjs.com/package/complexity-report

        :param source_path: path to source codes
        :return: normalized output
        """
        command = ['cr', '--format=json', source_path]
        status, output, error = self._run_analyzer(command)
        if status != 0:
            self.log.warning("Running complexity-report tool failed: %s", error)
            return {}
        if output:
            output = self._normalize_complexity_report_output(output, source_path)
        return output

    def javancss(self, source_path):
        """Run the JavaNCSS tool

        http://www.kclee.de/clemens/java/javancss

        :param source_path: path to source codes
        :return: normalized output
        """
        javancss_path = os.path.join(os.environ['JAVANCSS_PATH'], 'bin', 'javancss')
        command = [javancss_path, '-all', '-xml', source_path]
        status, output, error = self._run_analyzer(command, json_output=False)
        if status != 0:
            self.log.warning("JavaNCSS tool reported some errors: %s", error)
        if output:
            output = anymarkup.parse("".join(output))
            output = self._normalize_javancss_output(output)
        return output

    def python_mccabe(self, source_path):
        """Run the mccabe tool

        https://pypi.python.org/pypi/mccabe

        :param source_path: path to source codes
        :return: normalized output
        """
        result = {'files': []}
        # we'll compute total average cyclomatic complexity manually as
        # <total complexity>/<total number of functions>
        command = ['python3', '-m', 'mccabe']
        # mccabe has to be run on individual files, it doesn't work recursively on directories
        for root, dirs, files in os.walk(source_path):
            for f in files:
                if f.endswith('.py'):
                    to_run = command + [os.path.join(root, f)]
                    status, output, error = self._run_analyzer(to_run, json_output=False)
                    if status != 0:
                        self.log.info('Analyzing with Py3 failed, trying to analyze with Py2 ...')
                        to_run[0] = 'python2'
                        status, output, error = self._run_analyzer(to_run, json_output=False)
                        if status != 0:
                            self.log.error('Failed to analyze with both Py2 and Py3')
                            continue
                    normalized = self._normalize_mccabe_output(output)

                    # compute file average cyclomatic complexity, add numbers
                    # to overall package complexity
                    f_complexity = functools.reduce(lambda x, y: x + y['complexity'], normalized, 0)
                    f_functions = len(normalized)
                    f_acc = round(f_complexity / f_functions, 1) if f_functions > 0 else 0
                    result['files'].append({
                        'name': os.path.join(root, f)[len(source_path):].strip('/'),
                        'functions': normalized,
                        'average_cyclomatic_complexity': f_acc
                    })
        return result

    # A table of functions that should be called based on the language cloc found; keys have to match
    # keys in cloc output. Each handler expects one argument - the path to the directory where the sources
    # sit - and returns a dict. When you write new analyzer handlers, make sure there are no key collisions
    # with existing ones, as results are aggregated under the "metrics" key.
    # See 'Recognized languages' section at http://cloc.sourceforge.net/
    _LANGUAGE_ANALYZER_HANDLERS = {
        "JavaScript": [complexity_report],
        "Ruby": [],
        "Java": [javancss],
        "Python": [python_mccabe],
        "Go": [],
        "Rust": []
    }

    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        source_path = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(source_path)

        for language in language_stats.keys():
            for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics_data = handler(self, source_path)
                if not metrics_data:
                    continue
                if 'metrics' not in language_stats[language]:
                    language_stats[language]['metrics'] = {}
                language_stats[language]['metrics'].update(metrics_data)

        # we don't want to have possibly unique keys and we want to avoid enumerating all languages that are
        # supported by cloc - convert the dict to a list of language-specific entries
        result = {'languages': []}
        for language in language_stats.keys():
            record = language_stats.get(language)
            record['language'] = language
            result['languages'].append(record)

        return {'summary': header, 'status': 'success', 'details': result}
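To make the mccabe handling above concrete, here is a small standalone sketch of the parsing done by `_normalize_mccabe_output` and the per-file average computed in `python_mccabe`. The sample lines mimic the "&lt;coords&gt; &lt;name&gt; &lt;complexity&gt;" stdout format the task expects from `python3 -m mccabe`; they are illustrative, not captured from a real run.

# Standalone sketch of the mccabe output parsing; sample_output is illustrative only.
sample_output = [
    "3:0 'parse_report' 4",
    "27:0 'main' 2",
]

normalized = []
for line in sample_output:
    coords, func_name, complexity = line.split()
    normalized.append({'name': func_name.strip("'"),
                       'complexity': int(complexity.strip(')'))})

total = sum(f['complexity'] for f in normalized)
average = round(total / len(normalized), 1) if normalized else 0
print(normalized, average)  # [{'name': 'parse_report', 'complexity': 4}, ...] 3.0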
def test_next_addition(self):
    schema_ref = SchemaRef("example", "1-0-0")
    assert schema_ref.next_addition() == SchemaRef("example", "1-0-1")
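Taken together, the three tests suggest SchemaRef versions follow a "model-revision-addition" scheme, with each `next_*` method bumping one of the three dash-separated fields. The sketch below is only an illustration of that apparent behaviour under those assumptions; it is not the project's actual SchemaRef implementation.

# Hedged sketch of the version bumping the tests above exercise; illustrative only.
from collections import namedtuple

class SchemaRefSketch(namedtuple('SchemaRefSketch', 'name version')):
    def _bump(self, index):
        parts = [int(p) for p in self.version.split('-')]
        parts[index] += 1
        # whether less-significant fields reset is not shown by the tests above,
        # so this sketch leaves them unchanged
        return self._replace(version='-'.join(str(p) for p in parts))

    def next_model(self):
        return self._bump(0)      # "1-0-0" -> "2-0-0"

    def next_revision(self):
        return self._bump(1)      # "1-0-0" -> "1-1-0"

    def next_addition(self):
        return self._bump(2)      # "1-0-0" -> "1-0-1"

assert SchemaRefSketch("example", "1-0-0").next_revision() == SchemaRefSketch("example", "1-1-0")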