class RepoDependencyFinderTask(BaseTask):
    """Finds out direct and indirect dependencies from a given github repository."""

    _mercator = MercatorTask.create_test_instance(
        task_name='RepoDependencyFinderTask')

    def execute(self, arguments=None):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.info("Arguments passed from flow: {}".format(arguments))
        self._strict_assert(arguments.get('service_token'))
        github_repo = arguments.get('github_repo').strip()
        dependencies = []
        repo_cves = []
        if arguments.get('epv_list'):
            # Known EPVs were passed in directly -- skip the repo scan and
            # query the graph for CVEs straight away.
            for epv in arguments.get('epv_list'):
                dependencies.append('{ecosystem}:{package}:{version}'.format(
                    ecosystem=epv.get('ecosystem'),
                    package=epv.get('name'),
                    version=epv.get('version')))
            self.log.info('######## Dependencies list: %r' % dependencies)
            try:
                repo_cves = self.get_cve(dependencies)
            except TaskError:
                raise TaskError('Failed to get CVEs')
        else:
            dependencies = GithubDependencyTreeTask.extract_dependencies(
                github_repo=github_repo, user_flow=True)
            self.log.info('######## Deps list %r' % dependencies)
            try:
                # forward only the available dependencies in the system. Unknown
                # dependencies are not going to be ingested for osioUserNotificationFlow.
                repo_cves = self.create_repo_node_and_get_cve(
                    github_repo, dependencies)
                self.log.info('######## repo_cves %r' % repo_cves)
            except TaskError:
                raise TaskError('Failed to Create Repo Node')

        report = self.generate_report(repo_cves=repo_cves,
                                      deps_list=dependencies)
        return {
            'report': report,
            'service_token': arguments['service_token'],
            'dependencies': dependencies
        }

    @staticmethod
    def _split_epv(epv):
        """Split an ``ecosystem:name:version`` string into its three parts.

        The package name may itself contain ':' (e.g. maven
        ``group:artifact``), hence the replace-based extraction rather than a
        plain three-way split.

        :param epv: 'ecosystem:name:version' string
        :return: (ecosystem, name, version) tuple
        """
        ecosystem = epv.split(':')[0]
        version = epv.split(':')[-1]
        name = epv.replace(ecosystem + ':', '').replace(':' + version, '')
        return ecosystem, name, version

    @staticmethod
    def _epv_in_deps(str_epv, deps_list):
        """Check whether an EPV string appears in the dependency collection.

        :param str_epv: 'ecosystem:name:version' string
        :param deps_list: either a dict of dependency lists (keys such as
            'direct'/'transitive') or a flat list of EPV strings
        :return: True when the EPV is present
        """
        if isinstance(deps_list, dict):
            return any(str_epv in deps for deps in deps_list.values())
        return str_epv in deps_list

    def create_repo_node_and_get_cve(self, github_repo, deps_list):
        """Create a repository node in the graphdb and create its edges to all deps.

        :param github_repo: repository url used as the Repo vertex key
        :param deps_list: dict with 'direct' and 'transitive' EPV string lists
        :return: {}, gremlin_response
        """
        # Upsert the Repo vertex, then drop any stale dependency edges so the
        # graph reflects only the current scan.
        gremlin_str = (
            "repo=g.V().has('repo_url', '{repo_url}').tryNext().orElseGet{{"
            "graph.addVertex('vertex_label', 'Repo', 'repo_url', '{repo_url}')}};"
            "g.V(repo).outE('has_dependency').drop().iterate();"
            "g.V(repo).outE('has_transitive_dependency').drop().iterate();".
            format(repo_url=github_repo))

        # Create an edge between repo -> direct dependencies
        for pkg in deps_list.get('direct'):
            ecosystem, name, version = self._split_epv(pkg)
            gremlin_str += (
                "ver=g.V().has('pecosystem', '{ecosystem}').has('pname', '{name}')."
                "has('version', '{version}');ver.hasNext() && "
                "g.V(repo).next().addEdge('has_dependency', ver.next());".
                format(ecosystem=ecosystem, name=name, version=version))

        # Create an edge between repo -> transitive dependencies
        for pkg in deps_list.get('transitive'):
            ecosystem, name, version = self._split_epv(pkg)
            gremlin_str += (
                "ver=g.V().has('pecosystem', '{ecosystem}').has('pname', '{name}')."
                "has('version', '{version}');ver.hasNext() && "
                "g.V(repo).next().addEdge('has_transitive_dependency', ver.next());"
                .format(ecosystem=ecosystem, name=name, version=version))

        # Traverse the Repo to Direct/Transitive dependencies that have CVE's and report them
        gremlin_str += (
            "g.V(repo).as('rp').outE('has_dependency','has_transitive_dependency')"
            ".as('ed').inV().as('epv').select('rp','ed','epv').by(valueMap(true));"
        )
        payload = {"gremlin": gremlin_str}
        try:
            rawresp = requests.post(url=GREMLIN_SERVER_URL_REST, json=payload)
            resp = rawresp.json()
            self.log.info('######## Gremlin Response %r' % resp)
            if rawresp.status_code != 200:
                raise TaskError(
                    "Error creating repository node for {repo_url} - "
                    "{resp}".format(repo_url=github_repo, resp=resp))
        except Exception:
            self.log.error(traceback.format_exc())
            raise TaskError(
                "Error creating repository node for {repo_url}".format(
                    repo_url=github_repo))
        return resp

    def get_cve(self, deps_list):
        """Get CVE information for dependencies from the Graph database.

        :param deps_list: list of 'ecosystem:name:version' strings
        :return: gremlin_response
        """
        package_set = set()
        version_set = set()
        eco_set = set()
        for epv in deps_list:
            ecosystem, name, version = self._split_epv(epv)
            eco_set.add(ecosystem)
            version_set.add(version)
            package_set.add(name)

        # Find repos depending on any of the given EPVs, then report those of
        # their dependencies that carry CVE ids.
        gremlin_str = (
            "g.V().has('pecosystem', within(eco_list)).has('pname', within(pkg_list))."
            "has('version', within(ver_list))."
            "in('has_dependency','has_transitive_dependency').dedup().as('rp')."
            "outE('has_dependency','has_transitive_dependency').as('ed').inV().has("
            "'cve_ids').as('epv').select('rp','ed','epv').by(valueMap(true));")

        payload = {
            'gremlin': gremlin_str,
            'bindings': {
                'eco_list': list(eco_set),
                'pkg_list': list(package_set),
                'ver_list': list(version_set)
            }
        }
        try:
            rawresp = requests.post(url=GREMLIN_SERVER_URL_REST, json=payload)
            resp = rawresp.json()
            if rawresp.status_code != 200:
                raise RuntimeError("Error creating repository node for %r" % resp)
        except Exception:
            self.log.error(traceback.format_exc())
            raise RuntimeError("Error creating repository node")
        return resp

    def generate_report(self, repo_cves, deps_list):
        """Generate a json structure to include cve details for dependencies.

        :param repo_cves: gremlin response with 'rp'/'ed'/'epv' selections
        :param deps_list: dependency collection (dict or list of EPV strings)
        :return: list of {'repo_url': ..., 'vulnerable_deps': [...]} entries
        """
        repo_list = []
        for repo_cve in repo_cves.get('result').get('data', []):
            epv = repo_cve.get('epv')
            repo_url = repo_cve.get('rp').get('repo_url')[0]
            name = epv.get('pname')[0]
            version = epv.get('version')[0]
            ecosystem = epv.get('pecosystem')[0]
            str_epv = ecosystem + ":" + name + ":" + version
            cve_count = len(epv.get('cve_ids', []))
            vulnerable_deps = []
            first = True
            # BUG FIX: the original condition used a bare generator expression
            # `(str_epv in i for x, i in deps_list.items())`, which is always
            # truthy and never actually checked membership.
            if cve_count > 0 and self._epv_in_deps(str_epv, deps_list):
                cve_list = []
                for cve in epv.get('cve_ids'):
                    # cve_ids entries look like '<CVE-id>:<CVSS-score>'
                    cve_id = cve.split(':')[0]
                    cvss = cve.split(':')[-1]
                    cve_list.append({'CVE': cve_id, 'CVSS': cvss})
                vulnerable_deps.append({
                    'ecosystem': epv.get('pecosystem')[0],
                    'name': epv.get('pname')[0],
                    'version': epv.get('version')[0],
                    'cve_count': cve_count,
                    'cves': cve_list,
                    'is_transitive':
                        repo_cve.get('ed').get('label') == 'has_transitive_dependency'
                })
            # Merge with an existing entry for this repo_url, if any.
            for repo in repo_list:
                if repo_url == repo.get('repo_url'):
                    repo_vul_deps = repo.get('vulnerable_deps')
                    repo['vulnerable_deps'] = vulnerable_deps + repo_vul_deps
                    first = False
            if first:
                repo_list.append({
                    'repo_url': repo_url,
                    'vulnerable_deps': vulnerable_deps
                })
        return repo_list
def execute(self, arguments, db, manifests, source=None):
    """Dependency finder logic.

    Runs mercator over each manifest and, for external requests, resolves
    dependency specifications to concrete versions for the `AggregatorTask`.

    :param arguments: task arguments; must carry 'ecosystem', and an
        'external_request_id' key triggers dependency resolution
    :param db: database session used to look up the ecosystem
    :param manifests: list of manifest dicts ('filename', 'content',
        'ecosystem', optional 'filepath')
    :param source: when 'osio', the manifest content is fetched from S3
        using a digest of the provided content
    :return: {'result': [...]} with one mercator output per manifest
    """
    # TODO: reduce cyclomatic complexity
    # If we receive a manifest file we need to save it first
    result = []
    for manifest in manifests:
        content_hash = None
        if source == 'osio':
            content_hash = generate_content_hash(manifest['content'])
            current_app.logger.info("{} file digest is {}".format(
                manifest['filename'], content_hash))

            s3 = AmazonS3(bucket_name='boosters-manifest')
            try:
                s3.connect()
                manifest['content'] = s3.retrieve_blob(content_hash).decode('utf-8')
            except ClientError as e:
                current_app.logger.error(
                    "Unexpected error while retrieving S3 data: %s" % e)
                raise

        with TemporaryDirectory() as temp_path:
            with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # Create instance manually since stack analysis is not handled by dispatcher
            subtask = MercatorTask.create_test_instance(task_name='metadata')
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path, resolve_poms=False)

        if not out["details"]:
            raise FatalTaskError("No metadata found processing manifest file '{}'"
                                 .format(manifest['filename']))

        if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
            raise FatalTaskError("Dependencies could not be resolved from manifest file '{}'"
                                 .format(manifest['filename']))

        out["details"][0]['manifest_file'] = manifest['filename']
        out["details"][0]['ecosystem'] = manifest['ecosystem']
        out["details"][0]['manifest_file_path'] = manifest.get(
            'filepath', 'File path not available')

        # If we're handling an external request we need to convert dependency specifications to
        # concrete versions that we can query later on in the `AggregatorTask`
        manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
        if 'external_request_id' in arguments:
            manifest_dependencies = []
            if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                    if 'dependencies' in out['details'][0]["_dependency_tree_lock"]:
                        manifest_dependencies = out["details"][0]["_dependency_tree_lock"].get(
                            "dependencies", [])
                else:  # pom.xml
                    if 'dependencies' in out['details'][0]:
                        manifest_dependencies = out["details"][0].get("dependencies", [])
                if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                    def _flatten(deps, collect):
                        for dep in deps:
                            collect.append({'package': dep['name'],
                                            'version': dep['version']})
                            # BUG FIX: leaf entries may omit 'dependencies';
                            # dep['dependencies'] raised KeyError here.
                            _flatten(dep.get('dependencies', []), collect)
                    resolved_deps = []
                    _flatten(manifest_dependencies, resolved_deps)
                else:  # pom.xml
                    resolved_deps =\
                        [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                         for x in manifest_dependencies]
            else:  # package.json, requirements.txt
                # (the previous `try/except Exception: raise` wrapper was a
                # no-op and has been removed)
                resolved_deps = self._handle_external_deps(
                    Ecosystem.by_name(db, arguments['ecosystem']),
                    out["details"][0]["dependencies"])
            out["details"][0]['_resolved'] = resolved_deps
        result.append(out)

    return {'result': result}
def setup_method(self, method):
    """Prepare a MercatorTask instance for each test."""
    assert method
    self.m = MercatorTask.create_test_instance(task_name='metadata')
def execute(self, arguments):
    """Task code.

    Loads the stack-analysis request from the database, runs mercator over
    every manifest in the request, and resolves dependency specifications
    to concrete versions for the `AggregatorTask`.

    :param arguments: dictionary with task arguments; requires 'data' and
        'external_request_id'
    :return: {}, results
    :raises FatalTaskError: when mercator yields no metadata or the
        dependencies cannot be resolved
    """
    self._strict_assert(arguments.get('data'))
    self._strict_assert(arguments.get('external_request_id'))

    db = self.storage.session
    try:
        results = db.query(StackAnalysisRequest)\
            .filter(StackAnalysisRequest.id == arguments.get('external_request_id'))\
            .first()
    except SQLAlchemyError:
        db.rollback()
        raise

    manifests = []
    if results is not None:
        row = results.to_dict()
        request_json = row.get("requestJson", {})
        manifests = request_json.get('manifest', [])

    # If we receive a manifest file we need to save it first
    result = []
    for manifest in manifests:
        with TemporaryDirectory() as temp_path:
            with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # Create instance manually since stack analysis is not handled by dispatcher
            subtask = MercatorTask.create_test_instance(task_name=self.task_name)
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path)

        if not out["details"]:
            raise FatalTaskError(
                "No metadata found processing manifest file '{}'".format(
                    manifest['filename']))

        if 'dependencies' not in out['details'][0] and out.get('status') == 'success':
            raise FatalTaskError(
                "Dependencies could not be resolved from manifest file '{}'"
                .format(manifest['filename']))

        out["details"][0]['manifest_file'] = manifest['filename']
        out["details"][0]['ecosystem'] = manifest['ecosystem']
        out["details"][0]['manifest_file_path'] = manifest.get(
            'filepath', 'File path not available')

        # If we're handling an external request we need to convert dependency specifications to
        # concrete versions that we can query later on in the `AggregatorTask`
        manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
        if 'external_request_id' in arguments:
            manifest_dependencies = []
            if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                details = out["details"][0]
                if "_dependency_tree_lock" in details:  # npm-shrinkwrap.json
                    if 'dependencies' in details["_dependency_tree_lock"]:
                        manifest_dependencies = details["_dependency_tree_lock"].get(
                            "dependencies", [])
                else:  # pom.xml
                    if 'dependencies' in details:
                        manifest_dependencies = details.get("dependencies", [])
                if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                    def _flatten(deps, collect):
                        for dep in deps:
                            collect.append({
                                'package': dep['name'],
                                'version': dep['version']
                            })
                            # BUG FIX: leaf entries may omit 'dependencies';
                            # dep['dependencies'] raised KeyError here.
                            _flatten(dep.get('dependencies', []), collect)
                    resolved_deps = []
                    _flatten(manifest_dependencies, resolved_deps)
                else:  # pom.xml
                    resolved_deps =\
                        [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                         for x in manifest_dependencies]
            else:  # package.json, requirements.txt
                resolved_deps = self._handle_external_deps(
                    self.storage.get_ecosystem(arguments['ecosystem']),
                    out["details"][0]["dependencies"])
            out["details"][0]['_resolved'] = resolved_deps
        result.append(out)

    return {'result': result}
def setup_method(self, method):
    """Set up the MercatorTask instance used by the tests."""
    self.m = MercatorTask.create_test_instance(task_name='metadata')
class GithubDependencyTreeTask(BaseTask):
    """Finds out direct and indirect dependencies from a given github repository."""

    _mercator = MercatorTask.create_test_instance(
        task_name='GithubDependencyTreeTask')

    def execute(self, arguments=None):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('github_repo'))
        self._strict_assert(arguments.get('github_sha'))
        self._strict_assert(arguments.get('email_ids'))
        github_repo = arguments.get('github_repo')
        github_sha = arguments.get('github_sha')
        dependencies = list(
            GithubDependencyTreeTask.extract_dependencies(
                github_repo, github_sha))
        return {
            "dependencies": dependencies,
            "github_repo": github_repo,
            "github_sha": github_sha,
            "email_ids": arguments.get('email_ids')
        }

    @staticmethod
    def extract_dependencies(github_repo, github_sha=None):
        """Extract the dependencies information.

        Currently assuming repository is maven/npm/python repository.

        :param github_repo: repository url
        :param github_sha: commit hash
        :return: set of direct (and indirect) dependencies
        :raises TaskError: when no supported manifest is found in the repo root
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            if github_sha is not None:
                repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                # TODO: Make this task also work for files not present in root directory.

                # First change the package-lock.json to npm-shrinkwrap.json
                GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

                # Dispatch on whichever manifest file exists in the repo root.
                if peek(Path.cwd().glob("pom.xml")):
                    return GithubDependencyTreeTask.get_maven_dependencies()
                elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                        or peek(Path.cwd().glob("package.json")):
                    return GithubDependencyTreeTask.get_npm_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("requirements.txt")):
                    return GithubDependencyTreeTask.get_python_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("glide.lock")):
                    return GithubDependencyTreeTask.get_go_glide_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("Gopkg.lock")):
                    return GithubDependencyTreeTask.get_go_pkg_dependencies()
                else:
                    raise TaskError("Please provide maven or npm or "
                                    "python or Go repository for scanning!")

    @staticmethod
    def get_maven_dependencies():
        """Get direct and indirect dependencies from pom.xml by using maven dependency tree plugin.

        :return: set of direct and indirect dependencies
        :raises TaskError: when the maven invocation fails
        """
        output_file = Path.cwd() / "dependency-tree.txt"
        cmd = [
            "mvn",
            "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
            "-DoutputType=dot",
            # BUG FIX: the format string previously contained no placeholder,
            # so the plugin never wrote to output_file and the task always
            # failed the is_file() check below.
            "-DoutputFile={filename}".format(filename=output_file),
            "-DappendOutput=true"
        ]
        timed_cmd = TimedCommand(cmd)
        status, output, _ = timed_cmd.run(timeout=3600)
        if status != 0 or not output_file.is_file():
            # all errors are in stdout, not stderr
            raise TaskError(output)
        with output_file.open() as f:
            return GithubDependencyTreeTask.parse_maven_dependency_tree(
                f.readlines())

    @staticmethod
    def parse_maven_dependency_tree(dependency_tree):
        """Parse the dot representation of maven dependency tree.

        For available representations of dependency tree see
        http://maven.apache.org/plugins/maven-dependency-plugin/tree-mojo.html#outputType

        :param dependency_tree: DOT representation of maven dependency tree
        :return: set of direct and indirect dependencies
        """
        dot_file_parser_regex = re.compile('"(.*?)"')
        set_pom_names = set()
        set_package_names = set()
        for line in dependency_tree:
            matching_lines_list = dot_file_parser_regex.findall(line)
            # If there's only one string, it means this a pom name.
            if len(matching_lines_list) == 1:
                # Remove scope from package name. Package name is of the form:
                # <group-id>:<artifact-id>:<packaging>:<?classifier>:<version>:<scope>
                matching_line = matching_lines_list[0].rsplit(':', 1)[0]
                add_maven_coords_to_set(matching_line, set_pom_names)
            else:
                for matching_line in matching_lines_list:
                    matching_line = matching_line.rsplit(':', 1)[0]
                    add_maven_coords_to_set(matching_line, set_package_names)

        # Remove pom names from actual package names.
        return set_package_names.difference(set_pom_names)

    @classmethod
    def get_npm_dependencies(cls, path):
        """Get a list of direct and indirect dependencies from npm-shrinkwrap.

        If there is no npm-shrinkwrap file present then it fall backs to use
        package.json and provides only the list of direct dependencies.

        :param path: path to run the mercator
        :return: set of direct (and indirect) dependencies
        """
        mercator_output = cls._mercator.run_mercator(
            arguments={"ecosystem": "npm"}, cache_path=path, resolve_poms=False)
        set_package_names = set()
        mercator_output_details = mercator_output['details'][0]
        dependency_tree_lock = mercator_output_details \
            .get('_dependency_tree_lock')

        # Check if there is lock file present
        if dependency_tree_lock:
            dependencies = dependency_tree_lock.get('dependencies')
            for dependency in dependencies:
                transitive_deps = dependency.get('dependencies', [])
                name = dependency.get('name')
                version = dependency.get('version')
                dev_dependency = dependency.get('dev')
                if not dev_dependency:
                    set_package_names.add(
                        "{ecosystem}:{package}:{version}".format(
                            ecosystem="npm", package=name, version=version))
                # There can be multiple transitive dependencies.
                for t_dep in transitive_deps:
                    name = t_dep.get('name')
                    version = t_dep.get('version')
                    # NOTE(review): this re-reads the PARENT's 'dev' flag, so
                    # transitive deps inherit it; possibly t_dep.get('dev')
                    # was intended -- confirm before changing.
                    dev_dependency = dependency.get('dev')
                    if not dev_dependency:
                        set_package_names.add(
                            "{ecosystem}:{package}:{version}".format(
                                ecosystem="npm", package=name, version=version))
        else:
            all_dependencies = mercator_output_details.get('dependencies', [])
            for dependency in all_dependencies:
                name, version = dependency.split()
                set_package_names.add("{ecosystem}:{package}:{version}".format(
                    ecosystem="npm", package=name, version=version))

        return set_package_names

    @classmethod
    def get_python_dependencies(cls, path):
        """Get a list of direct and indirect dependencies from requirements.txt.

        To get a list of direct and transitive dependencies the
        requirements.txt file has to be generated through `pip-compile`
        else only direct dependencies can be extracted.

        :param path: path to run the mercator
        :return: set of direct (and indirect) dependencies
        """
        mercator_output = cls._mercator.run_mercator(
            arguments={"ecosystem": "pypi"}, cache_path=path, resolve_poms=False)
        set_package_names = set()
        mercator_output_details = mercator_output['details'][0]
        dependencies = mercator_output_details.get('dependencies', [])
        for dependency in dependencies:
            name, version = dependency.split("==")
            set_package_names.add("{ecosystem}:{package}:{version}".format(
                ecosystem="pypi", package=name, version=version))

        return set_package_names

    @staticmethod
    def _add_go_packages(set_package_names, name, version, sub_packages):
        """Add a go package (and its sub-packages, if any) to the result set.

        Shared by the glide.lock and Gopkg.lock parsers so the
        "ecosystem:package:version" formatting lives in one place.
        """
        if sub_packages:
            for sub_package in sub_packages:
                # Ignore sub-packages like '.', '..', '...' etc.
                if sub_package != len(sub_package) * '.':
                    # We need to come up with a unified format
                    # of how sub-packages are presented.
                    sub_package_name = name + '/{}'.format(sub_package)
                    set_package_names.add(
                        "{ecosystem}:{package}:{version}".format(
                            ecosystem="go",
                            package=sub_package_name,
                            version=version))
                else:
                    set_package_names.add(
                        "{ecosystem}:{package}:{version}".format(
                            ecosystem="go", package=name, version=version))
        else:
            set_package_names.add("{ecosystem}:{package}:{version}".format(
                ecosystem="go", package=name, version=version))

    @classmethod
    def get_go_glide_dependencies(cls, path):
        """Get all direct and transitive dependencies by parsing glide.lock.

        :param path: path to run the mercator
        :return: set of direct and indirect dependencies
        """
        mercator_output = cls._mercator.run_mercator(
            arguments={"ecosystem": "go"}, cache_path=path, resolve_poms=False)
        set_package_names = set()
        mercator_output_details = mercator_output['details'][0]
        dependency_tree_lock = mercator_output_details \
            .get('_dependency_tree_lock')
        dependencies = dependency_tree_lock.get('dependencies')
        for dependency in dependencies:
            cls._add_go_packages(set_package_names,
                                 dependency.get('name'),
                                 dependency.get('version'),
                                 dependency.get('subpackages'))

        return set_package_names

    @classmethod
    def get_go_pkg_dependencies(cls):
        """Get all direct and indirect dependencies by parsing Gopkg.lock.

        :return: set of direct and indirect dependencies
        """
        # TODO: Run mercator instead of this custom parsing logic, once mercator supports this.
        set_package_names = set()
        lock_file_contents = anymarkup.parse_file('Gopkg.lock', format='toml')
        packages = lock_file_contents.get('projects')
        for package in packages:
            # Gopkg.lock pins by git revision, so 'revision' plays the
            # role of the version.
            cls._add_go_packages(set_package_names,
                                 package.get('name'),
                                 package.get('revision'),
                                 package.get('packages'))

        return set_package_names

    @staticmethod
    def change_package_lock_to_shrinkwrap():
        """Rename package-lock.json to npm-shrinkwrap.json.

        For more information about package-lock.json please visit
        https://docs.npmjs.com/files/package-lock.json
        """
        # TODO: Remove this method once mercator has support for package-lock.json
        package_lock_path = Path.cwd() / "package-lock.json"
        if package_lock_path.is_file():
            package_lock_path.rename("npm-shrinkwrap.json")