def flatten_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[NPMPackage, None, None]:
    """Yields NPMPackage objects from npm list JSON output in DFS order,
    linking parents to children by package ID.
    """
    pkgs: List[NPMPackage] = []
    paths: List[JSONPath] = []
    for path in visit_deps(node_list_output):
        pkg: NPMPackage
        if path:
            assert isinstance(path[-1], str)
            pkg = _get_pkg(get_in(node_list_output, path), path[-1])
        else:
            pkg = _get_pkg(get_in(node_list_output, path))

        for prev_pkg, prev_pkg_path in itertools.zip_longest(
            reversed(pkgs), reversed(paths)
        ):
            # match direct deps as one level deeper with a matching prefix
            # e.g. from ["dependencies", "yargs"]
            # match ["dependencies", "yargs", "dependencies", "yarg-parser"]
            # but do not match:
            # [] (the root)
            # ["dependencies", "ps"] (a sibling dep)
            # or ["dependencies", "yargs", "dependencies",
            #     "yarg-parser", "dependencies", "yarg-parser-dep"] (an indirect child)
            if (
                len(prev_pkg_path) - 2 == len(path)
                and path == prev_pkg_path[: len(path)]
            ):
                bisect.insort(pkg.dependencies, prev_pkg.package_id)
        yield pkg
        pkgs.append(pkg)
        paths.append(path)
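# A minimal, self-contained sketch of the direct-dependency check in
# flatten_deps above; the helper and sample paths are illustrative only, not
# part of the module. A previously visited path is a direct dep of the current
# path when it is exactly one "dependencies" level deeper and shares the
# current path as a prefix.
def _example_direct_dep_matching() -> None:
    def is_direct_dep(path: list, prev_pkg_path: list) -> bool:
        return len(prev_pkg_path) - 2 == len(path) and path == prev_pkg_path[: len(path)]

    yargs = ["dependencies", "yargs"]
    assert is_direct_dep(yargs, ["dependencies", "yargs", "dependencies", "yarg-parser"])
    assert not is_direct_dep(yargs, [])  # the root
    assert not is_direct_dep(yargs, ["dependencies", "ps"])  # a sibling dep
    assert not is_direct_dep(  # an indirect child
        yargs,
        ["dependencies", "yargs", "dependencies", "yarg-parser", "dependencies", "yarg-parser-dep"],
    )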
async def run_pipeline(
    source: Generator[Dict[str, Any], None, None], args: argparse.Namespace
) -> AsyncGenerator[Dict, None]:
    log.info(f"{pipeline.name} pipeline started")
    for line in source:
        result = extract_fields(
            line,
            [
                "branch",
                "commit",
                "tag",
                "org",
                "repo",
                "repo_url",
                "ref",
                "dependency_files",
            ],
        )
        result["tasks"] = []
        for task_data in get_in(line, ["task_results"], []):
            # filter for node list_metadata output to parse and flatten deps
            task_name = get_in(task_data, ["name"], None)
            if task_name not in args.repo_task:
                continue

            task_command = get_in(task_data, ["command"], None)
            task_result = extract_fields(
                task_data,
                [
                    "command",
                    "container_name",
                    "exit_code",
                    "name",
                    "relative_path",
                    "working_dir",
                ],
            )
            updates = parse_command(task_name, task_command, task_data, line)
            if updates:
                if task_name == "list_metadata":
                    log.info(
                        f"wrote {task_result['name']} {result['org']}/{result['repo']} {task_result['relative_path']}"
                        f" {result['ref']['value']} w/"
                        f" {updates['dependencies_count']} deps and {updates.get('problems_count', 0)} problems"
                        # f" {updates['graph_stats']}"
                    )
                elif task_name == "audit":
                    log.info(
                        f"wrote {task_result['name']} {result['org']}/{result['repo']} {task_result['relative_path']}"
                        f" {result['ref']['value']} w/"
                        f" {updates['vulnerabilities_count']} vulns"
                    )
                task_result.update(updates)
            result["tasks"].append(task_result)
        yield result
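# A hedged sketch of the per-repo dict shape yielded by run_pipeline above;
# the keys follow the extract_fields calls, the values are hypothetical:
#
# {
#     "branch": ..., "commit": ..., "tag": ..., "org": ..., "repo": ...,
#     "repo_url": ..., "ref": {"value": ...}, "dependency_files": [...],
#     "tasks": [
#         {
#             "command": ..., "container_name": ..., "exit_code": ...,
#             "name": "list_metadata", "relative_path": ..., "working_dir": ...,
#             # plus the parse_command updates, e.g. dependencies_count for
#             # list_metadata or vulnerabilities_count for audit
#         },
#     ],
# }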
def cargo_metadata_to_rust_crates(
    cargo_meta_out: Dict,
) -> Dict[str, RustCrate]:
    assert (
        get_in(cargo_meta_out, ["metadata", "version"]) == 1
    ), "cargo metadata format was not version 1"

    # build a hashmap keyed by pkg id so we can look up additional package
    # info for a resolved crate as packages[crate.id]
    crates: Dict[str, RustCrate] = {}
    for n in get_in(cargo_meta_out, ["metadata", "nodes"]):
        crate = RustCrate(**extract_fields(n, {"id", "features", "deps"}))
        assert crate.id not in crates
        crates[crate.id] = crate
    return crates
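# A hedged usage sketch for cargo_metadata_to_rust_crates: the input below is
# hypothetical and minimal (only the fields the function reads), assuming
# `cargo metadata --format-version 1` output nested under a "metadata" key as
# the assertions above require.
def _example_cargo_metadata_to_rust_crates() -> None:
    serde_id = "serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)"
    example_meta = {
        "metadata": {
            "version": 1,
            "nodes": [{"id": serde_id, "features": ["default"], "deps": []}],
        }
    }
    crates = cargo_metadata_to_rust_crates(example_meta)
    assert list(crates) == [serde_id]
    assert crates[serde_id].features == ["default"]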
def _visit_child_deps(
    node_list_output: Dict[str, Union[Dict, str]], path: JSONPath
) -> Generator[JSONPath, None, None]:
    output = get_in(node_list_output, path)
    if output:
        for child_dep_key, child_dep in output.items():
            # recurse into nested dependencies first so children are yielded
            # before their parents (post-order DFS)
            for nested_child_path in _visit_child_deps(
                node_list_output, list(path) + [child_dep_key, "dependencies"]
            ):
                yield nested_child_path
            yield list(path) + [child_dep_key]
    yield path
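# A hedged usage sketch of _visit_child_deps with hypothetical npm list
# output, assuming get_in returns a falsy default for missing paths: child
# paths come out in post-order DFS before their parents, followed by the
# starting path itself.
def _example_visit_child_deps() -> None:
    sample = {"dependencies": {"yargs": {"dependencies": {"yarg-parser": {}}}}}
    assert list(_visit_child_deps(sample, ["dependencies"])) == [
        ["dependencies", "yargs", "dependencies", "yarg-parser", "dependencies"],
        ["dependencies", "yargs", "dependencies", "yarg-parser"],
        ["dependencies", "yargs", "dependencies"],
        ["dependencies", "yargs"],
        ["dependencies"],
    ]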
def parse_cargo_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warning("got non-JSON stdout for cargo task")
        return None

    if task_name == "list_metadata":
        return parse_cargo_list_metadata(parsed_stdout)
    elif task_name == "audit":
        return parse_cargo_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
def visit_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[JSONPath, None, None]:
    """Generates JSON paths to valid node deps in npm list JSON output,
    visiting nodes in DFS order.

    Child dep keys are unordered.
    """
    for path in _visit_child_deps(node_list_output, ["dependencies"]):
        if is_valid_node_list_output_node(get_in(node_list_output, path)):
            yield path
    if is_valid_node_list_output_top_level(node_list_output):
        yield []
def parse_npm_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    # TODO: reuse cached results for each set of dep files w/ hashes and task name
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warning("got non-JSON stdout for npm")
        return None

    if task_name == "list_metadata":
        return parse_npm_list(parsed_stdout)
    elif task_name == "audit":
        return parse_npm_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
def parse_npm_list(parsed_stdout: Dict) -> Dict:
    deps = list(flatten_deps(parsed_stdout))
    updates = {"problems": get_in(parsed_stdout, ["problems"], [])}
    updates["dependencies"] = [asdict(dep) for dep in deps]
    updates["dependencies_count"] = len(deps)
    updates["problems_count"] = len(updates["problems"])
    # flatten_deps yields in DFS order, so the root package is last
    updates["root"] = asdict(deps[-1]) if deps else None
    updates["direct_dependencies_count"] = len(deps[-1].dependencies) if deps else None
    updates["graph_stats"] = (
        get_graph_stats(npm_packages_to_networkx_digraph(deps)) if deps else dict()
    )
    return updates
def cargo_metadata_to_rust_crate_and_packages(
    cargo_meta_out: Dict,
) -> Tuple[Dict[str, RustCrate], Dict[str, RustPackage]]:
    log.debug(
        "running crate-graph on {0[cargo_tomlfile_path]} in {0[org]}/{0[repo]}"
        " at {0[commit]}".format(cargo_meta_out)
    )
    crates = cargo_metadata_to_rust_crates(cargo_meta_out)

    packages: Dict[str, RustPackage] = {}
    for p in get_in(cargo_meta_out, ["metadata", "packages"]):
        pkg = RustPackage(**p)
        assert pkg.id not in packages
        packages[pkg.id] = pkg
    return (crates, packages)
def insert_npm_registry_data(
    session: sqlalchemy.orm.Session, source: Generator[Dict[str, Any], None, None]
) -> None:
    for line in source:
        # save version specific data
        for version, version_data in line["versions"].items():
            fields = extract_nested_fields(
                version_data,
                {
                    "package_name": ["name"],
                    "package_version": ["version"],
                    "shasum": ["dist", "shasum"],
                    "tarball": ["dist", "tarball"],
                    "git_head": ["gitHead"],
                    "repository_type": ["repository", "type"],
                    "repository_url": ["repository", "url"],
                    "description": ["description"],
                    "url": ["url"],
                    "license_type": ["license"],
                    "keywords": ["keywords"],
                    "has_shrinkwrap": ["_hasShrinkwrap"],
                    "bugs_url": ["bugs", "url"],
                    "bugs_email": ["bugs", "email"],
                    "author_name": ["author", "name"],
                    "author_email": ["author", "email"],
                    "author_url": ["author", "url"],
                    "maintainers": ["maintainers"],
                    "contributors": ["contributors"],
                    "publisher_name": ["_npmUser", "name"],
                    "publisher_email": ["_npmUser", "email"],
                    "publisher_node_version": ["_nodeVersion"],
                    "publisher_npm_version": ["_npmVersion"],
                },
            )
            # license can be a string e.g. 'MIT'
            # or a dict e.g. {'type': 'MIT', 'url': 'https://github.com/jonschlinkert/micromatch/blob/master/LICENSE'}
            fields["license_url"] = None
            if isinstance(fields["license_type"], dict):
                fields["license_url"] = fields["license_type"].get("url", None)
                fields["license_type"] = fields["license_type"].get("type", None)

            # looking at you [email protected].{3,4} with:
            # [{"name": "StrongLoop", "url": "http://strongloop.com/license/"}, "MIT"],
            if not (
                (
                    isinstance(fields["license_type"], str)
                    or fields["license_type"] is None
                )
                and (
                    isinstance(fields["license_url"], str)
                    or fields["license_url"] is None
                )
            ):
                log.warning(f"skipping weird license format {fields['license_type']}")
                fields["license_url"] = None
                fields["license_type"] = None

            # published_at is read from time[<version>] e.g. '2014-05-23T21:21:04.170Z'
            # (not from the version info object), where time maps versions to
            # publish times along with created and modified timestamps
            fields["published_at"] = get_in(line, ["time", version])
            fields["package_modified_at"] = get_in(line, ["time", "modified"])

            fields["source_url"] = f"https://registry.npmjs.org/{fields['package_name']}"

            if session.query(NPMRegistryEntry.id).filter_by(
                package_name=fields["package_name"],
                package_version=fields["package_version"],
                shasum=fields["shasum"],
                tarball=fields["tarball"],
            ).one_or_none():
                log.debug(
                    f"skipping inserting npm registry entry for {fields['package_name']}@{fields['package_version']}"
                    f" from {fields['tarball']} with sha {fields['shasum']}"
                )
            else:
                session.add(NPMRegistryEntry(**fields))
                session.commit()
                log.info(
                    f"added npm registry entry for {fields['package_name']}@{fields['package_version']}"
                    f" from {fields['tarball']} with sha {fields['shasum']}"
                )
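# A minimal, self-contained sketch of the license normalization rule above,
# factored into a helper for illustration; the _example_* name and the sample
# values in the comments are hypothetical, not part of the module.
def _example_normalize_license(license_value: Any) -> Tuple[Optional[str], Optional[str]]:
    license_type, license_url = license_value, None
    if isinstance(license_type, dict):
        license_url = license_type.get("url", None)
        license_type = license_type.get("type", None)
    # anything that is not a string or None at this point (e.g. a list of
    # mixed dicts and strings) is dropped entirely
    if not (
        (isinstance(license_type, str) or license_type is None)
        and (isinstance(license_url, str) or license_url is None)
    ):
        license_type, license_url = None, None
    return license_type, license_url


# e.g. _example_normalize_license("MIT") == ("MIT", None)
# _example_normalize_license({"type": "MIT", "url": "https://example.com/LICENSE"})
#     == ("MIT", "https://example.com/LICENSE")
# _example_normalize_license([{"name": "StrongLoop"}, "MIT"]) == (None, None)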
def test_get_in_errors(value, path, default, expected_error):
    with pytest.raises(expected_error):
        m.get_in(value, path, default)
def test_get_in(value, path, default, expected):
    assert m.get_in(value, path, default) == expected
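# A hedged sketch of cases the parametrized test above might cover, assuming
# m.get_in walks nested containers by key and returns `default` when a path
# step is missing; these cases are illustrative, not from the source suite.
@pytest.mark.parametrize(
    "value, path, default, expected",
    [
        ({"a": {"b": 1}}, ["a", "b"], None, 1),  # nested dict lookup
        ({"a": {}}, ["a", "b"], "missing", "missing"),  # missing key -> default
        ({}, [], None, {}),  # an empty path returns the value itself
    ],
)
def test_get_in_examples(value, path, default, expected):
    assert m.get_in(value, path, default) == expected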