def reset(self, revision, hard=False):
    """Run 'git reset'."""
    cmd = ["git", "reset", revision]
    if hard:
        cmd.extend(["--hard"])
    with cwd(self.repo_path):
        TimedCommand.get_command_output(cmd, graceful=False)
def add(self, path):
    """Add path to index.

    :param path: str
    """
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "add", path], graceful=False)
@classmethod
def clone(cls, url, path, depth=None, branch=None, single_branch=False):
    """Clone repository provided as url to specific path.

    :param url: str
    :param path: str
    :param depth: str
    :param branch: str
    :param single_branch: bool, only clone single branch
    :return: instance of Git()
    """
    orig_url = url
    cls.config()
    # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
    url = url2git_repo(url)
    cmd = ["git", "clone", url, path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    if single_branch:
        cmd.extend(["--single-branch"])
    try:
        TimedCommand.get_command_output(cmd, graceful=False)
    except TaskError as exc:
        raise TaskError("Unable to clone: %s" % orig_url) from exc
    return cls(path=path)
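# Usage sketch for clone(); the URL and path are illustrative and the import
# path is an assumption, not taken from this snippet. A 'git+https' prefix is
# normalized by url2git_repo() before the actual 'git clone' runs.
from f8a_worker.process import Git  # assumed import path

repo = Git.clone("git+https://github.com/org/project.git", "/tmp/project",
                 depth="1", branch="master", single_branch=True)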
def _generate_pom_xml(to_solve):
    """Create pom.xml with dependencies from to_solve.

    And run 'mvn versions:resolve-ranges', which resolves the version ranges
    (overwrites the pom.xml).

    :param to_solve: {"groupId:artifactId": "version-range"}
    """
    project = etree.Element('project')
    etree.SubElement(project, 'modelVersion').text = '4.0.0'
    etree.SubElement(project, 'groupId').text = 'foo.bar.baz'
    etree.SubElement(project, 'artifactId').text = 'testing'
    etree.SubElement(project, 'version').text = '1.0.0'
    dependencies = etree.SubElement(project, 'dependencies')
    for name, version_range in to_solve.items():
        group_id, artifact_id = name.rstrip(':').split(':')
        dependency = etree.SubElement(dependencies, 'dependency')
        etree.SubElement(dependency, 'groupId').text = group_id
        etree.SubElement(dependency, 'artifactId').text = artifact_id
        etree.SubElement(dependency, 'version').text = version_range
    with open('pom.xml', 'wb') as pom:
        pom.write(etree.tostring(project, xml_declaration=True, pretty_print=True))
    TimedCommand.get_command_output(['mvn', 'versions:resolve-ranges'], graceful=False)
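# Worked example for _generate_pom_xml() (input values are illustrative).
# For this input the generated pom.xml contains:
#
#   <dependency>
#     <groupId>org.apache.commons</groupId>
#     <artifactId>commons-lang3</artifactId>
#     <version>[3.0,3.5)</version>
#   </dependency>
#
# 'mvn versions:resolve-ranges' then overwrites the range with a concrete version.
_generate_pom_xml({'org.apache.commons:commons-lang3': '[3.0,3.5)'})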
def zip_file(file, archive, junk_paths=False):
    """Zip file into archive using system 'zip' command.

    :param file: str, path to the file (or directory) to zip
    :param archive: str, path to the resulting zip archive
    :param junk_paths: bool, store just file names, not full paths
    """
    command = ['zip', '-r', archive, file]
    if junk_paths:
        # Store just the name of a saved file (junk the path), not directory names.
        # By default, zip will store the full path (relative to the current directory).
        command.extend(['--junk-paths'])
    TimedCommand.get_command_output(command, graceful=False)
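# Usage sketch for zip_file(); paths are illustrative. With junk_paths=True the
# archive stores just 'report.json' rather than 'tmp/out/report.json'.
zip_file('/tmp/out/report.json', '/tmp/report.zip', junk_paths=True)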
def fetch_npm_artifact(name, version, target_dir):
    """Fetch npm artifact using system 'npm' tool."""
    git = Git.create_git(target_dir)

    # $ npm config get cache
    # /root/.npm
    cache_path = TimedCommand.get_command_output(['npm', 'config', 'get', 'cache'],
                                                 graceful=False).pop()

    # add package to cache:
    # /root/.npm/express/
    # └── 4.13.4
    #     ├── package
    #     │   ├── History.md
    #     │   ├── index.js
    #     │   ├── lib
    #     │   ├── LICENSE
    #     │   ├── package.json
    #     │   └── Readme.md
    #     └── package.tgz
    # 3 directories, 6 files
    name_ver = name
    if version:
        name_ver = "{}@{}".format(name, version)

    # make sure the artifact is not in the cache yet
    TimedCommand.get_command_output(['npm', 'cache', 'clean', name], graceful=False)
    logger.info("downloading npm module %s", name_ver)
    npm_command = ['npm', 'cache', 'add', name_ver]
    TimedCommand.get_command_output(npm_command, graceful=False)

    # copy tarball to workpath
    tarball_name = "package.tgz"
    glob_path = os.path.join(cache_path, name, "*")
    cache_abs_path = os.path.abspath(glob.glob(glob_path).pop())
    artifact_path = os.path.join(cache_abs_path, tarball_name)
    logger.debug("[cache] tarball path = %s", artifact_path)
    artifact_path = shutil.copy(artifact_path, target_dir)
    logger.debug("[workdir] tarball path = %s", artifact_path)

    # Prior to npm-2.x.x (Fedora 24), the npm client repackaged modules on
    # download: it modified file permissions inside package.tgz so they matched
    # the UID/GID of the user running the npm command. Therefore its digest was
    # different from that of a tarball downloaded directly from registry.npmjs.org.
    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)

    # copy package/package.json over the extracted one,
    # because it contains (since npm >= 2.x.x) more information.
    npm_package_json = os.path.join(cache_abs_path, 'package', 'package.json')
    shutil.copy(npm_package_json, target_dir)
    # copy package/npm-shrinkwrap.json to target_dir
    npm_shrinkwrap_json = os.path.join(target_dir, 'package', 'npm-shrinkwrap.json')
    if os.path.isfile(npm_shrinkwrap_json):
        shutil.copy(npm_shrinkwrap_json, target_dir)
    git.add_and_commit_everything()
    return digest, artifact_path
def commit(self, message='blank'):
    """Commit git repository.

    :param message: str, commit message
    """
    # --git-dir is #$%^&&
    # http://stackoverflow.com/questions/1386291/git-git-dir-not-working-as-expected
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "commit", "-m", message], graceful=False)
@classmethod
def create_git(cls, path):
    """Initialize new git repository at path.

    :param path: str
    :return: instance of Git()
    """
    cls.config()
    TimedCommand.get_command_output(["git", "init", path], graceful=False)
    return cls(path=path)
def extract_zip(target, dest, mkdest=False):
    """Extract target zip archive into dest using system 'unzip' command."""
    if mkdest:
        try:
            os.mkdir(dest, mode=0o775)
        except FileExistsError:
            pass
    # -o: overwrite existing files without prompting
    TimedCommand.get_command_output(['unzip', '-q', '-o', '-d', dest, target])
def extract_zip(target, dest, mkdest=False):
    """Extract target zip archive into dest using system 'unzip' command."""
    if mkdest:
        try:
            os.mkdir(dest, mode=0o775)
        except FileExistsError:
            pass
    # -o: overwrite existing files without prompting
    TimedCommand.get_command_output(['unzip', '-q', '-o', '-d', dest, target])
    # Fix possibly wrong permissions in zip files that would prevent us from deleting files.
    TimedCommand.get_command_output(['chmod', '-R', 'u+rwX,g+rwX', dest])
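# Usage sketch for extract_zip(); paths are illustrative. mkdest=True creates
# the destination directory first (and tolerates it already existing).
extract_zip('/tmp/package.whl', '/tmp/extracted', mkdest=True)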
def update_depcheck_db_on_s3():
    """Update OWASP Dependency-check DB on S3."""
    s3 = StoragePool.get_connected_storage('S3VulnDB')
    depcheck = os.path.join(configuration.OWASP_DEP_CHECK_PATH, 'bin', 'dependency-check.sh')
    with tempdir() as temp_data_dir:
        s3.retrieve_depcheck_db_if_exists(temp_data_dir)
        # give DependencyCheck 25 minutes to download the DB
        TimedCommand.get_command_output([depcheck, '--updateonly', '--data', temp_data_dir],
                                        timeout=1500)
        s3.store_depcheck_db(temp_data_dir)
@classmethod
def clone(cls, url, path, timeout=300, depth=None, branch=None, single_branch=False):
    """Clone repository provided as url to specific path.

    :param url: str
    :param path: str
    :param timeout: int
    :param depth: str
    :param branch: str
    :param single_branch: bool, only checkout single branch
    :return: instance of Git()
    """
    orig_url = url
    # git clone doesn't understand urls starting with: git+ssh, git+http, git+https
    url = url2git_repo(url)

    orig_path = path
    path = Path(path)
    mode = 0
    if path.is_dir():
        mode = path.stat().st_mode

    cmd = ["git", "clone", url, orig_path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    if single_branch:
        cmd.extend(["--single-branch"])
    try:
        cls.config()
        TimedCommand.get_command_output(cmd, graceful=False, timeout=timeout)
    except TaskError as exc:
        if not path.is_dir() and mode:
            # 'git clone repo dir/' deletes dir/ if cloning fails (there is no way to
            # turn this off). This might confuse the caller of this method, so we
            # recreate the dir on error here.
            try:
                path.mkdir(mode)
            except OSError:
                logger.error("Unable to re-create dir: %s", str(path))
        raise TaskError("Unable to clone: %s" % orig_url) from exc
    return cls(path=orig_path)
def fetch_scm_artifact(name, version, target_dir):
    """Fetch SCM artifact using 'go get' and 'git archive'.

    :param name: str, package name (import path)
    :param version: str, revision to reset to and archive
    :param target_dir: str, directory used as GOPATH
    :return: tuple, (digest, artifact_path)
    """
    env = dict(os.environ)
    env['GOPATH'] = target_dir
    TimedCommand.get_command_output(['go', 'get', '-d', name],
                                    timeout=300, env=env, graceful=True)
    package_dir = os.path.join(target_dir, 'src', name)
    with cwd(package_dir):
        git = Git(package_dir)
        git.reset(version, hard=True)
        artifact_filename = git.archive(version)
        artifact_path = os.path.join(package_dir, artifact_filename)
        digest = compute_digest(artifact_path)
    return digest, artifact_path
def add_and_commit_everything(self, message="blank"):
    """Add and commit everything, i.e.:

        git add .
        git commit -m <message>

    :param message: str, commit message
    """
    # First we need to remove any .git dirs/files from the archive; they could contain
    # directions that would break adding (e.g. Flask 0.10 contains .git with gitpath
    # pointing to Mitsuhiko's home dir).
    TimedCommand.get_command_output(['find', self.repo_path, '-mindepth', '2',
                                     '-name', '.git', '-exec', 'rm', '-rf', '{}', ';'])
    # add everything
    self.add(self.repo_path)
    self.commit(message=message)
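# A minimal end-to-end sketch of the Git helpers above (paths are illustrative;
# the import path is an assumption): init a repository, snapshot its contents,
# and archive them.
from f8a_worker.process import Git  # assumed import path

git = Git.create_git('/tmp/workdir')      # git init /tmp/workdir
git.add_and_commit_everything('initial')  # git add . && git commit -m initial
tarball = git.archive('sources')          # -> 'sources.tar.gz'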
def _run_victims_cve_db_cli(self, arguments):
    """Run Victims CVE DB CLI."""
    s3 = StoragePool.get_connected_storage('S3VulnDB')
    output = []
    with TemporaryDirectory() as temp_victims_db_dir:
        if not s3.retrieve_victims_db_if_exists(temp_victims_db_dir):
            self.log.debug('No Victims CVE DB found on S3, cloning from github')
            self.update_victims_cve_db_on_s3()
            s3.retrieve_victims_db_if_exists(temp_victims_db_dir)
        try:
            cli = os.path.join(temp_victims_db_dir, 'victims-cve-db-cli.py')
            command = [cli, 'search',
                       '--ecosystem', 'java',
                       '--name', arguments['name'],
                       '--version', arguments['version']]
            output = TimedCommand.get_command_output(command, graceful=False,
                                                     is_json=True, timeout=60)  # 1 minute
        except TaskError as e:
            self.log.exception(e)
    return output
def execute(self, arguments):
    """Run oscryptocatcher tool for matching crypto algorithms."""
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
    results = {'status': 'unknown', 'summary': {}, 'details': []}
    try:
        oscc = TimedCommand.get_command_output(
            ['oscryptocatcher', '--subdir-in-result', cache_path],
            graceful=False, is_json=True)
        self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
        results['details'] = oscc['details']
        results['summary'] = oscc['summary']
        results['status'] = 'success'
    except Exception:
        raise FatalTaskError('oscryptocatcher failed')
    return results
def worker(path):
    mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
    self.log.debug("%s mime = %s", path, mime)
    typ = TimedCommand.get_command_output(['file', path, '-b'])
    self.log.debug("%s filetype = %s", path, typ)

    linguist = None
    if 'charset=binary' not in mime:
        linguist = self._parse_linguist(
            TimedCommand.get_command_output(['linguist', path])
        )
        self.log.debug("%s linguist output = %s", path, linguist)

    results.append({
        "type": typ,
        "output": linguist,
        "path": os.path.relpath(path, cache_path),
    })
def compute_ssdeep(self, target):
    """Compute SSdeep piece-wise linear hash of target."""
    # 0: ssdeep header
    # 1: hash,filename
    data = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
    try:
        return data[1].split(',')[0].strip()
    except IndexError as exc:
        self.log.error("unable to compute ssdeep of %r", target)
        raise RuntimeError("can't compute digest of %r" % target) from exc
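# Illustration of the 'ssdeep -c -s' output that compute_ssdeep() parses
# (the hash value below is made up). Line 0 is the CSV header, line 1 is
# 'hash,filename'; the method returns the first comma-separated field of line 1.
data = [
    'ssdeep,1.1--blocksize:hash:hash,filename',
    '96:aBcDeFgHiJ:kLmNoP,"/tmp/package.tgz"',
]
assert data[1].split(',')[0].strip() == '96:aBcDeFgHiJ:kLmNoP'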
def extract(target, dest):
    """Detect archive type and extract it."""
    # Make sure that the destination directory exists
    try:
        Path(dest).mkdir(mode=0o777, parents=True)
    except FileExistsError:
        pass
    tar = Archive.TarMatcher.search(target)
    if target.endswith(('.zip', '.whl', '.egg', '.jar', '.war', '.aar', '.nupkg')):
        Archive.extract_zip(target, dest)
    elif tar or target.endswith(('.tgz', '.bz2')):
        Archive.extract_tar(target, dest)
    else:
        raise ValueError('Unknown archive for {0}'.format(target))
    # Fix possibly wrong permissions in zip files that would prevent us from deleting files.
    TimedCommand.get_command_output(['chmod', '-R', 'u+rwX,g+rwX', dest])
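# Usage sketch for Archive.extract(); paths are illustrative. Dispatch is
# purely suffix-based: a .jar goes through extract_zip(), a .tgz through
# extract_tar(), and anything unrecognized raises ValueError.
Archive.extract('/tmp/library.jar', '/tmp/unpacked')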
def config():
    """Configure git."""
    user_name = configuration.GIT_USER_NAME
    user_email = configuration.GIT_USER_EMAIL
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.name"]):
        TimedCommand.get_command_output(["git", "config", "--global", "user.name", user_name])
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.email"]):
        TimedCommand.get_command_output(["git", "config", "--global", "user.email", user_email])
    # Use 'true' as external program to ask for credentials, i.e. don't ask.
    # Better would be GIT_TERMINAL_PROMPT=0, but that requires git >= 2.3.
    TimedCommand.get_command_output(["git", "config", "--global", "core.askpass", "/usr/bin/true"])
def fetch_go_artifact(name, version, target_dir):
    """Fetch go artifact using 'go get' command."""
    env = dict(os.environ)
    env['GOPATH'] = target_dir
    Git.config()
    try:
        TimedCommand.get_command_output(['go', 'get', '-d', name],
                                        timeout=300, env=env, graceful=False)
    except TaskError:
        raise NotABugTaskError('Unable to go-get {n}'.format(n=name))
    package_dir = os.path.join(target_dir, 'src', name)
    with cwd(package_dir):
        git = Git(package_dir)
        git.reset(version, hard=True)
        artifact_filename = git.archive(version)
        artifact_path = os.path.join(package_dir, artifact_filename)
        digest = compute_digest(artifact_path)
    return digest, artifact_path
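# Usage sketch for fetch_go_artifact(); the package and revision are
# illustrative. target_dir doubles as GOPATH, so sources land under
# <target_dir>/src/<name>.
digest, tarball = fetch_go_artifact('github.com/pkg/errors', 'v0.8.0', '/tmp/gopath')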
def archive(self, basename, sub_path=None):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :return: str, filename
    """
    suffix = "tar.gz"
    filename = basename + "." + suffix
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(suffix),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename
def archive(self, basename, basedir=None, sub_path=None, format="tar.gz"):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param basedir: str, path to a directory where to store the resulting archive
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :param format: str, format of the resulting archive, default: 'tar.gz'
    :return: str, filename
    """
    filename = os.path.join(basedir or "", basename + "." + format)
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(format),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename
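# Usage sketch for the extended archive() (values illustrative; `git` is a Git
# instance as created in the sketches above): store a zip of only the 'src/'
# subtree outside the repository.
filename = git.archive('sources', basedir='/tmp/artifacts', sub_path='src', format='zip')
# filename == '/tmp/artifacts/sources.zip'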
def rev_parse(self, args=None):
    """Run git rev-parse.

    :param args: arguments to pass to `git rev-parse`
    :return: [str], output from `git rev-parse`
    """
    cmd = ["git", "rev-parse"]
    if args:
        cmd.extend(args)
    with cwd(self.repo_path):
        return TimedCommand.get_command_output(cmd, graceful=False)
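# Usage sketch for rev_parse() (`git` is a Git instance as above): resolve HEAD
# to a full commit sha. The output is a list of lines, matching the .pop()
# idiom used elsewhere in these helpers.
sha = git.rev_parse(['HEAD']).pop()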
def update_depcheck_db_on_s3():
    """Update OWASP Dependency-check DB on S3."""
    s3 = StoragePool.get_connected_storage('S3VulnDB')
    depcheck = configuration.dependency_check_script_path
    with TemporaryDirectory() as temp_data_dir:
        s3.retrieve_depcheck_db_if_exists(temp_data_dir)
        old_java_opts = os.getenv('JAVA_OPTS', '')
        os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
        # give DependencyCheck 25 minutes to download the DB
        if TimedCommand.get_command_output([depcheck, '--updateonly', '--data', temp_data_dir],
                                           timeout=1500):
            s3.store_depcheck_db(temp_data_dir)
        os.environ['JAVA_OPTS'] = old_java_opts
def fetch_rubygems_artifact(name, version, target_dir):
    """Fetch rubygems artifact using system 'gem' tool."""
    git = Git.create_git(target_dir)
    logger.info("downloading rubygems package %s-%s", name, version)
    version_arg = []
    if version:
        version_arg = ['--version', version]
    gem_command = ['gem', 'fetch', name]
    gem_command.extend(version_arg)
    with cwd(target_dir):
        TimedCommand.get_command_output(gem_command, graceful=False)

    if not version:
        # if version is None we need to glob for the version that was downloaded
        artifact_path = os.path.abspath(glob.glob(os.path.join(target_dir, name + '*')).pop())
    else:
        artifact_path = os.path.join(target_dir, '{n}-{v}.gem'.format(n=name, v=version))

    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)
    git.add_and_commit_everything()
    return digest, artifact_path
def ls_remote(repository, refs=None, args=None):
    """Get output of `git ls-remote <args> <repo> <refs>` command.

    :param repository: str, remote git repository
    :param refs: list, list of git references
    :param args: list, list of additional arguments for the command
    :return: command output
    """
    cmd = ["git", "ls-remote"]
    if args:
        cmd.extend(args)
    cmd.append(repository)
    if refs:
        cmd.extend(refs)
    return TimedCommand.get_command_output(cmd, graceful=False)
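# Usage sketch for ls_remote() (repository URL is illustrative; assumes the
# helper is exposed on the Git class): list remote branch heads named 'master'.
# Each output line of 'git ls-remote' has the form '<sha>\t<ref>'.
refs = Git.ls_remote('https://github.com/org/project.git',
                     refs=['master'], args=['--heads'])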
def extract_gem(target, dest):
    """Extract target gem and gemspec.

    Gem into $dest/sources
    Gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
    """
    sources = os.path.join(dest, 'sources')
    metadata = os.path.join(dest, 'metadata')
    TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
    TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
    with cwd(metadata):
        # --spec ignores --target, so we need to cwd
        TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
        metadatayaml = glob.glob('*.gemspec').pop()
        os.rename(metadatayaml, 'rubygems-metadata.yaml')
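# Usage sketch for extract_gem(); paths are illustrative. Afterwards the gem
# contents sit in /tmp/out/sources/ and the gemspec is saved as
# /tmp/out/metadata/rubygems-metadata.yaml.
extract_gem('/tmp/rails-5.0.0.gem', '/tmp/out')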
def execute(self):
    """Check maven index for new releases and schedule their analyses."""
    self.log.info("Checking maven index for new releases")
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')
    central_index_dir = os.path.join(target_dir, 'central-index')
    timestamp_path = os.path.join(central_index_dir, 'timestamp')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    old_timestamp = 0
    try:
        old_timestamp = int(os.stat(timestamp_path).st_mtime)
    except OSError:
        self.log.info('Timestamp is missing, we need to build the index from scratch.')

    last_offset = s3.get_last_offset()
    with tempdir() as java_temp_dir:
        cmd = ['java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
               '-jar', 'maven-index-checker.jar', '-c']
        with cwd(maven_index_checker_dir):
            output = TimedCommand.get_command_output(cmd, is_json=True,
                                                     graceful=False, timeout=1200)
            current_count = output['count']
            new_timestamp = int(os.stat(timestamp_path).st_mtime)
            if old_timestamp != new_timestamp:
                self.log.info('Storing pre-built maven index to S3...')
                s3.store_index(target_dir)
                self.log.debug('Stored. Index in S3 is up-to-date.')
                if old_timestamp == 0:
                    s3.set_last_offset(current_count)
                    self.log.info('This is the first run, i.e. all packages are considered new. '
                                  'Skipping scheduling to not analyze all packages in index.')
                    return
            else:
                self.log.info('Index in S3 is up-to-date.')

            self.log.debug("Number of entries in maven indexer: %d, last offset used: %d",
                           current_count, last_offset)
            to_schedule_count = current_count - last_offset
            if to_schedule_count == 0:
                self.log.info("No new packages to schedule, exiting...")
                return

            cmd = ['java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
                   '-jar', 'maven-index-checker.jar', '-r', '0-{}'.format(to_schedule_count)]
            output = TimedCommand.get_command_output(cmd, is_json=True,
                                                     graceful=False, timeout=1200)
            self.log.info("Found %d new packages to analyze, scheduling analyses...",
                          len(output))
            for entry in output:
                self.run_selinon_flow('bayesianFlow', {
                    'ecosystem': 'maven',
                    'name': '{groupId}:{artifactId}'.format(**entry),
                    'version': entry['version'],
                    'recursive_limit': 0,
                })
    s3.set_last_offset(current_count)
    self.log.info("All new maven releases scheduled for analysis, exiting...")
def fetch_npm_artifact(ecosystem, name, version, target_dir):
    """Fetch npm artifact using system 'npm' tool."""
    git = Git.create_git(target_dir)

    npm_cmd = ['npm', '--registry', ecosystem.fetch_url]

    # $ npm config get cache
    # /root/.npm
    cache_path = TimedCommand.get_command_output(npm_cmd + ['config', 'get', 'cache'],
                                                 graceful=False).pop()

    # add package to cache:
    # /root/.npm/express/
    # └── 4.13.4
    #     ├── package
    #     │   ├── History.md
    #     │   ├── index.js
    #     │   ├── lib
    #     │   ├── LICENSE
    #     │   ├── package.json
    #     │   └── Readme.md
    #     └── package.tgz
    # 3 directories, 6 files
    name_ver = name

    try:
        # importing here to avoid circular dependency
        from f8a_worker.solver import NpmReleasesFetcher
        version_list = NpmReleasesFetcher(ecosystem).fetch_releases(name_ver)[1]
        if version not in version_list:
            raise NotABugTaskError("Provided version is not supported '%s'" % name)
        else:
            name_ver = "{}@{}".format(name, version)
    except ValueError as e:
        raise NotABugTaskError('No versions for NPM package {p} ({e})'.format(p=name, e=str(e)))

    # make sure the artifact is not in the cache yet
    TimedCommand.get_command_output(npm_cmd + ['cache', 'clean', name], graceful=False)
    logger.info("downloading npm module %s", name_ver)
    cmd = npm_cmd + ['cache', 'add', name_ver]
    TimedCommand.get_command_output(cmd, graceful=False)

    # copy tarball to workpath
    tarball_name = "package.tgz"
    glob_path = os.path.join(cache_path, name, "*")
    cache_abs_path = os.path.abspath(glob.glob(glob_path).pop())
    artifact_path = os.path.join(cache_abs_path, tarball_name)
    logger.debug("[cache] tarball path = %s", artifact_path)
    artifact_path = shutil.copy(artifact_path, target_dir)
    logger.debug("[workdir] tarball path = %s", artifact_path)

    # Prior to npm-2.x.x (Fedora 24), the npm client repackaged modules on
    # download: it modified file permissions inside package.tgz so they matched
    # the UID/GID of the user running the npm command. Therefore its digest was
    # different from that of a tarball downloaded directly from registry.npmjs.org.
    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)
    Archive.fix_permissions(os.path.join(cache_abs_path, 'package'))

    # copy package/package.json over the extracted one,
    # because it contains (since npm >= 2.x.x) more information.
    npm_package_json = os.path.join(cache_abs_path, 'package', 'package.json')
    shutil.copy(npm_package_json, target_dir)
    # copy package/npm-shrinkwrap.json to target_dir
    npm_shrinkwrap_json = os.path.join(target_dir, 'package', 'npm-shrinkwrap.json')
    if os.path.isfile(npm_shrinkwrap_json):
        shutil.copy(npm_shrinkwrap_json, target_dir)
    git.add_and_commit_everything()
    return digest, artifact_path