예제 #1
0
    def _python_scan(self, arguments):
        """
        Run OWASP dependency-check experimental analyzer for Python artifacts.

        https://jeremylong.github.io/DependencyCheck/analyzers/python-analyzer.html

        :param arguments: dictionary with task arguments, used to look up the cached artifact
        :return: output of ``_run_owasp_dep_check`` or an error record if nothing scannable was found
        """
        source_tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
        # tar.gz seems to be not supported, zip archives and wheels are scanned as they are
        if source_tarball.endswith(('zip', '.whl')):
            scan_path = source_tarball
        else:
            unpacked_dir = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
            # depcheck needs to be pointed to a specific file, we can't just scan whole directory
            egg_info_dir = None
            marker_files = {'PKG-INFO': None, 'METADATA': None}
            for current_dir, _subdirs, filenames in os.walk(unpacked_dir):
                # the last match found during the walk wins for every candidate
                if current_dir.endswith('.egg-info'):
                    egg_info_dir = current_dir
                for marker in marker_files:
                    if marker in filenames:
                        marker_files[marker] = os.path.join(current_dir, marker)

            # preference order: egg-info directory, then PKG-INFO, then METADATA
            scan_path = egg_info_dir or marker_files['PKG-INFO'] or marker_files['METADATA']

        if scan_path:
            return self._run_owasp_dep_check(scan_path, experimental=True)

        return {'summary': ['File types not supported by OWASP dependency-check'],
                'status': 'error',
                'details': []}
    def execute(self, arguments):
        """
        Gather language statistics of the package sources and enrich them with handler metrics.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'summary', 'status' and 'details' keys; details hold a list of language records
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        sources_dir = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(sources_dir)

        # let every handler registered for a language contribute its metrics
        for language, stats in language_stats.items():
            for analyzer in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics = analyzer(self, sources_dir)
                if metrics:
                    stats.setdefault('metrics', {}).update(metrics)

        # we don't want to have possibly unique keys and we want to avoid enumerating all languages that are
        # supported by cloc - convert a dict to a list of language-specific entries
        languages = []
        for language, stats in language_stats.items():
            stats['language'] = language
            languages.append(stats)

        return {'summary': header, 'status': 'success', 'details': {'languages': languages}}
    def execute(self, arguments):
        """
        Run ``file`` (and ``linguist`` for non-binary files) on every file of the extracted sources.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'summary', 'status' and 'details' keys; details hold one record per file
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        details = []
        extracted_dir = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        def identify(path):
            # `file -b -i` reports the mime type, the last output line is used
            mime_type = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
            self.log.debug("%s mime = %s", path, mime_type)
            file_type = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, file_type)

            # linguist is run only for files that are not reported as binary
            is_text = 'charset=binary' not in mime_type
            linguist_result = None
            if is_text:
                raw_linguist = TimedCommand.get_command_output(['linguist', path])
                linguist_result = self._parse_linguist(raw_linguist)
                self.log.debug("%s linguist output = %s", path, linguist_result)

            record = {
                "type": file_type,
                "output": linguist_result,
                "path": os.path.relpath(path, extracted_dir),
            }
            details.append(record)

        with ThreadPool(target=identify) as pool:
            for file_path in get_all_files_from(extracted_dir, path_filter=skip_git_files):
                pool.add_task(file_path)

        return {'summary': [], 'status': 'success', 'details': details}
예제 #4
0
    def execute(self, arguments):
        """
        Execute mercator and convert its output to JSON object.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: whatever ``run_mercator`` returns for the selected path
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        # TODO: make this even uglier; looks like we didn't get the abstraction quite right
        #       when we were adding support for Java/Maven.
        ecosystem = self.storage.get_ecosystem(arguments['ecosystem'])
        is_maven = ecosystem.is_backed_by(EcosystemBackend.maven)
        epv_cache = ObjectCache.get_from_dict(arguments)
        # for maven the path points directly to the pom, otherwise to the extracted sources
        target_path = epv_cache.get_pom_xml() if is_maven else epv_cache.get_extracted_source_tarball()
        return self.run_mercator(arguments, target_path)
    def execute(self, arguments):
        """
        Run ``license_check.py`` on package sources and report its findings.

        When sources can not be obtained, the error is re-raised for every
        ecosystem except 'maven', where the extracted artifact is scanned instead.

        :param arguments: dictionary with arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details' keys; status is 'error' when the scan failed
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            eco = arguments.get('ecosystem')
            pkg = arguments.get('name')
            ver = arguments.get('version')
            if arguments['ecosystem'] != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            # the literals are concatenated, so the separating space has to be explicit
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        try:
            result_data['details'] = TimedCommand.get_command_output(
                ['license_check.py', cache_path], graceful=False, is_json=True)
            # status and summary are moved from the tool output to the top level
            result_data['status'] = result_data['details'].pop('status')
            result_data['summary'] = result_data['details'].pop('summary')
        except Exception:
            # do not use bare except here, KeyboardInterrupt/SystemExit have to propagate
            self.log.exception("License scan failed")
            result_data['status'] = 'error'

        return result_data
    def run(self, node_args):
        """
        Run ``execute`` for the given arguments, validate the result and attach audit information.

        :param node_args: dictionary with task arguments, passed to ``execute`` as is
        :return: result of ``execute`` (an empty dict replaces ``None``), optionally with
                 '_audit' and '_release' keys added
        :raises FatalTaskError: when the storage reports that this task ID was already processed
        """
        uses_postgres = self.storage and isinstance(self.storage, BayesianPostgres)
        # SQS guarantees 'deliver at least once', so there could be multiple messages of a type, give up immediately
        if uses_postgres and self.storage.get_worker_id_count(self.task_id) > 0:
            raise FatalTaskError(
                "Task with ID '%s' was already processed" % self.task_id)

        started_at = datetime.now()
        try:
            result = self.execute(node_args)
        finally:
            # remove all files that were downloaded for this task
            ObjectCache.wipe()
        ended_at = datetime.now()

        if result is None:
            # Keep track of None results and add _audit and _release keys
            result = {}
        elif result:
            # Ensure result complies with the defined schema (if any) before saving
            self.validate_result(result)

        if not self.add_audit_info:
            return result

        # `_audit` key is added to every analysis info submitted
        result['_audit'] = {
            'started_at': json_serial(started_at),
            'ended_at': json_serial(ended_at),
            'version': 'v1'
        }
        result['_release'] = '{}:{}:{}'.format(node_args.get('ecosystem'),
                                               node_args.get('name'),
                                               node_args.get('version'))
        return result
예제 #7
0
    def execute(self, arguments):
        """
        Fetch BlackDuck release data for a package version, running a CLI scan first if allowed and needed.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details' keys; status is 'error' for an
                 unsupported ecosystem or when no release data could be obtained
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        result_data = {'status': 'unknown', 'summary': [], 'details': {}}

        if not self._is_valid_ecosystem(arguments['ecosystem']):
            result_data['status'] = 'error'
            return result_data

        hub = self._get_hub()

        # BlackDuck project doesn't have a notion of ecosystem, so we need to
        # namespace the project names ourselves, so for package `crumb` in the NPM ecosystem
        # we'll end up with the name `npm-crumb`
        project_name = self._get_project_name(arguments)
        release = arguments['version']

        # Check if the given project had already been scanned
        release_data = self._release_data(hub, project_name, release)

        if not release_data and self._allow_cli_scan:
            self.log.debug("No data available for project {p} {v}".format(
                p=project_name, v=release))
            # No data available, issue a new scan and re-query release data
            tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
            scan_command = self._prepare_command(project_name, release, tarball)
            self.log.debug(
                "Executing command, timeout={timeout}: {cmd}".format(
                    timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=scan_command))
            # the hub password is handed over to the CLI through its environment
            status, output, error = TimedCommand(scan_command).run(
                timeout=self._BLACKDUCK_CLI_TIMEOUT,
                update_env={'BD_HUB_PASSWORD': config.blackduck_password})
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)
            release_data = self._release_data(hub, project_name, release)

        self.log.debug("Release data for project {p} {v}: {d}".format(
            p=project_name, v=release, d=release_data))
        result_data['details'] = release_data
        result_data['status'] = 'success' if release_data else 'error'

        return result_data
    def execute(self, arguments):
        """
        Run static analysis on the source tarball and summarize defects per checker.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details' keys; status is 'error' when
                 the analysis or the processing of its output raised
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        result_data = {'status': 'unknown', 'summary': [], 'details': []}

        tarball = ObjectCache.get_from_dict(arguments).get_source_tarball()
        analyzer = StaticAnalysis(tarball)

        try:
            report = analyzer.analyze()

            # make output reproducible - scanning the same
            # input multiple times should always produce
            # the same output
            for volatile_key in ("time-created", "time-finished", "host", "store-results-to"):
                del report["scan"][volatile_key]

            # per checker: number of defects and the CWE of the last defect that carried one
            checker_stats = {}
            for defect in report["defects"]:
                entry = checker_stats.setdefault(defect["checker"], {"count": 0})
                entry["count"] += 1
                if "cwe" in defect:
                    entry["cwe"] = defect["cwe"]

            result_data['summary'] = checker_stats
            result_data['status'] = 'success'
            result_data['details'] = report
        except Exception as ex:
            self.log.error("static analysis was not successful: %r", ex)
            result_data['status'] = 'error'

        return result_data
예제 #9
0
    def execute(self, arguments):
        """
        Run ``binwalk -B`` on every file found under the source tarball path.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'summary', 'status' and 'details' keys; details hold one record per file
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        # NOTE(review): files are enumerated under the source tarball path, not the
        # extracted tree - confirm this is intended
        base_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

        details = []
        for file_path in get_all_files_from(base_path, path_filter=skip_git_files):
            self.log.debug("path = %s", file_path)

            status, output, error = TimedCommand(['binwalk', '-B', file_path]).run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            record = {
                "path": os.path.relpath(file_path, base_path),
                "output": self.parse_binwalk(output),
            }
            details.append(record)

        return {'summary': [], 'status': 'success', 'details': details}
예제 #10
0
    def execute(self, arguments):
        """
        Run ``oscryptocatcher`` on the extracted sources and report its findings.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details' keys; status is 'error' when
                 the command failed or its output lacked the expected keys
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        results = {'status': 'unknown',
                   'summary': {},
                   'details': []}

        try:
            oscc = TimedCommand.get_command_output(['oscryptocatcher', '--subdir-in-result', cache_path],
                                                   graceful=False, is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            # read both keys first so a malformed output does not leave a half-filled result
            details, summary = oscc['details'], oscc['summary']
        except Exception:
            # do not use bare except here, KeyboardInterrupt/SystemExit have to propagate;
            # log the failure so an 'error' status can be traced back to its cause
            self.log.exception("oscryptocatcher scan failed")
            results['status'] = 'error'
        else:
            results['details'] = details
            results['summary'] = summary
            results['status'] = 'success'

        return results
    def execute(self, arguments):
        """
        Compute digests of every extracted source file and of the source tarball itself.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'summary', 'status' and 'details' keys; details hold the digest records,
                 the record of the tarball comes last
        """
        for required in ('ecosystem', 'name', 'version'):
            self._strict_assert(arguments.get(required))

        epv_cache = ObjectCache.get_from_dict(arguments)
        extracted_dir = epv_cache.get_extracted_source_tarball()

        digests = [self.compute_digests(extracted_dir, file_path)
                   for file_path in get_all_files_from(extracted_dir, path_filter=skip_git_files)]

        # In case of nodejs, prior to npm-2.x.x (Fedora 24)
        # npm client was repackaging modules on download.
        # It modified file permissions inside package.tgz so they matched UID/GID
        # of a user running npm command. Therefore its digest was different
        # then of a tarball downloaded directly from registry.npmjs.org.
        tarball = epv_cache.get_source_tarball()
        digests.append(self.compute_digests(tarball, tarball, artifact=True))

        return {'summary': [], 'status': 'success', 'details': digests}
    def execute(self, arguments):
        """
        Register a package version for analysis and make sure its artifacts are cached.

        When an analysis already exists for the version and 'force' is not set, the
        E/P/V keys are removed from ``arguments`` and the remaining flags are returned.
        Otherwise missing artifacts (source tarball, for maven also source jar and
        pom.xml) are fetched and stored in the cache and a new Analysis row is created.

        :param arguments: dictionary with task arguments; 'name', 'version' and 'ecosystem' are required
        :return: ``arguments``, either stripped of E/P/V keys or extended with 'document_id'
        """
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        db = self.storage.session
        # single lookup, reused for the database rows and for artifact fetching below
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        version = Version.get_or_create(db,
                                        package_id=package.id,
                                        identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we will scale and there will be 2+ workers running this task
            # they can potentially schedule two flows of a same type at the same time
            if db.query(Analysis).filter(
                    Analysis.version_id == version.id).count() > 0:
                # we need to propagate flags that were passed to flow, but not E/P/V - this way we are sure that for
                # example graph import is scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                return arguments

        # get the cache handle before creating the temporary directory so that a failure
        # here can not leave the directory behind
        epv_cache = ObjectCache.get_from_dict(arguments)
        cache_path = mkdtemp(dir=self.configuration.worker_data_dir)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    # a missing source jar is tolerated, it is only logged
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as exc:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{e}/{p}/{v}": {err}'
                            .format(e=arguments.get('ecosystem'),
                                    p=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(exc)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        analysis = Analysis(version=version,
                            access_count=1,
                            started_at=datetime.datetime.now())
        db.add(analysis)
        db.commit()

        arguments['document_id'] = analysis.id
        return arguments
예제 #13
0
 def _maven_scan(self, arguments):
     """
     Run OWASP dependency-check on the cached source tarball with experimental analyzers disabled.

     :param arguments: dictionary with task arguments, used to look up the cached artifact
     :return: output of ``_run_owasp_dep_check``
     """
     epv_cache = ObjectCache.get_from_dict(arguments)
     return self._run_owasp_dep_check(epv_cache.get_source_tarball(), experimental=False)
예제 #14
0
    def execute(self, arguments):
        """
        Run scancode license detection on package sources and report detected licenses.

        When sources can not be obtained, the error is re-raised for every
        ecosystem except 'maven', where the extracted artifact is scanned instead.

        :param arguments: dictionary with task arguments; 'ecosystem', 'name' and 'version' are required
        :return: dict with 'status', 'summary' and 'details' keys; status is 'error' when the scan failed
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            eco = arguments.get('ecosystem')
            pkg = arguments.get('name')
            ver = arguments.get('version')
            if arguments['ecosystem'] != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            # the literals are concatenated, so the separating space has to be explicit
            self.log.info('Could not get sources for maven package {p}/{v}, '
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        try:
            command = [
                os.path.join(
                    os.getenv('SCANCODE_PATH', '/opt/scancode-toolkit/'),
                    'scancode'),
                # Scan for licenses
                '--license',
                # Do not return license matches with scores lower than this score
                '--license-score',
                SCANCODE_LICENSE_SCORE,
                # Files without findings are omitted
                '--only-findings',
                # Use n parallel processes
                '--processes',
                SCANCODE_PROCESSES,
                # Do not print summary or progress messages
                '--quiet',
                # Strip the root directory segment of all paths
                '--strip-root',
                # Stop scanning a file if scanning takes longer than a timeout in seconds
                '--timeout',
                SCANCODE_TIMEOUT,
                cache_path
            ]
            output = TimedCommand.get_command_output(command,
                                                     graceful=False,
                                                     is_json=True,
                                                     timeout=600)
            details = self.process_output(output)
            result_data['details'] = details
            result_data['status'] = 'success'
            result_data['summary'] = {
                'sure_licenses': list(details['licenses'].keys())
            }
        except Exception:
            # do not use bare except here, KeyboardInterrupt/SystemExit have to propagate
            self.log.exception("License scan failed")
            result_data['status'] = 'error'

        return result_data