Example #1
    @classmethod
    def _strict_assert(cls, assert_cond):
        """Assert on condition.

        If condition is False, fatal error is raised so task is not retried.
        """
        if not assert_cond:
            raise FatalTaskError("Strict assert failed in task '%s'" % cls.__name__)
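Every example on this page raises FatalTaskError to mark a failure as permanent, so the task queue gives up instead of retrying. As a point of reference, here is a minimal sketch of such an exception hierarchy, assuming a TaskError base class; the real definitions live in the project's errors module and may differ:

    # Sketch only -- the actual hierarchy in the worker project may differ.
    class TaskError(Exception):
        """Base class for errors raised from tasks."""

    class FatalTaskError(TaskError):
        """Fatal task error; a task raising this must not be retried."""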
Example #2
    def _python_scan(self, arguments):
        """Run OWASP dependency-check experimental analyzer for Python artifacts.

        https://jeremylong.github.io/DependencyCheck/analyzers/python.html
        """
        extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        # depcheck needs to be pointed to a specific file, we can't just scan whole directory
        egg_info = pkg_info = metadata = None
        for root, _, files in os.walk(extracted_tarball):
            if root.endswith('.egg-info') or root.endswith('.dist-info'):
                egg_info = root
            if 'PKG-INFO' in files:
                pkg_info = os.path.join(root, 'PKG-INFO')
            if 'METADATA' in files:
                metadata = os.path.join(root, 'METADATA')
        scan_path = egg_info or pkg_info or metadata
        if pkg_info and not egg_info:
            # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
            # https://github.com/jeremylong/DependencyCheck/issues/896
            egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
            try:
                os.mkdir(egg_info_dir)
                copy(pkg_info, egg_info_dir)
                scan_path = egg_info_dir
            except OSError:
                self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

        if not scan_path:
            raise FatalTaskError('File types not supported by OWASP dependency-check')

        return self._run_owasp_dep_check(scan_path, experimental=True)
Example #3
    def execute(self, arguments):
        """Run oscryptocatcher tool for matching crypto algorithms."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(
                ['oscryptocatcher', '--subdir-in-result', cache_path],
                graceful=False,
                is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception as e:
            raise FatalTaskError('oscryptocatcher failed') from e

        return results
Example #4
 def _scrape_page(url):
     """Web scrape URL."""
     response = requests.get(url)
     if response.status_code != 200:
         raise FatalTaskError("Unable to access package web page at '%s'" %
                              url)
     return BeautifulSoup(response.text, 'lxml')
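A hedged usage sketch of the helper above; the URL is illustrative, and the lxml parser named in the code must be installed:

    # Illustrative only -- any reachable page works:
    soup = _scrape_page('https://www.npmjs.com/package/left-pad')
    title = soup.find('title')
    print(title.text if title else 'no <title> found')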
Example #5
 def _handle_external_deps(ecosystem, deps):
     """Resolve external dependency specifications."""
     if not ecosystem or not deps:
         return []
     solver = get_ecosystem_solver(ecosystem)
     try:
         versions = solver.solve(deps)
     except Exception as exc:
         raise FatalTaskError("Dependencies could not be resolved: '{}'".format(deps)) from exc
     return [{"package": k, "version": v} for k, v in versions.items()]
Example #6
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)
        arguments.pop('document_id', None)

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        url = self.get_upstream_url(arguments)
        upstream = self.get_upstream_entry(package, url)
        if upstream is None:
            upstream = self.add_or_update_upstream(package, url)
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # without a lock we could schedule two flows of the same type at
            # the same time, but let's say that's OK
            if upstream.updated_at is not None \
                    and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as data is considered recent - '
                    'last update %s.', upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard changes such as updates
                db.rollback()
                return arguments

        # if this fails, it's actually OK, as there could be concurrency
        package_analysis = PackageAnalysis(
            package_id=package.id,
            started_at=datetime.datetime.utcnow(),
            finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.utcnow()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
Example #7
    def collect_npm(self, name):
        """Collect plain text description from npmjs.com for the given package.

        :param name: package name for which the plain text description should be gathered
        :return: plain text description
        """
        url = self._NPM_PACKAGE_URL.format(package=name)
        content = self._scrape_page(url).find(class_='content-column')

        if not content:
            raise FatalTaskError("No content was found at '%s' for NPM package '%s'"
                                 % (url, name))

        return content.text
Example #8
    def execute(self, arguments):
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        collector = self._COLLECTOR_HANDLERS.get(arguments['ecosystem'])

        if not collector:
            raise FatalTaskError(
                "No repository description collector registered for ecosystem '%s'"
                % arguments['ecosystem'])

        # TODO: we should probably do some additional post-processing later
        return collector(self, arguments['name'])
Example #9
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        if arguments['ecosystem'] not in _SUPPORTED_ECOSYSTEMS:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        return arguments
Example #10
    def collect_npm(self, name):
        """Collect plain text description from npmjs.com for the given package.

        :param name: package name for which the plain text description should be gathered
        :return: plain text description
        """
        url = self._NPM_PACKAGE_URL.format(package=name)
        content = self._scrape_page(url).body

        if not content:
            raise FatalTaskError("No content was found at '%s' for NPM package '%s'"
                                 % (url, name))

        # rip out all script and style elements
        for script in content(["script", "style"]):
            script.extract()

        return content.text
Example #11
    def run_gofedlib(self, topdir, timeout):
        """Run gofedlib-cli to extract dependencies from golang sources."""
        tc = TimedCommand([
            'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
            '--dependencies-test', '--skip-errors', topdir
        ])
        status, data, err = tc.run(timeout=timeout)

        if status:
            raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

        result = json.loads(data[0])
        main_deps_count = len(result.get('deps-main', []))
        packages_count = len(result.get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)

        return [{'ecosystem': 'gofedlib', 'result': result}]
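For reference, a trimmed sketch of the parsed gofedlib output this method consumes; the exact shape is an assumption inferred from the keys accessed above:

    # Assumed shape, based on result.get('deps-main') / result.get('deps-packages'):
    result = {
        'deps-main': ['github.com/pkg/errors', 'golang.org/x/net/context'],
        'deps-packages': ['github.com/sirupsen/logrus'],
        'deps-test': [],
    }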
Example #12
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        result_data = {'status': 'unknown', 'summary': [], 'details': []}

        source_tarball_path = ObjectCache.get_from_dict(
            arguments).get_source_tarball()
        sa = StaticAnalysis(source_tarball_path)

        try:
            analysis_result = sa.analyze()

            # make output reproducible - scanning the same
            # input multiple times should always produce
            # the same output
            del analysis_result["scan"]["time-created"]
            del analysis_result["scan"]["time-finished"]
            del analysis_result["scan"]["host"]
            del analysis_result["scan"]["store-results-to"]

            stats = {}
            for defect in analysis_result["defects"]:
                stats.setdefault(defect["checker"], {"count": 0})
                stats[defect["checker"]]["count"] += 1
                try:
                    stats[defect["checker"]]["cwe"] = defect["cwe"]
                except KeyError:
                    pass
            result_data['summary'] = stats
            result_data['status'] = 'success'
            result_data['details'] = analysis_result
        except Exception as ex:
            msg = "static analysis was not successful: %r" % ex
            self.log.error(msg)
            raise FatalTaskError(msg) from ex

        return result_data
Example #13
    def collect_pypi(self, name):
        """Collect plain text description from PyPI for the given package.

        :param name: package name for which the plain text description should be gathered
        :return: plain text description
        """
        url = self._PYPI_PACKAGE_URL.format(package=name)
        content = self._scrape_page(url).find(class_='project-description')

        if not content:
            raise FatalTaskError("No content was found at '%s' for PyPI package '%s'"
                                 % (url, name))

        # Remove content that is automatically added by PyPI - it sits at the
        # bottom and repeats info extracted from setup.py. We already keep that
        # data, so drop the duplicate here.
        nodot = content.find(class_='nodot')
        if nodot:
            nodot.decompose()
        return content.text
Example #14
    def run(self, node_args):
        """To be transparently called by Selinon.

        Selinon transparently calls run(), which takes care of task audit and
        some additional checks and calls execute().
        """
        # SQS guarantees at-least-once delivery, so the same message can be
        # delivered more than once; give up immediately in that case
        if self.storage and isinstance(self.storage,
                                       (BayesianPostgres, PackagePostgres)):
            if self.storage.get_worker_id_count(self.task_id) > 0:
                raise FatalTaskError(
                    "Task with ID '%s' was already processed" % self.task_id)

        start = datetime.utcnow()
        try:
            result = self.execute(node_args)
        finally:
            # remove all files that were downloaded for this task
            ObjectCache.wipe()
        end = datetime.utcnow()

        if result:
            # Ensure result complies with the defined schema (if any) before saving
            self.validate_result(result)

        if result is None:
            # Keep track of None results and add _audit and _release keys
            result = {}

        if self.add_audit_info:
            # `_audit` key is added to every analysis info submitted
            result['_audit'] = {
                'started_at': json_serial(start),
                'ended_at': json_serial(end),
                'version': 'v1'
            }

            ecosystem_name = node_args.get('ecosystem')
            result['_release'] = '{}:{}:{}'.format(ecosystem_name,
                                                   node_args.get('name'),
                                                   node_args.get('version'))
        return result
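The _audit timestamps pass through json_serial(), presumably a small helper that renders datetimes as ISO-8601 strings; a minimal sketch under that assumption:

    from datetime import datetime

    def json_serial(obj):
        """Serialize a datetime into an ISO-8601 string (assumed helper, sketch only)."""
        if isinstance(obj, datetime):
            return obj.isoformat()
        raise TypeError("Type %s not serializable" % type(obj))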
Example #15
    @staticmethod
    def run_scancode(scan_path):
        """Run scancode tool."""
        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        command = [
            path.join(configuration.SCANCODE_PATH, 'scancode'),
            # Scan for licenses
            '--license',
            # Do not return license matches with scores lower than this score
            '--license-score',
            configuration.SCANCODE_LICENSE_SCORE,
            # Files without findings are omitted
            '--only-findings',
            # Use n parallel processes
            '--processes',
            configuration.SCANCODE_PROCESSES,
            # Do not print summary or progress messages
            '--quiet',
            # Strip the root directory segment of all paths
            '--strip-root',
            # Stop scanning a file if scanning takes longer than a timeout in seconds
            '--timeout',
            configuration.SCANCODE_TIMEOUT,
            scan_path
        ]
        for ignore_pattern in configuration.SCANCODE_IGNORE:
            command += ['--ignore', '{}'.format(ignore_pattern)]
        with username():
            tc = TimedCommand(command)
            status, output, error = tc.run(is_json=True, timeout=1200)
            if status != 0:
                raise FatalTaskError(
                    "Error (%s) during running command %s: %r" %
                    (str(status), command, error))

        details = LicenseCheckTask.process_output(output)
        result_data['details'] = details
        result_data['status'] = 'success'
        result_data['summary'] = {
            'sure_licenses': list(details['licenses'].keys())
        }

        return result_data
Example #16
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))

        if arguments['ecosystem'] not in _SUPPORTED_ECOSYSTEMS:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # Don't ingest for private packages
        if not is_pkg_public(arguments['ecosystem'], arguments['name']):
            logger.info("Private package ingestion ignored %s %s",
                        arguments['ecosystem'], arguments['name'])
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        return arguments
Example #17
File: base.py  Project: pombredanne/worker
    def validate_result(self, result):
        """Ensure that results comply with the task schema, if defined.

        Tasks define a schema by setting schema_ref appropriately.
        Schemas are retrieved from workers/schemas/generated via pkgutil.
        """
        # Skip validation if no schema is defined
        schema_ref = self.schema_ref
        if schema_ref is None:
            return
        # Load schema if not yet loaded
        schema = self._schema
        if schema is None:
            schema = self._schema = load_worker_schema(schema_ref)
        # Validate result against schema
        try:
            jsonschema.validate(result, schema)
        except jsonschema.exceptions.ValidationError as e:
            raise FatalTaskError(
                'Schema validation failed: {e}'.format(e=str(e))) from e
        # Record the validated schema details
        set_schema_ref(result, schema_ref)
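A task opts into this validation simply by setting schema_ref. A hedged sketch of what a concrete task might declare; BaseTask, SchemaRef, and the schema name/version below are assumptions standing in for the worker's real types:

    # Sketch only; SchemaRef and the ref value are assumptions.
    from collections import namedtuple

    SchemaRef = namedtuple('SchemaRef', ['name', 'version'])

    class DigesterTask(BaseTask):
        schema_ref = SchemaRef('digests', '1-0-0')  # consumed by validate_result()

        def execute(self, arguments):
            return {'status': 'success', 'summary': [], 'details': []}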
Example #18
    def run_gofedlib(self, topdir, name, version, timeout):
        """Run gofedlib-cli to extract dependencies from golang sources."""
        tc = TimedCommand([
            'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
            '--dependencies-test', '--skip-errors', topdir
        ])
        status, data, err = tc.run(timeout=timeout)

        if status:
            raise FatalTaskError('gofedlib-cli failed: {err}'.format(err=err))

        result = json.loads(data[0])
        main_deps_count = len(result.get('deps-main', []))
        packages_count = len(result.get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)

        result['code_repository'] = {
            'type': 'git',
            'url': 'https://{name}'.format(name=name)
        }
        result['name'] = name
        result['version'] = version
        return [{'ecosystem': 'gofedlib', 'result': result}]
Example #19
    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300,
                     resolve_poms=True):
        """Run mercator tool."""
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)

        if arguments['ecosystem'] == 'go':
            # no Go support in Mercator-go yet, we handle it separately here
            tc = TimedCommand([
                'gofedlib-cli', '--dependencies-main',
                '--dependencies-packages', '--dependencies-test',
                '--skip-errors', mercator_target
            ])
            status, data, err = tc.run(timeout=timeout)
        else:
            tc = TimedCommand(['mercator', mercator_target])
            update_env = {
                'MERCATOR_JAVA_RESOLVE_POMS': 'true'
            } if resolve_poms else {}
            status, data, err = tc.run(timeout=timeout,
                                       is_json=True,
                                       update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
        elif arguments['ecosystem'] == 'go':
            result = {'result': json.loads(data[0])}
            # the data normalizer expects this
            result['ecosystem'] = 'gofedlib'
            # we only support git now
            result['result']['code_repository'] = {
                'type': 'git',
                'url': 'https://{name}'.format(name=arguments.get('name'))
            }

            result['result']['name'] = arguments.get('name')
            result['result']['version'] = arguments.get('version')
            items = [result]
            main_deps_count = len(result['result'].get('deps-main', []))
            packages_count = len(result['result'].get('deps-packages', []))
            self.log.debug('gofedlib found %i dependencies',
                           main_deps_count + packages_count)
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(
                    data.get('items') or [])
            else:
                items = data.get('items') or []
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    d for d in items if d['ecosystem'].lower() == 'java-pom'
                ]

        result_data['details'] = [
            self._data_normalizer.handle_data(d, keep_path=keep_path)
            for d in items
        ]
        result_data['status'] = 'success'
        return result_data
Example #20
    def _run_owasp_dep_check(self, scan_path, experimental=False):
        """Run OWASP Dependency-Check."""
        def _clean_dep_check_tmp():
            for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
                rmtree(dcdir)

        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
                self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
                self.update_depcheck_db_on_s3()
                s3.retrieve_depcheck_db_if_exists(temp_data_dir)

            report_path = os.path.join(temp_data_dir, 'report.xml')
            command = [depcheck,
                       '--noupdate',
                       '--format', 'XML',
                       '--project', 'CVEcheckerTask',
                       '--data', temp_data_dir,
                       '--scan', scan_path,
                       '--out', report_path]
            if experimental:
                command.extend(['--enableExperimental'])
            for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                                  '*.xml')):
                command.extend(['--suppress', suppress_xml])

            output = []
            old_java_opts = os.getenv('JAVA_OPTS', '')
            try:
                self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities',
                               scan_path)
                os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         timeout=600)  # 10 minutes
                with open(report_path) as r:
                    report_dict = anymarkup.parse(r.read())
            except (TaskError, FileNotFoundError) as e:
                _clean_dep_check_tmp()
                for line in output:
                    self.log.warning(line)
                self.log.exception(str(e))
                raise FatalTaskError('OWASP Dependency-Check scan failed') from e
            finally:
                os.environ['JAVA_OPTS'] = old_java_opts
            _clean_dep_check_tmp()

        results = []
        dependencies = report_dict.get('analysis', {}).get('dependencies')  # value can be None
        dependencies = dependencies.get('dependency', []) if dependencies else []
        if not isinstance(dependencies, list):
            dependencies = [dependencies]
        for dependency in dependencies:
            vulnerabilities = dependency.get('vulnerabilities')  # value can be None
            vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities else []
            if not isinstance(vulnerabilities, list):
                vulnerabilities = [vulnerabilities]
            for vulnerability in vulnerabilities:
                av = vulnerability.get('cvssAccessVector')
                av = av[0] if av else '?'
                ac = vulnerability.get('cvssAccessComplexity')
                ac = ac[0] if ac else '?'
                au = vulnerability.get('cvssAuthentication')
                au = au[0] if au else '?'
                c = vulnerability.get('cvssConfidentialImpact')
                c = c[0] if c else '?'
                i = vulnerability.get('cvssIntegrityImpact')
                i = i[0] if i else '?'
                a = vulnerability.get('cvssAvailabilityImpact')
                a = a[0] if a else '?'
                vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                    format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
                result = {
                    'cvss': {
                        'score': vulnerability.get('cvssScore'),
                        'vector': vector
                    }
                }
                references = vulnerability.get('references', {}).get('reference', [])
                if not isinstance(references, list):
                    references = [references]
                result['references'] = [r.get('url') for r in references]
                for field in ['severity', 'description']:
                    result[field] = vulnerability.get(field)
                result['id'] = vulnerability.get('name')
                results.append(result)

        return {'summary': [r['id'] for r in results],
                'status': 'success',
                'details': results}
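To make the vector assembly concrete: each CVSS v2 field contributes its first letter, with '?' standing in for missing values. A worked example with illustrative report values:

    # Illustrative input; field names mirror the report keys read above.
    vulnerability = {
        'cvssAccessVector': 'NETWORK',
        'cvssAccessComplexity': 'LOW',
        'cvssAuthentication': 'NONE',
        'cvssConfidentialImpact': 'PARTIAL',
        'cvssIntegrityImpact': 'PARTIAL',
        'cvssAvailabilityImpact': 'PARTIAL',
    }
    # Taking the first letter of each field yields:
    #   "AV:N/AC:L/Au:N/C:P/I:P/A:P"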
Example #21
    def execute(self, arguments):
        self._strict_assert(arguments.get('manifest'))
        self._strict_assert(arguments.get('user_profile'))

        user_profile = arguments['user_profile']
        self.store_in_bucket(user_profile)

        # If we receive a manifest file we need to save it first
        result = []
        for manifest in arguments['manifest']:
            temp_path = mkdtemp()

            with open(os.path.join(temp_path, manifest['filename']),
                      'a+') as fd:
                fd.write(manifest['content'])

            # mercator-go does not work if there is no package.json
            if 'shrinkwrap' in manifest['filename'].lower():
                with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                    f.write(json.dumps({}))

            # TODO: this is a workaround; stack analysis is not handled by the
            # dispatcher, so we create the instance manually for now
            subtask = MercatorTask(None, None, None, None, None)
            # since we're creating MercatorTask dynamically in code, we need to make sure
            #  that it has storage; storage is assigned to tasks dynamically based on task_name
            subtask.task_name = self.task_name
            arguments['ecosystem'] = manifest['ecosystem']
            out = subtask.run_mercator(arguments, temp_path)

            if temp_path:
                rmtree(temp_path, ignore_errors=True)
            if not out["details"]:
                raise FatalTaskError(
                    "No metadata found processing manifest file '{}'".format(
                        manifest['filename']))
            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']

            # If we're handling an external request we need to convert dependency specifications to
            # concrete versions that we can query later on in the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(
                manifest['filename'])
            if 'external_request_id' in arguments:
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml, requirements.txt
                    if "_dependency_tree_lock" in out["details"][
                            0]:  # npm-shrinkwrap.json, requirements.txt
                        manifest_dependencies = out["details"][0][
                            "_dependency_tree_lock"]["dependencies"]
                    else:  # pom.xml
                        manifest_dependencies = out["details"][0][
                            "dependencies"]
                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json

                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({
                                    'package': dep['name'],
                                    'version': dep['version']
                                })
                                _flatten(dep['dependencies'], collect)

                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml, requirements.txt
                        resolved_deps =\
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json
                    resolved_deps = self._handle_external_deps(
                        self.storage.get_ecosystem(arguments['ecosystem']),
                        out["details"][0]["dependencies"])
                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

        return {'result': result, 'user_profile': user_profile}
Example #22
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # make sure we store package name in its normalized form
        arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                                   arguments['name'])

        if pattern_ignore.findall(arguments['version']):
            self.log.info("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))
            raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))

        # Don't try ingestion for private packages
        if is_pkg_public(arguments['ecosystem'], arguments['name']):
            self.log.info("Ingestion flow for {} {}".format(
                arguments['ecosystem'], arguments['name']))
        else:
            self.log.info("Private package ingestion ignored {} {}".format(
                arguments['ecosystem'], arguments['name']))
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        p = Package.get_or_create(db,
                                  ecosystem_id=ecosystem.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                arguments['analysis_already_exists'] = True
                self.log.debug(
                    "Arguments returned by initAnalysisFlow without force: {}".
                    format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)
        npm_dir = self.configuration.NPM_DATA_DIR

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                            .format(n=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(e)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)
            if arguments['ecosystem'] == "npm":
                shutil.rmtree(npm_dir, True)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug(
            "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
Example #23
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('data'))
        self._strict_assert(arguments.get('external_request_id'))

        db = self.storage.session
        try:
            results = db.query(StackAnalysisRequest)\
                        .filter(StackAnalysisRequest.id == arguments.get('external_request_id'))\
                        .first()
        except SQLAlchemyError:
            db.rollback()
            raise

        manifests = []
        if results is not None:
            row = results.to_dict()
            request_json = row.get("requestJson", {})
            manifests = request_json.get('manifest', [])

        # If we receive a manifest file we need to save it first
        result = []
        for manifest in manifests:
            with TemporaryDirectory() as temp_path:
                with open(os.path.join(temp_path, manifest['filename']),
                          'a+') as fd:
                    fd.write(manifest['content'])

                # mercator-go does not work if there is no package.json
                if 'shrinkwrap' in manifest['filename'].lower():
                    with open(os.path.join(temp_path, 'package.json'),
                              'w') as f:
                        f.write(json.dumps({}))

                # Create instance manually since stack analysis is not handled by dispatcher
                subtask = MercatorTask.create_test_instance(
                    task_name=self.task_name)
                arguments['ecosystem'] = manifest['ecosystem']
                out = subtask.run_mercator(arguments, temp_path)

            if not out["details"]:
                raise FatalTaskError(
                    "No metadata found processing manifest file '{}'".format(
                        manifest['filename']))

            if 'dependencies' not in out['details'][0] and out.get(
                    'status', None) == 'success':
                raise FatalTaskError(
                    "Dependencies could not be resolved from manifest file '{}'"
                    .format(manifest['filename']))

            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']
            out["details"][0]['manifest_file_path'] = manifest.get(
                'filepath', 'File path not available')

            # If we're handling an external request we need to convert dependency specifications to
            # concrete versions that we can query later on in the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(
                manifest['filename'])
            if 'external_request_id' in arguments:
                manifest_dependencies = []
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                    if "_dependency_tree_lock" in out["details"][
                            0]:  # npm-shrinkwrap.json
                        if 'dependencies' in out['details'][0][
                                "_dependency_tree_lock"]:
                            manifest_dependencies = out["details"][0][
                                "_dependency_tree_lock"].get(
                                    "dependencies", [])
                    else:  # pom.xml
                        if 'dependencies' in out['details'][0]:
                            manifest_dependencies = out["details"][0].get(
                                "dependencies", [])
                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json

                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({
                                    'package': dep['name'],
                                    'version': dep['version']
                                })
                                _flatten(dep['dependencies'], collect)

                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml
                        resolved_deps =\
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json, requirements.txt
                    resolved_deps = self._handle_external_deps(
                        self.storage.get_ecosystem(arguments['ecosystem']),
                        out["details"][0]["dependencies"])
                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

        return {'result': result}
Example #24
    def execute(self, arguments, db, manifests, source=None):
        """Dependency finder logic."""
        # TODO: reduce cyclomatic complexity
        # If we receive a manifest file we need to save it first
        result = []
        for manifest in manifests:
            content_hash = None
            if source == 'osio':
                content_hash = generate_content_hash(manifest['content'])
                current_app.logger.info("{} file digest is {}".format(manifest['filename'],
                                                                      content_hash))

                s3 = AmazonS3(bucket_name='boosters-manifest')
                try:
                    s3.connect()
                    manifest['content'] = s3.retrieve_blob(content_hash).decode('utf-8')
                except ClientError as e:
                    current_app.logger.error("Unexpected error while retrieving S3 data: %s" % e)
                    raise

            with TemporaryDirectory() as temp_path:
                with open(os.path.join(temp_path, manifest['filename']), 'a+') as fd:
                    fd.write(manifest['content'])

                # mercator-go does not work if there is no package.json
                if 'shrinkwrap' in manifest['filename'].lower():
                    with open(os.path.join(temp_path, 'package.json'), 'w') as f:
                        f.write(json.dumps({}))

                # Create instance manually since stack analysis is not handled by dispatcher
                subtask = MercatorTask.create_test_instance(task_name='metadata')
                arguments['ecosystem'] = manifest['ecosystem']
                out = subtask.run_mercator(arguments, temp_path, resolve_poms=False)

            if not out["details"]:
                raise FatalTaskError("No metadata found processing manifest file '{}'"
                                     .format(manifest['filename']))

            if 'dependencies' not in out['details'][0] and out.get('status', None) == 'success':
                raise FatalTaskError("Dependencies could not be resolved from manifest file '{}'"
                                     .format(manifest['filename']))

            out["details"][0]['manifest_file'] = manifest['filename']
            out["details"][0]['ecosystem'] = manifest['ecosystem']
            out["details"][0]['manifest_file_path'] = manifest.get('filepath',
                                                                   'File path not available')

            # If we're handling an external request we need to convert dependency specifications to
            # concrete versions that we can query later on in the `AggregatorTask`
            manifest_descriptor = get_manifest_descriptor_by_filename(manifest['filename'])
            if 'external_request_id' in arguments:
                manifest_dependencies = []
                if manifest_descriptor.has_resolved_deps:  # npm-shrinkwrap.json, pom.xml
                    if "_dependency_tree_lock" in out["details"][0]:  # npm-shrinkwrap.json
                        if 'dependencies' in out['details'][0]["_dependency_tree_lock"]:
                            manifest_dependencies = out["details"][0]["_dependency_tree_lock"].get(
                                "dependencies", [])
                    else:  # pom.xml
                        if 'dependencies' in out['details'][0]:
                            manifest_dependencies = out["details"][0].get("dependencies", [])
                    if manifest_descriptor.has_recursive_deps:  # npm-shrinkwrap.json
                        def _flatten(deps, collect):
                            for dep in deps:
                                collect.append({'package': dep['name'], 'version': dep['version']})
                                _flatten(dep['dependencies'], collect)
                        resolved_deps = []
                        _flatten(manifest_dependencies, resolved_deps)
                    else:  # pom.xml
                        resolved_deps =\
                            [{'package': x.split(' ')[0], 'version': x.split(' ')[1]}
                             for x in manifest_dependencies]
                else:  # package.json, requirements.txt
                    resolved_deps = self._handle_external_deps(
                        Ecosystem.by_name(db, arguments['ecosystem']),
                        out["details"][0]["dependencies"])

                out["details"][0]['_resolved'] = resolved_deps
            result.append(out)

        return {'result': result}
Example #25
 def _strict_assert(self, assert_cond):
     if not assert_cond:
         raise FatalTaskError("Strict assert failed.")
Example #26
    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300,
                     resolve_poms=True):
        """Run mercator tool."""
        # TODO: reduce cyclomatic complexity
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)

        tc = TimedCommand(['mercator', mercator_target])
        update_env = {
            'MERCATOR_JAVA_RESOLVE_POMS': 'true'
        } if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout,
                                   is_json=True,
                                   update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
            if items == [None]:
                raise NotABugFatalTaskError(
                    'Found no usable PKG-INFO/metadata.json/requirements.txt')
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(
                    data.get('items') or [])
            else:
                items = data.get('items') or []
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    d for d in items if d['ecosystem'].lower() == 'java-pom'
                ]
            elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
                # ignore other metadata files, e.g. requirements.txt
                items = [d for d in items if d['ecosystem'].lower() == 'npm']
            elif arguments['ecosystem'] == 'go':
                items = [
                    d for d in items if d['ecosystem'].lower() == 'go-glide'
                ]
                if not items:
                    # Mercator found no Go Glide files, run gofedlib
                    items = self.run_gofedlib(topdir=mercator_target,
                                              name=arguments.get('name'),
                                              version=arguments.get('version'),
                                              timeout=timeout)

        result_data['details'] = [
            self._data_normalizer.handle_data(d, keep_path=keep_path)
            for d in items
        ]
        result_data['status'] = 'success'
        return result_data
Example #27
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        # make sure we store package name based on ecosystem package naming case sensitivity
        arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
        v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we scale out and 2+ workers run
            # this task, they could schedule two flows of the same type at the
            # same time
            if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
                # we need to propagate flags that were passed to flow, but not
                # E/P/V - this way we are sure that for example graph import is
                # scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                               .format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path
                )
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                    arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                            format(n=arguments.get('name'),
                                   v=arguments.get('version'),
                                   err=str(e))
                        )

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments