class LibrariesIoTask(BaseTask):
    """Collects statistics from Libraries.io."""

    _analysis_name = "libraries_io"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    @staticmethod
    def recent_releases(versions, count=10):
        """Sort versions by 'published_at' and return 'count' latest."""
        ordered = sorted(versions, key=itemgetter('published_at'))
        return ordered[-count:]

    def execute(self, arguments):
        """Task entrypoint."""
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        ecosystem = arguments['ecosystem']
        name = arguments['name']
        if ecosystem == 'go':
            # percent-encode the name so it is safe inside a URL path
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(ecosystem, name)
        project = get_response(project_url)
        versions = project['versions']
        details = {
            'dependent_repositories': {'count': project['dependent_repos_count']},
            'dependents': {'count': project['dependents_count']},
            'releases': {'count': len(versions),
                         'recent': self.recent_releases(versions)},
        }

        return {'status': 'success', 'summary': [], 'details': details}
class OSCryptoCatcherTask(BaseTask):
    """Run the oscryptocatcher tool for matching crypto algorithms."""

    _analysis_name = 'crypto_algorithms'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: dict with 'status', 'summary' and 'details'
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(
            arguments).get_extracted_source_tarball()

        results = {'status': 'unknown', 'summary': {}, 'details': []}

        try:
            oscc = TimedCommand.get_command_output(
                ['oscryptocatcher', '--subdir-in-result', cache_path],
                graceful=False,
                is_json=True)

            self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
            results['details'] = oscc['details']
            results['summary'] = oscc['summary']
            results['status'] = 'success'
        except Exception:
            # Keep the task best-effort, but record what went wrong instead
            # of swallowing the error silently (the original did `pass`-style
            # handling with no trace).
            self.log.exception("oscryptocatcher failed for %s", cache_path)
            results['status'] = 'error'

        return results
예제 #3
0
class ComponentAnalyses(ResourceWithSchema):
    """Endpoint serving component analyses from the graph database."""

    method_decorators = [login_required]

    schema_ref = SchemaRef('analyses_graphdb', '1-2-0')

    @staticmethod
    def get(ecosystem, package, version):
        """Return graph analyses for the given EPV, scheduling analysis when unknown."""
        user_profile = decode_token()
        if ecosystem == 'maven':
            package = MavenCoordinates.normalize_str(package)
        package = case_sensitivity_transform(ecosystem, package)

        result = get_analyses_from_graph(ecosystem, package, version)
        if result is not None:
            # Known component for Bayesian
            server_create_component_bookkeeping(ecosystem, package, version,
                                                user_profile)
            return result

        if os.environ.get("INVOKE_API_WORKERS", "") == "1":
            # Enter the unknown path
            server_create_analysis(ecosystem, package, version,
                                   user_profile=user_profile, api_flow=True,
                                   force=False, force_graph_sync=True)
            msg = "Package {ecosystem}/{package}/{version} is unavailable. The package will be available shortly,"\
                  " please retry after some time.".format(ecosystem=ecosystem, package=package, version=version)
            raise HTTPError(202, msg)

        server_create_analysis(ecosystem, package, version,
                               user_profile=user_profile, api_flow=False,
                               force=False, force_graph_sync=True)
        msg = "No data found for {ecosystem} Package {package}/{version}".format(
            ecosystem=ecosystem, package=package, version=version)
        raise HTTPError(404, msg)
class BlackDuckRelease(object):
    """A single project release.

    Wraps the version string, its unique identifier and the
    `datetime.datetime` at which this particular version was released.
    """

    @schema.input(SchemaRef("blackduck-release", "1-0-0"))
    def __init__(self, json_data, project):
        self._project = project
        self._version = json_data['version']
        self._id = json_data['versionId']
        self._released_at = datetime.strptime(json_data['releasedOn'],
                                              "%Y-%m-%dT%H:%M:%S.%fZ")

    @property
    def project(self):
        """Project this release belongs to."""
        return self._project

    @property
    def version(self):
        """Release version string."""
        return self._version

    @property
    def id(self):
        """Unique identifier of this release."""
        return self._id

    @property
    def released_at(self):
        """Release date and time."""
        return self._released_at
예제 #5
0
class LinguistTask(BaseTask):
    """GitHub's tool to figure out what language is used in code."""

    _analysis_name = 'languages'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _parse_linguist(self, output):
        """Parse linguist output lines into a dict.

        :param output: list of output lines from the `linguist` tool
        :return: dict with 'type', 'mime', 'language', 'lines' and 'sloc',
                 or None for empty output
        """
        if not output:
            return None

        def extract_value(line):
            """Extract the language name: `language:   Python` -> `Python`."""
            return line.split(':', 1)[1].strip()

        # First output line looks like: "108 lines (87 sloc)"
        lines_matcher = re.compile(r'(\d+) lines \((\d+) sloc\)')
        m = lines_matcher.search(output[0])
        lines, sloc = 0, 0
        if m:
            # BUGFIX: the original used `m.groups(1)[0]` and `m.groups(2)[0]`,
            # which BOTH return the first group (the argument of groups() is
            # only a default for non-participating groups), so sloc was always
            # set to the lines count.
            lines, sloc = int(m.group(1)), int(m.group(2))
        tml = zip(['type', 'mime', 'language'],
                  [extract_value(line) for line in output[1:4]])
        return dict(tml, lines=lines, sloc=sloc)

    def execute(self, arguments):
        """Start the task.

        :param arguments: dictionary with task arguments
        :return: dict with 'summary', 'status' and 'details'
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        results = []
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

        def worker(path):
            """Collect file type, mime type and linguist output for one file."""
            mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
            self.log.debug("%s mime = %s", path, mime)
            typ = TimedCommand.get_command_output(['file', path, '-b'])
            self.log.debug("%s filetype = %s", path, typ)

            linguist = None
            # linguist output is only meaningful for text files
            if 'charset=binary' not in mime:
                linguist = self._parse_linguist(
                    TimedCommand.get_command_output(['linguist', path])
                )
                self.log.debug("%s linguist output = %s", path, linguist)

            results.append({
                "type": typ,
                "output": linguist,
                "path": os.path.relpath(path, cache_path),
            })

        with ThreadPool(target=worker) as tp:
            for path in get_all_files_from(cache_path, path_filter=skip_git_files):
                tp.add_task(path)

        return {'summary': [], 'status': 'success', 'details': results}
예제 #6
0
class DigesterTask(BaseTask):
    """Computes various digests of all files found in target cache path."""

    _analysis_name = 'digests'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def compute_ssdeep(self, target):
        """Compute SSdeep piece-wise linear hash of target."""
        # ssdeep output: line 0 is the header, line 1 is "hash,filename"
        output = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
        try:
            return output[1].split(',')[0].strip()
        except IndexError as exc:
            self.log.error("unable to compute ssdeep of %r", target)
            raise RuntimeError("can't compute digest of %r" % target) from exc

    def compute_digests(self, cache_path, f, artifact=False):
        """Compute sha256/sha1/md5/ssdeep digests of file f."""
        f_digests = {
            'sha256': compute_digest(f, 'sha256'),
            'sha1': compute_digest(f, 'sha1'),
            'md5': compute_digest(f, 'md5'),
            'ssdeep': self.compute_ssdeep(f),
        }

        if artifact:
            f_digests['artifact'] = True
            f_digests['path'] = os.path.basename(f)
        else:
            f_digests['path'] = os.path.relpath(f, cache_path)

        return f_digests

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        epv_cache = ObjectCache.get_from_dict(arguments)

        # We don't compute digests of files in the extracted tarball,
        # only of the tarball itself, which is marked as an artifact.
        source_tarball_path = epv_cache.get_source_tarball()
        results = [
            self.compute_digests(None, source_tarball_path, artifact=True)
        ]

        return {'summary': [], 'status': 'success', 'details': results}
예제 #7
0
class LibrariesIoTask(BaseTask):
    """Collects statistics from Libraries.io."""

    _analysis_name = "libraries_io"
    schema_ref = SchemaRef(_analysis_name, '2-0-0')

    @staticmethod
    def recent_releases(versions, count=10):
        """Sort versions by 'published_at' and return 'count' latest."""
        latest = sorted(versions, key=itemgetter('published_at'))[-count:]
        # keep only the fields the result schema needs
        return [{"number": ver['number'], "published_at": ver['published_at']}
                for ver in latest]

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        rdb_session = StoragePool.get_connected_storage(
            'BayesianPostgres').session

        ecosystem = arguments['ecosystem']
        name = arguments['name']
        if ecosystem == 'go':
            # percent-encode the name so it is safe inside a URL path
            name = quote(name, safe='')

        project_url = self.configuration.libraries_io_project_url(
            Ecosystem.by_name(rdb_session, ecosystem), name)
        project = get_response(project_url)
        versions = project['versions']
        details = {
            'dependent_repositories': {
                'count': project['dependent_repos_count']
            },
            'dependents': {
                'count': project['dependents_count']
            },
            'releases': {
                'count': len(versions),
                'recent': self.recent_releases(versions)
            },
        }

        return {'status': 'success', 'summary': [], 'details': details}
 def test_schema_lookup(self, tmpdir):
     """Missing schema raises SchemaLookupError; once written it loads."""
     library = SchemaLibrary(str(tmpdir))
     ref = SchemaRef("example", "1-0-0")
     with pytest.raises(SchemaLookupError):
         library.load_schema(ref)
     dummy_schema = {"dummy-schema": "example"}
     serialized = json.dumps(dummy_schema).encode('utf-8')
     tmpdir.join("example-v1-0-0.schema.json").write_binary(serialized)
     assert library.read_binary_schema(ref) == serialized
     assert library.load_schema(ref) == dummy_schema
class KeywordsTaggingTask(KeywordsTaggingTaskBase):
    """Compute tags based on gathered natural text - package-version level keywords."""

    _analysis_name = 'keywords_tagging'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _package_version_level_keywords(self, keywords_file_name,
                                        stopwords_file_name, arguments):
        """Compute package-version level keywords from metadata.

        :param keywords_file_name: path to the keywords file
        :param stopwords_file_name: path to the stopwords file
        :param arguments: task arguments (unused here; kept for signature parity)
        :return: dict with 'description' and 'keywords' entries when metadata
                 is available, otherwise empty
        """
        # Keep f8a_tagger import local as other components dependent on
        # f8a_worker do not require it installed.
        from f8a_tagger import lookup_text as keywords_lookup_text

        details = {}
        # membership test directly on the mapping; `.keys()` was redundant
        if 'metadata' in self.parent:
            details['description'] = {}
            metadata = self.parent_task_result('metadata')
            description = metadata.get('details',
                                       [{}])[0].get('description', '')
            if description:
                self.log.debug("Computing keywords from description: '%s'",
                               description)
                details['description'] = keywords_lookup_text(
                    description,
                    keywords_file=keywords_file_name,
                    stopwords_file=stopwords_file_name,
                    **self._LOOKUP_CONF)

            # explicitly gather declared keywords by publisher
            self.log.debug(
                "Aggregating explicitly stated keywords by publisher")
            details['keywords'] = metadata.get('details',
                                               [{}])[0].get('keywords', [])

        return details

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        keywords_file_name, stopwords_file_name = self._get_config_files(
            arguments['ecosystem'])
        details = self._package_version_level_keywords(keywords_file_name,
                                                       stopwords_file_name,
                                                       arguments)

        return {'status': 'success', 'summary': [], 'details': details}
 def test_bundled_schema_lookup(self, tmpdir):
     """Schemas bundled inside a package resolve once the file is written."""
     pkgdir = tmpdir.mkdir(tmpdir.basename)
     pkgdir.ensure("__init__.py")
     schemadir = pkgdir.mkdir("schemas")
     module = pkgdir.pyimport()
     library = BundledSchemaLibrary("schemas", module.__name__)
     ref = SchemaRef("example", "1-0-0")
     with pytest.raises(SchemaLookupError):
         library.load_schema(ref)
     dummy_schema = {"dummy-schema": "example"}
     serialized = json.dumps(dummy_schema).encode('utf-8')
     schemadir.join("example-v1-0-0.schema.json").write_binary(serialized)
     assert library.read_binary_schema(ref) == serialized
     assert library.load_schema(ref) == dummy_schema
예제 #11
0
class BinwalkTask(BaseTask):
    """Find and extract interesting files / data from binary images."""

    _analysis_name = 'binary_data'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    @staticmethod
    def parse_binwalk(output):
        """Parse binwalk tool output and accumulate descriptions.

        :param output: list of binwalk output lines
        :return: list of description strings, or None for empty output
        """
        if not output:
            return None
        import re
        # binwalk lines look like: "<decimal offset> <hex offset> <description>"
        matcher = re.compile(r'^\d{,8}\s*0x[A-Fa-f0-9]{,8}\s*(.*)$')
        matched = []
        for line in output:
            match = matcher.match(line)
            if match:
                # Use group(1) for the captured description; the original
                # `match.groups(1)[0]` only worked by accident (the argument
                # of groups() is a default for non-participating groups).
                matched.append(match.group(1))
        return matched

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        cache_path = ObjectCache.get_from_dict(arguments).get_source_tarball()

        results = []
        for path in get_all_files_from(cache_path, path_filter=skip_git_files):
            self.log.debug("path = %s", path)

            bw = TimedCommand(['binwalk', '-B', path])
            status, output, error = bw.run(timeout=60)
            self.log.debug("status = %s, error = %s", status, error)
            self.log.debug("output = %s", output)

            parsed_binwalk = self.parse_binwalk(output)
            results.append({
                "path": os.path.relpath(path, cache_path),
                "output": parsed_binwalk,
            })
        return {'summary': [], 'status': 'success', 'details': results}
예제 #12
0
class StackAnalysesByGraphGET(ResourceWithSchema):
    """Endpoint returning aggregated stack analysis and recommendation results."""

    method_decorators = [login_required]
    schema_ref = SchemaRef('stack_analyses', '2-1-4')

    @staticmethod
    def get(external_request_id):
        """Return stack analysis and recommendation for a request ID.

        :param external_request_id: ID of a previously submitted analysis request
        :raises HTTPError: 404 for unknown or missing IDs, 202 while in progress
        """
        if get_request_count(rdb, external_request_id) < 1:
            raise HTTPError(404, "Invalid request ID '{t}'.".format(t=external_request_id))

        stack_result = retrieve_worker_result(rdb, external_request_id, "stack_aggregator")
        reco_result = retrieve_worker_result(rdb, external_request_id, "recommendation")

        if stack_result is None and reco_result is None:
            raise HTTPError(202, "Analysis for request ID '{t}' is in progress".format(t=external_request_id))

        if stack_result == -1 and reco_result == -1:
            raise HTTPError(404, "Worker result for request ID '{t}' doesn't exist yet".format(t=external_request_id))

        started_at = None
        finished_at = None
        manifest_response = []
        recommendations = {}

        # PEP 8: compare against None with `is (not)`, never (in)equality.
        if stack_result is not None and 'task_result' in stack_result:
            if stack_result["task_result"] is not None:
                started_at = stack_result["task_result"]["_audit"]["started_at"]
                finished_at = stack_result["task_result"]["_audit"]["ended_at"]
                manifest_response.append(stack_result["task_result"])

        if reco_result is not None and 'task_result' in reco_result:
            if reco_result["task_result"] is not None:
                recommendations = reco_result['task_result']

        return {
            "started_at": started_at,
            "finished_at": finished_at,
            "request_id": external_request_id,
            "result": manifest_response,
            "recommendation": recommendations
        }
class BlackDuckProject(object):
    """Information about a specific {ecosystem}-{package} pair."""

    @schema.input(SchemaRef("blackduck-project", "1-0-0"))
    def __init__(self, json_data):
        self._source = json_data
        self._name = json_data['name']
        self._id = json_data['id']
        self._canonical_release_id = json_data['canonicalReleaseId']
        # collect every "...Url" entry as an additional URL
        self._urls = {key: value
                      for key, value in json_data.items()
                      if key.endswith('Url')}

    @property
    def name(self):
        """Name of the project."""
        return self._name

    @property
    def id(self):
        """Unique identifier of the project."""
        return self._id

    @property
    def urls(self):
        """Flat list of additional URLs for this project."""
        return self._urls

    @property
    def canonical_release_id(self):
        """Latest release for the given project (in terms of version number)."""
        return self._canonical_release_id

    @property
    def source(self):
        """Source JSON from which this object was parsed."""
        return self._source
예제 #14
0
 def test_bundled_dynamic_schema_lookup(self, tmpdir, monkeypatch):
     """Tests for bundled dynamic schema lookup."""
     pkgdir = tmpdir.mkdir(tmpdir.basename)
     pkgdir.ensure("__init__.py")
     schemadir = pkgdir.mkdir("schemas")
     schemadir.ensure("__init__.py")
     library = BundledDynamicSchemaLibrary('.'.join(
         [tmpdir.basename, "schemas"]))
     schema1 = SchemaRef("example", "1-0-0")
     schema2 = SchemaRef("example2", "1-0-0")
     schema3 = SchemaRef("example3", "1-0-0")
     schema4 = SchemaRef("example4", "1-0-0")
     schema5 = SchemaRef("example4", "2-0-0")  # intentionally example4
     schema6 = SchemaRef("example6", "2-0-0")
     with pytest.raises(SchemaImportError):
         library.load_schema_class_and_role(schema1)
     # sch2 doesn't have the ROLE_v1_0_0 variable
     sch2 = "import jsl;\nclass Schema(jsl.Document):\n x = jsl.StringField()\n"
     # sch3 doesn't have THE_SCHEMA variable
     sch3 = sch2 + "\nROLE_v1_0_0 = 'v1-0-0'\n"
     # sch4 is ok
     sch4 = sch3 + "\nTHE_SCHEMA = Schema\n"
     # no sch5; sch6 is ok and has two roles
     sch6 = sch4 + "\nROLE_v2_0_0 = 'v2-0-0'\n"
     schemadir.join("example2.py").write(sch2)
     schemadir.join("example3.py").write(sch3)
     schemadir.join("example4.py").write(sch4)
     schemadir.join("example6.py").write(sch6)
     monkeypatch.syspath_prepend(pkgdir.dirname)
     with pytest.raises(SchemaModuleAttributeError):
         library.load_schema_class_and_role(schema2)
     with pytest.raises(SchemaModuleAttributeError):
         library.load_schema_class_and_role(schema3)
     klass, role = library.load_schema_class_and_role(schema4)
     assert "x" in dir(klass)
     assert role == "v1-0-0"
     with pytest.raises(SchemaModuleAttributeError):
         # example 5 is the same as example 4, but doesn't have the required version 2-0-0
         library.load_schema_class_and_role(schema5)
     klass6, role6 = library.load_schema_class_and_role(schema6)
     # BUGFIX: assert on the class loaded for schema6, not the earlier `klass`
     # (the original re-checked `klass`, leaving klass6 unverified).
     assert "x" in dir(klass6)
     assert role6 == "v2-0-0"
예제 #15
0
class BlackDuckTask(BaseTask):
    """Scan the package using the Black Duck Hub and its CLI scanner."""
    _analysis_name = 'blackduck'
    # ecosystems supported for Black Duck analysis, identified by *name*
    _valid_ecosystems = ["npm", "maven", "pypi"]
    # whether a CLI scan may be issued when no release data exists yet
    _allow_cli_scan = True
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    # timeout (in seconds) for one Black Duck CLI scan run
    _BLACKDUCK_CLI_TIMEOUT = 600

    def _format_hub_url(self):
        """
        Format Hub connection string from supplied config

        :return: str, "{scheme}://{host}:{port}/" URL of the Hub
        """
        return "{scheme}://{host}:{port}/".format(
            scheme=self.configuration.BLACKDUCK_SCHEME,
            host=self.configuration.BLACKDUCK_HOST,
            port=self.configuration.BLACKDUCK_PORT)

    def _is_valid_ecosystem(self, ecosystem_id):
        """
        Determine whether the given ecosystem is valid for
        Black Duck analysis

        :param ecosystem_id: str, ecosystem *name* such as "npm" — despite the
            parameter name, values are compared against the _valid_ecosystems
            name strings (execute() passes arguments['ecosystem'])
        :return: bool
        """
        return ecosystem_id in self._valid_ecosystems

    def _find_blackduck_cli_root(self):
        """
        Find the base directory where the BlackDuck CLI got
        extracted

        :return: str, path to the CLI root
        :raises TaskError: when zero or more than one CLI directory is found
        """
        base = self.configuration.BLACKDUCK_PATH
        dirs = listdir(base)
        if not dirs:
            raise TaskError("Unable to find BlackDuck CLI directory")
        if len(dirs) > 1:
            raise TaskError("More than 1 BlackDuck CLI directory")

        return path.join(base, dirs.pop())

    def _prepare_command(self, project, version, archive):
        """
        Prepare the necessary CLI parameters

        :param project: str, name of the project
        :param version: str, version of the release
        :param archive: str, path to the archive with the sources
        :return: List[str], command list ready to be run
        """

        binary = "{base}/{rel}".format(base=self._find_blackduck_cli_root(),
                                       rel="bin/scan.cli.sh")

        return [
            binary, "--host", self.configuration.BLACKDUCK_HOST, "--port",
            str(int(self.configuration.BLACKDUCK_PORT)), "--scheme",
            self.configuration.BLACKDUCK_SCHEME, "--username",
            self.configuration.BLACKDUCK_USERNAME, "--project", project,
            "--release", version, archive
        ]

    def _get_release(self, hub, project, version):
        """
        Get release ID for given project version

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: BlackDuckRelease object or None if not found
        """
        # check that the specified project exists
        proj = hub.find_project(project)
        if not proj:
            return None

        # check that we have the proper version
        releases = hub.get_releases(proj)
        return releases.get(version, None)

    def _release_data(self, hub, project, version):
        """
        Fetch release data for the given project and version

        :param hub: BlackDuckHub, hub object to use
        :param project: str, name of the project
        :param version: str, version
        :return: dict, BoM information about the release, or None when the
            project or version is unknown to the Hub
        """
        release = self._get_release(hub, project, version)
        if release is None:
            return None
        return hub.get_release_bom_json(release)

    def _get_hub(self):
        """Create a BlackDuckHub session from the configured credentials.

        :return: BlackDuckHub, connected hub object
        """
        # connect to the Black Duck Hub
        hub_url = self._format_hub_url()
        self.log.debug("hub url: {url}".format(url=hub_url))
        hub = BlackDuckHub(hub_url)
        hub.connect_session(self.configuration.BLACKDUCK_USERNAME,
                            self.configuration.BLACKDUCK_PASSWORD)
        return hub

    def _get_project_name(self, arguments):
        """Build the ecosystem-namespaced Hub project name, e.g. "npm-crumb"."""
        return "{ecosystem}-{package}".format(ecosystem=arguments['ecosystem'],
                                              package=arguments['name'])

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments; requires
            'ecosystem', 'name' and 'version'
        :return: dict with 'status', 'summary' and 'details'
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        result_data = {'status': 'unknown', 'summary': [], 'details': {}}

        if self._is_valid_ecosystem(arguments['ecosystem']):
            hub = self._get_hub()

            # BlackDuck project doesn't have a notion of ecosystem, so we need to
            # namespace the project names ourselves, so for package `crumb` in the NPM ecosystem
            # we'll end up with the name `npm-crumb`
            project = self._get_project_name(arguments)
            version = arguments['version']

            # Check if the given project had already been scanned
            data = self._release_data(hub, project, version)

            if not data and self._allow_cli_scan:
                self.log.debug("No data available for project {p} {v}".format(
                    p=project, v=version))
                # No data available, issue a new scan and re-query release data
                source_tarball_path = ObjectCache.get_from_dict(
                    arguments).get_source_tarball()
                command = self._prepare_command(project, version,
                                                source_tarball_path)
                self.log.debug(
                    "Executing command, timeout={timeout}: {cmd}".format(
                        timeout=self._BLACKDUCK_CLI_TIMEOUT, cmd=command))
                bd = TimedCommand(command)
                status, output, error = \
                    bd.run(timeout=self._BLACKDUCK_CLI_TIMEOUT,
                           update_env={'BD_HUB_PASSWORD': self.configuration.BLACKDUCK_PASSWORD})
                self.log.debug("status = %s, error = %s", status, error)
                self.log.debug("output = %s", output)
                data = self._release_data(hub, project, version)

            self.log.debug("Release data for project {p} {v}: {d}".format(
                p=project, v=version, d=data))
            result_data['details'] = data
            result_data['status'] = 'success' if data else 'error'
        else:
            # unsupported ecosystem — report an error result without scanning
            result_data['status'] = 'error'

        return result_data
예제 #16
0
class GithubTask(BaseTask):
    """Collects statistics using Github API."""

    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '2-0-2')
    # used for testing
    _repo_name = None
    _repo_url = None

    # Class-level default headers; execute() rebinds an instance-level merged
    # copy when adding a token header, so this shared dict is never mutated.
    _headers = {
        'Accept':
        'application/vnd.github.mercy-preview+json, '  # for topics
        'application/vnd.github.v3+json'  # recommended by GitHub for License API
    }

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        """Create instance of task for tests."""
        assert cls
        instance = super().create_test_instance()
        # set for testing as we are not querying DB for mercator results
        instance._repo_name = repo_name
        instance._repo_url = repo_url
        return instance

    def _get_last_years_commits(self, repo_url):
        """Get weekly commit activity for last year.

        :param repo_url: API URL of the repository
        :return: list of weekly commit totals ([] when the API call fails)
        """
        try:
            activity = get_response(
                urljoin(repo_url + '/', "stats/commit_activity"),
                self._headers)
        except NotABugTaskError as e:
            self.log.debug(e)
            return []
        return [x['total'] for x in activity]

    def _get_repo_stats(self, repo):
        """Collect various repository properties.

        :param repo: repository JSON as returned by the GitHub API
        :return: dict with contributors_count plus each REPO_PROPS value
        """
        try:
            if repo.get('contributors_url', ''):
                contributors = get_response(repo.get('contributors_url', ''),
                                            self._headers)
            else:
                contributors = {}
        except NotABugTaskError as e:
            self.log.debug(e)
            contributors = {}
        d = {
            'contributors_count':
            len(list(contributors)) if contributors is not None else 'N/A'
        }
        for prop in REPO_PROPS:
            d[prop] = repo.get(prop, -1)
        return d

    def _get_repo_name(self, url):
        """Retrieve GitHub repo from a preceding Mercator scan."""
        parsed = parse_gh_repo(url)
        if not parsed:
            self.log.debug('Could not parse Github repo URL %s', url)
        else:
            self._repo_url = 'https://github.com/' + parsed
        return parsed

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        result_data = {'status': 'unknown', 'summary': [], 'details': {}}
        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._get_repo_name(arguments['url'])
            if self._repo_name is None:
                # Not a GitHub hosted project
                return result_data

        try:
            _, header = self.configuration.select_random_github_token()
            # BUGFIX: the original called self._headers.update(header), which
            # mutated the CLASS-level dict shared by all instances, leaking
            # one task's token into every other task. Rebind a merged copy
            # on the instance instead.
            self._headers = {**self._headers, **header}
        except F8AConfigurationException as e:
            self.log.error(e)
            raise FatalTaskError from e

        repo_url = urljoin(self.configuration.GITHUB_API + "repos/",
                           self._repo_name)
        try:
            repo = get_response(repo_url, self._headers)
        except NotABugTaskError as e:
            self.log.error(e)
            raise NotABugFatalTaskError from e

        result_data['status'] = 'success'

        issues = {}
        # Get Repo Statistics
        notoriety = self._get_repo_stats(repo)

        if notoriety:
            issues.update(notoriety)
        issues['topics'] = repo.get('topics', [])
        issues['license'] = repo.get('license') or {}

        # Get Commit Statistics
        last_year_commits = self._get_last_years_commits(repo['url'])
        commits = {
            'last_year_commits': {
                'sum': sum(last_year_commits),
                'weekly': last_year_commits
            }
        }
        t_stamp = datetime.datetime.utcnow()
        refreshed_on = {'updated_on': t_stamp.strftime("%Y-%m-%d %H:%M:%S")}
        issues.update(refreshed_on)
        issues.update(commits)
        result_data['details'] = issues
        return result_data
예제 #17
0
class MercatorTask(BaseTask):
    """Collects `Release` specific information from Mercator."""

    _analysis_name = 'metadata'
    # name used for the flow-level dependency tree lock
    _dependency_tree_lock = '_dependency_tree_lock'
    schema_ref = SchemaRef(_analysis_name, '3-3-0')
    # shared, stateless normalizer instance used for all items
    _data_normalizer = DataNormalizer()

    def _parse_requires_txt(self, path):
        """Parse an egg-info style ``requires.txt`` file.

        Only the leading unnamed section is collected; the first ini-like
        ``[section]`` header ends the runtime requirements.  Any error is
        logged and a possibly-empty list is returned (best effort).

        :param path: str, path to the requires.txt file
        :return: list of requirement strings
        """
        requires = []
        try:
            with open(path, 'r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if line.startswith('['):
                        # the first named ini-like [section] ends the runtime requirements
                        break
                    elif line:
                        requires.append(line)
        except Exception as e:
            self.log.warning('Failed to process "{p}": {e}'.format(p=path,
                                                                   e=str(e)))

        return requires

    def _merge_python_items(self, topdir, data):
        """Pick the best source of Python metadata from mercator's items.

        Chooses the outermost metadata.json (wheel), PKG-INFO (sdist/egg)
        or requirements.txt, and for PKG-INFO enriches the result with
        requirements parsed from requires.txt / requirements.txt.

        :param topdir: str, path to the extracted package root
        :param data: dict, raw mercator output containing an 'items' list
        :return: the selected item dict, or None if nothing usable was found
        """
        # TODO: reduce cyclomatic complexity
        metadata_json = None
        pkg_info = None
        requirements_txt = None

        def get_depth(path):
            """Return the directory depth of *path* (count of '/' separators)."""
            return path.rstrip('/').count('/')

        def is_deeper(item1, item2):
            """Return True if item1 is deeper in directory hierarchy than item2."""
            if item1 is None:
                return True
            return get_depth(item1['path']) > get_depth(item2['path'])

        if not data.get('items'):
            return None

        # find outermost PKG_INFO/metadata.json/requirements.txt - there can be
        #  testing ones etc.
        for item in data['items']:
            if item['ecosystem'] == 'Python-Dist' and item['path'].endswith(
                    '.json'):
                if is_deeper(metadata_json, item):
                    metadata_json = item
            elif item['ecosystem'] == 'Python-Dist':  # PKG-INFO
                # we prefer PKG_INFO files from .egg-info directories,
                #  since these have the very useful `requires.txt` next to them
                if pkg_info is None:
                    pkg_info = item
                else:
                    pkg_info_in_egg = pkg_info['path'].endswith(
                        '.egg-info/PKG-INFO')
                    item_in_egg = item['path'].endswith('.egg-info/PKG-INFO')
                    # rather than one insane condition, we use several less complex ones
                    if pkg_info_in_egg and item_in_egg and is_deeper(
                            pkg_info, item):
                        # if both are in .egg-info, but current pkg_info is deeper
                        pkg_info = item
                    elif item_in_egg and not pkg_info_in_egg:
                        # if item is in .egg-info and current pkg_info is not
                        pkg_info = item
                    elif not (item_in_egg or pkg_info_in_egg) and is_deeper(
                            pkg_info, item):
                        # if none of them are in .egg-info, but current pkg_info is deeper
                        pkg_info = item
            elif item['ecosystem'] == 'Python-RequirementsTXT':
                if not requirements_txt or is_deeper(requirements_txt, item):
                    requirements_txt = item

        if pkg_info:
            self.log.info('Found PKG-INFO at {p}'.format(p=pkg_info['path']))
        if metadata_json:
            self.log.info(
                'Found metadata.json at {p}'.format(p=metadata_json['path']))
        if requirements_txt:
            self.log.info('Found requirements.txt at {p}'.format(
                p=requirements_txt['path']))

        ret = None
        # figure out if this was packaged as wheel => metadata.json would
        #  have depth of topdir + 2
        if metadata_json and get_depth(
                metadata_json['path']) == get_depth(topdir) + 2:
            self.log.info('Seems like this is wheel, using metadata.json ...')
            ret = metadata_json
        # figure out if this was packaged as sdist => PKG_INFO would
        #  have depth of topdir + 3 (e.g. requests-2.18.1/requests.egg-info/PKG-INFO)
        #             or topdir + 4 (e.g. pydocstyle-2.0.0/src/pydocstyle.egg-info/PKG-INFO)
        #             or topdir + 5 (dxl-cluster-0.0.2/src/python/dxl_cluster.egg-info/PKG-INFO)
        #  (and perhaps there are requires.txt or requirements.txt that we could use)
        # NOTE: for now, we always treat requirements.txt as requires_dist
        elif pkg_info and get_depth(pkg_info['path']) <= get_depth(topdir) + 5:
            self.log.info(
                'Seems like this is sdist or egg, using PKG-INFO ...')
            requires_dist = []
            # in well-made sdists, there are requires.txt next to PKG_INFO
            #  (this is something different that requirements.txt)
            #  TODO: maybe mercator could do this in future
            requires = os.path.join(os.path.dirname(pkg_info['path']),
                                    'requires.txt')
            if os.path.exists(requires):
                self.log.info(
                    'Found a "requires.txt" file next to PKG-INFO, going to use it ...'
                )
                requires_dist = self._parse_requires_txt(requires)
            elif requirements_txt:
                self.log.info(
                    'No "requires.txt" file found next to PKG-INFO, but requirements.txt'
                    ' found, going to use it')
                # if requires.txt can't be found, try requirements.txt
                requires_dist = requirements_txt['result']['dependencies']
            else:
                self.log.info(
                    'Found no usable source of requirements for PKG-INFO :(')
            pkg_info['result']['requires_dist'] = requires_dist
            ret = pkg_info
        elif requirements_txt:
            self.log.info('Only requirements.txt found, going to use it ...')
            requirements_txt['result']['requires_dist'] = \
                requirements_txt['result'].get('dependencies')
            ret = requirements_txt

        return ret

    def execute(self, arguments):
        """Execute mercator and convert it's output to JSON object.

        :param arguments: dict with task arguments; 'ecosystem' is required,
                          plus either 'url' (git repo scan) or
                          'name'/'version' (package scan)
        :return: dict with 'status', 'summary' and 'details' keys
        """
        self._strict_assert(arguments.get('ecosystem'))

        if 'url' in arguments:
            # run mercator on a git repo
            return self.run_mercator_on_git_repo(arguments)

        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        # TODO: make this even uglier; looks like we didn't get the abstraction quite right
        #       when we were adding support for Java/Maven.
        if self.storage.get_ecosystem(arguments['ecosystem']).is_backed_by(
                EcosystemBackend.maven):
            # cache_path now points directly to the pom
            cache_path = ObjectCache.get_from_dict(arguments).get_pom_xml()
        else:
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()
        return self.run_mercator(arguments, cache_path)

    def run_mercator_on_git_repo(self, arguments):
        """Clone specified git url and run mercator on it.

        :param arguments: dict with task arguments; 'url' is required
        :return: mercator result dict with manifest paths rewritten to be
                 prefixed by the repository HEAD sha1, or None on failure
        """
        self._strict_assert(arguments.get('url'))

        with TemporaryDirectory() as workdir:
            repo_url = arguments.get('url')
            repo = Git.clone(repo_url, path=workdir, depth=str(1))
            metadata = self.run_mercator(arguments,
                                         workdir,
                                         keep_path=True,
                                         outermost_only=False,
                                         timeout=900)
            if metadata.get('status', None) != 'success':
                self.log.error('Mercator failed on %s', repo_url)
                return None

            # add some auxiliary information so we can later find the manifest file
            head = repo.rev_parse(['HEAD'])[0]
            for detail in metadata['details']:
                # strip the temporary workdir prefix from each manifest path
                path = detail['path'][len(workdir):]
                # path should look like this:
                # <git-sha1>/path/to/manifest.file
                detail['path'] = head + path

            return metadata

    def run_mercator(self,
                     arguments,
                     cache_path,
                     keep_path=False,
                     outermost_only=True,
                     timeout=300,
                     resolve_poms=True):
        """Run mercator tool and normalize its output per-ecosystem.

        :param arguments: dict with task arguments
        :param cache_path: str, path to scan when no 'cache_sources_path' given
        :param keep_path: bool, keep manifest paths in normalized output
        :param outermost_only: bool, only keep manifests closest to the root
        :param timeout: int, seconds to allow the mercator command to run
        :param resolve_poms: bool, let mercator resolve Java POMs
        :return: dict with 'status', 'summary' and 'details' keys
        :raises FatalTaskError: when the mercator command fails
        :raises NotABugFatalTaskError: when no usable Python metadata is found
        """
        # TODO: reduce cyclomatic complexity
        result_data = {'status': 'unknown', 'summary': [], 'details': []}
        mercator_target = arguments.get('cache_sources_path', cache_path)

        tc = TimedCommand(['mercator', mercator_target])
        update_env = {
            'MERCATOR_JAVA_RESOLVE_POMS': 'true'
        } if resolve_poms else {}
        status, data, err = tc.run(timeout=timeout,
                                   is_json=True,
                                   update_env=update_env)
        if status != 0:
            self.log.error(err)
            raise FatalTaskError(err)

        ecosystem_object = self.storage.get_ecosystem(arguments['ecosystem'])
        if ecosystem_object.is_backed_by(EcosystemBackend.pypi):
            # TODO: attempt static setup.py parsing with mercator
            items = [self._merge_python_items(mercator_target, data)]
            if items == [None]:
                raise NotABugFatalTaskError(
                    'Found no usable PKG-INFO/metadata.json/requirements.txt')
        else:
            if outermost_only:
                # process only root level manifests (or the ones closest to the root level)
                items = self._data_normalizer.get_outermost_items(
                    data.get('items') or [])
            else:
                items = data.get('items') or []
            # NOTE(review): len(data) counts top-level keys of the mercator
            # output dict, not the number of found projects — confirm intent.
            self.log.debug('mercator found %i projects, outermost %i',
                           len(data), len(items))

            if ecosystem_object.is_backed_by(EcosystemBackend.maven):
                # for maven we download both Jar and POM, we consider POM to be *the*
                #  source of information and don't want to duplicate info by including
                #  data from pom included in artifact (assuming it's included)
                items = [
                    d for d in items if d['ecosystem'].lower() == 'java-pom'
                ]
            elif ecosystem_object.is_backed_by(EcosystemBackend.npm):
                # ignore other metadata files, e.g. requirements.txt
                items = [d for d in items if d['ecosystem'].lower() == 'npm']
            elif arguments['ecosystem'] == 'go':
                items = [
                    d for d in items if d['ecosystem'].lower() == 'go-glide'
                ]
                if not items:
                    # Mercator found no Go Glide files, run gofedlib
                    items = self.run_gofedlib(topdir=mercator_target,
                                              name=arguments.get('name'),
                                              version=arguments.get('version'),
                                              timeout=timeout)

        result_data['details'] = [
            self._data_normalizer.handle_data(d, keep_path=keep_path)
            for d in items
        ]
        result_data['status'] = 'success'
        return result_data

    def run_gofedlib(self, topdir, name, version, timeout):
        """Run gofedlib-cli to extract dependencies from golang sources.

        :param topdir: str, directory with the golang sources to analyze
        :param name: str, package name (used to derive the repository URL)
        :param version: str, package version
        :param timeout: int, seconds to allow the gofedlib-cli command to run
        :return: single-element list with a pseudo mercator item wrapping
                 the gofedlib result
        """
        tc = TimedCommand([
            'gofedlib-cli', '--dependencies-main', '--dependencies-packages',
            '--dependencies-test', '--skip-errors', topdir
        ])
        # NOTE(review): status and err from tc.run() are ignored here —
        # confirm that failures are acceptable to surface via json.loads below.
        status, data, err = tc.run(timeout=timeout)
        result = json.loads(data[0])
        main_deps_count = len(result.get('deps-main', []))
        packages_count = len(result.get('deps-packages', []))
        self.log.debug('gofedlib found %i dependencies',
                       main_deps_count + packages_count)

        # gofedlib package names are import paths, hence usable as repo URLs
        result['code_repository'] = {
            'type': 'git',
            'url': 'https://{name}'.format(name=name)
        }
        result['name'] = name
        result['version'] = version
        return [{'ecosystem': 'gofedlib', 'result': result}]
# Example 18
class DependencySnapshotTask(BaseTask):
    """Task that analyzes dependencies."""

    _analysis_name = 'dependency_snapshot'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _collect_dependencies(self):
        """Return all dependencies for current analysis flow (operates on parent mercator result).

        :return: List[str], list of dependencies
        :raises TaskError: when the 'metadata' parent task result is not a dict
        """
        wr = self.parent_task_result('metadata')
        if not isinstance(wr, dict):
            raise TaskError(
                'metadata task result has unexpected type: {}; expected dict'.
                format(type(wr)))

        # there can be details about multiple manifests in the metadata,
        # therefore we will collect dependency specifications from all of them
        # and exclude obvious duplicates along the way
        dependencies = list({
            dep
            for m in wr.get('details', []) if m.get('dependencies')
            for dep in m.get('dependencies', [])
        })
        return dependencies

    @staticmethod
    def _resolve_dependency(ecosystem, dep):
        """Resolve a single dependency declaration to a concrete name/version.

        GitHub- and URL-style declarations are returned as-is (not resolved);
        anything else is resolved to the latest upstream version matching the
        spec via the ecosystem solver.

        :param ecosystem: ecosystem object the dependency belongs to
        :param dep: str, dependency declaration, e.g. 'name >=1.0'
        :return: dict with 'ecosystem', 'declaration', 'resolved_at',
                 'name' and 'version' keys
        :raises TaskError: when the dependency is invalid or unresolvable
        """
        ret = {
            'ecosystem': ecosystem.name,
            'declaration': dep,
            'resolved_at': json_serial(datetime.datetime.utcnow())
        }

        # first, if this is a Github dependency, return it right away (we don't resolve these yet)
        if ' ' in dep:
            # we have both package name and version (version can be an URL)
            name, spec = dep.split(' ', 1)
            if gh_dep.match(spec):
                ret['name'] = name
                ret['version'] = 'https://github.com/' + spec
            # value comparison; the previous `is not ''` identity check only
            # worked thanks to CPython string interning (SyntaxWarning on 3.8+)
            elif urllib.parse.urlparse(spec).scheme != '':
                ret['name'] = name
                ret['version'] = spec
        else:
            if gh_dep.match(dep):
                ret['name'] = 'https://github.com/' + dep
                ret['version'] = None
            elif urllib.parse.urlparse(dep).scheme != '':
                ret['name'] = dep
                ret['version'] = None

        if 'name' in ret:
            return ret

        # second, figure out what is the latest upstream version matching the spec and return it
        solver = get_ecosystem_solver(ecosystem)
        pkgspec = solver.solve([dep])

        if not pkgspec:
            raise TaskError("invalid dependency: {}".format(dep))

        package, version = pkgspec.popitem()
        if not version:
            raise TaskError("could not resolve {}".format(dep))

        ret['name'] = package
        ret['version'] = version
        return ret

    def execute(self, arguments):
        """Start the task that analyzes dependencies.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))

        result = {
            'summary': {
                'errors': [],
                'dependency_counts': {}
            },
            'status': 'success',
            'details': {}
        }
        ecosystem = self.storage.get_ecosystem(arguments.get('ecosystem'))
        try:
            deps = self._collect_dependencies()
        except TaskError as e:
            self.log.error(str(e))
            raise FatalTaskError from e

        resolved_deps = []
        for dep in deps:
            try:
                resolved = self._resolve_dependency(ecosystem, dep)
            except TaskError as e:
                # record the failure and stop resolving; partial results
                # are still returned with status 'error'
                self.log.error(str(e))
                result['summary']['errors'].append(str(e))
                result['status'] = 'error'
                # Is this fatal, i.e. should we 'raise FatalTaskError from e' ?
                break
            self.log.info('resolved dependency %r as %s', dep, resolved)
            resolved_deps.append(resolved)
        # in future, we may want to provide also build/test dependencies, not just runtime
        result['details']['runtime'] = resolved_deps
        result['summary']['dependency_counts']['runtime'] = len(resolved_deps)
        return result
class PackageKeywordsTaggingTask(KeywordsTaggingTaskBase):
    """Compute tags based on gathered natural text - strictly package level keywords."""

    _analysis_name = 'package_keywords_tagging'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')

    def _package_level_keywords(self, keywords_file_name, stopwords_file_name, arguments):
        """Aggregate package-level keyword sources into a details dict.

        Sources: GitHub topics from an earlier 'github_details' task run,
        the package README, the repository description, and (for this task)
        the package name itself.

        :param keywords_file_name: path to the keywords file for the tagger
        :param stopwords_file_name: path to the stopwords file for the tagger
        :param arguments: dict with 'ecosystem', 'name' and 'document_id' keys
        :return: dict mapping source name to the computed keywords
        """
        # Keep f8a_tagger import local as other components dependent on
        # f8a_worker do not require it installed.
        from f8a_tagger import lookup_readme as keywords_lookup_readme
        from f8a_tagger import lookup_text as keywords_lookup_text

        details = {}
        package_postgres = StoragePool.get_connected_storage('PackagePostgres')

        gh_info = package_postgres.get_task_result_by_analysis_id(arguments['ecosystem'],
                                                                  arguments['name'],
                                                                  'github_details',
                                                                  arguments['document_id'])
        if gh_info:
            self.log.debug("Aggregating explicitly stated keywords (topics) on GitHub")
            details['gh_topics'] = gh_info.get('details', {}).get('topics', [])

        s3_readme = StoragePool.get_connected_storage('S3Readme')
        try:
            readme_json = s3_readme.retrieve_readme_json(arguments['ecosystem'], arguments['name'])
            if readme_json:
                self.log.debug("Computing keywords from README.json")
                details['README'] = keywords_lookup_readme(readme_json,
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)
        except Exception as exc:
            # best effort - a missing README must not fail the whole task
            self.log.info("Failed to retrieve README: %s", str(exc))

        s3_rd = StoragePool.get_connected_storage('S3RepositoryDescription')
        try:
            description = s3_rd.retrieve_repository_description(arguments['ecosystem'],
                                                                arguments['name'])
            if description:
                self.log.debug("Computing keywords on description from repository")
                details['repository_description'] = keywords_lookup_text(
                    description,
                    keywords_file=keywords_file_name,
                    stopwords_file=stopwords_file_name,
                    **self._LOOKUP_CONF)

        except Exception as exc:
            # best effort - a missing description must not fail the whole task
            self.log.info("Failed to retrieve repository description: %s", str(exc))

        if self.task_name == 'package_keywords_tagging':
            # We are tagging on package level, add also tags that are found in package name.
            # Raw string: the previous '[\.\-_:]' literal relied on invalid
            # escape sequences in a plain string (DeprecationWarning, a future
            # SyntaxError); the matched character class is unchanged.
            name_parts = re.split(r'[.\-_:]', arguments['name'])
            self.log.debug("Computing keywords from package name %s", name_parts)
            details['package_name'] = keywords_lookup_text(" ".join(name_parts),
                                                           keywords_file=keywords_file_name,
                                                           stopwords_file=stopwords_file_name,
                                                           **self._LOOKUP_CONF)

        return details

    def execute(self, arguments):
        """Compute package-level keywords and return them as task results.

        :param arguments: dict with task arguments; requires 'ecosystem'
                          and 'name'
        :return: dict with 'status', 'summary' and 'details' keys
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))

        keywords_file_name, stopwords_file_name = self._get_config_files(arguments['ecosystem'])
        details = self._package_level_keywords(keywords_file_name, stopwords_file_name,
                                               arguments)

        return {'status': 'success', 'summary': [], 'details': details}
# Example 20
class CVEcheckerTask(BaseTask):
    """Security issues scanner."""

    _analysis_name = 'security_issues'
    schema_ref = SchemaRef(_analysis_name, '3-0-1')

    dependency_check_jvm_mem_limit = '-Xmx768m'

    @staticmethod
    def get_cve_impact(cve_id):
        """Scrape CVSS score, vector and severity for *cve_id* from NVD.

        Returns ``(0, '', '')`` when no CVE id is given.  When both CVSS
        versions are present on the page, v3.0 values are preferred over v2.

        :param cve_id: str, CVE identifier (e.g. 'CVE-2017-1234') or falsy
        :return: tuple (score, vector, severity)
        :raises IOError: when the NVD page cannot be fetched
        """
        if not cve_id:
            return 0, '', ''

        url = "https://nvd.nist.gov/vuln/detail/{cve_id}".format(cve_id=cve_id)
        response = requests.get(url)
        if response.status_code != 200:
            raise IOError('Unable to reach URL: {url}'.format(url=url))

        v3 = {'score': 0, 'severity': '', 'vector': ''}
        v2 = {'score': 0, 'severity': '', 'vector': ''}
        page = BeautifulSoup(response.text, 'html.parser')
        # Links to the CVSS calculator carry the score/vector values we need.
        for tag in page.find_all(href=re_compile('calculator')):
            testid = tag.attrs.get('data-testid')
            if testid == 'vuln-cvssv3-base-score-link':
                v3['score'] = float(tag.text.strip())
                v3['severity'] = tag.find_next().text.lower()
            elif testid == 'vuln-cvssv3-vector-link':
                v3['vector'] = tag.text.strip()
            elif testid == 'vuln-cvssv2-base-score-link':
                v2['score'] = float(tag.text.strip())
                v2['severity'] = tag.find_next().text.lower()
            elif testid == 'vuln-cvssv2-vector-link':
                # v2 vectors come wrapped in parentheses on the page
                v2['vector'] = tag.text.strip().lstrip('(').rstrip(')')

        # Prefer CVSS v3.0 over v2
        return (v3['score'] or v2['score'],
                v3['vector'] or v2['vector'],
                v3['severity'] or v2['severity'])

    @staticmethod
    def _filter_ossindex_fields(entry):
        """Create a result record for ossindex entry."""
        # enrich the raw OSSIndex entry with NVD-scraped CVSS data
        score, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        return {
            # fall back to the advisory title when there is no CVE id
            'id': entry.get('cve') or entry.get('title'),
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {'score': score, 'vector': vector},
            'severity': severity,
        }

    @staticmethod
    def _filter_victims_db_entry(entry):
        """Create a result record for a victims-cve-db entry, or None without a CVE."""
        if 'cve' not in entry:
            return None
        # only vector and severity come from NVD; the score below comes
        # from the victims db itself
        _, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        record = {
            'id': 'CVE-' + entry['cve'],
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                # prefer the CVSS v3 score shipped with the db over v2
                'score': entry.get('cvss_v3') or entry.get('cvss_v2'),
                'vector': vector
            },
            'severity': severity,
            'attribution': "https://github.com/victims/victims-cve-db, CC BY-SA 4.0, modified"
        }
        return record

    @staticmethod
    def query_url(url):
        """GET *url*, raise on HTTP error status, and return the parsed JSON body."""
        resp = requests.get(url)
        resp.raise_for_status()
        return resp.json()

    @staticmethod
    def _query_ossindex_package(ecosystem, name):
        """Fetch OSSIndex vulnerability records for the package ``ecosystem:name``."""
        endpoint = "https://ossindex.net/v2.0/package/{pm}/{package}".format(pm=ecosystem,
                                                                             package=name)
        return CVEcheckerTask.query_url(endpoint)

    @staticmethod
    def query_ossindex_vulnerability_fromtill(ecosystem, from_time=0, till_time=-1):
        """From OSSIndex get vulnerabilities which changed between from_time and till_time.

        :param ecosystem: package manager name as understood by OSSIndex
        :param from_time: lower bound, seconds since epoch
        :param till_time: upper bound, seconds since epoch; -1 means open-ended
        :return: list of package records that retain at least one vulnerability
                 updated at or after `from_time`
        """
        # OSS Index uses timestamp in milliseconds
        from_time = int(from_time * 1000)
        till_time = int(till_time * 1000)
        url = "https://ossindex.net/v2.0/vulnerability/pm/{pm}/fromtill/{from_time}/{till_time}".\
            format(pm=ecosystem, from_time=from_time, till_time=till_time)
        packages = []
        # the API paginates; follow 'next' links until exhausted
        while url:
            response = CVEcheckerTask.query_url(url)
            for package in response.get('packages', []):
                # Sanity check:
                # the response always contains at least one entry, even if it should be empty
                # (when 'from_time' is higher than 'updated' time of all entries in db)
                # Build a filtered list instead of calling list.remove() while
                # iterating the same list, which silently skipped the element
                # following each removed one.
                package['vulnerabilities'] = [
                    vulnerability
                    for vulnerability in package.get('vulnerabilities', [])
                    if int(vulnerability.get('updated')) >= from_time
                ]
                if package['vulnerabilities']:
                    packages.append(package)

            url = response.get('next')
        return packages

    def _query_ossindex(self, arguments):
        """Query OSS Index REST API.

        Collects vulnerabilities affecting arguments['version'] of the
        package arguments['name'] by expanding each vulnerability's version
        ranges through the ecosystem solver.

        :param arguments: dict with 'ecosystem', 'name' and 'version' keys
        :return: dict with 'summary' (vulnerability ids), 'status' and
                 'details' (vulnerability records) keys
        """
        # keyed by vulnerability id to de-duplicate across version ranges
        entries = {}
        solver = get_ecosystem_solver(self.storage.get_ecosystem(arguments['ecosystem']),
                                      with_parser=OSSIndexDependencyParser())
        for package in self._query_ossindex_package(arguments['ecosystem'], arguments['name']):
            for vulnerability in package.get('vulnerabilities', []):
                for version_string in vulnerability.get('versions', []):
                    try:
                        # expand the affected version range to concrete versions
                        affected_versions = solver.solve(["{} {}".format(arguments['name'],
                                                                         version_string)],
                                                         all_versions=True)
                    except Exception:
                        # unsolvable range: log and keep checking the rest
                        self.log.exception("Failed to resolve %r for %s:%s", version_string,
                                           arguments['ecosystem'], arguments['name'])
                        continue
                    if arguments['version'] in affected_versions.get(arguments['name'], []):
                        entry = self._filter_ossindex_fields(vulnerability)
                        if entry.get('id'):
                            entries[entry['id']] = entry

        return {'summary': list(entries.keys()),
                'status': 'success',
                'details': list(entries.values())}

    def _npm_scan(self, arguments):
        """Scan an npm package for known vulnerabilities via OSS Index."""
        return self._query_ossindex(arguments)

    @staticmethod
    def update_depcheck_db_on_s3():
        """Update OWASP Dependency-check DB on S3.

        Downloads the current CVE data via the Dependency-Check CLI into a
        temporary directory and uploads it to the S3VulnDB bucket.
        """
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            # seed the temp dir with the cached DB so the update is incremental
            s3.retrieve_depcheck_db_if_exists(temp_data_dir)
            old_java_opts = os.getenv('JAVA_OPTS', '')
            os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
            try:
                # give DependencyCheck 25 minutes to download the DB
                if TimedCommand.get_command_output(
                        [depcheck, '--updateonly', '--data', temp_data_dir],
                        timeout=1500):
                    s3.store_depcheck_db(temp_data_dir)
            finally:
                # restore JAVA_OPTS even when the update command raises,
                # matching the cleanup style used in _run_owasp_dep_check()
                os.environ['JAVA_OPTS'] = old_java_opts

    def _run_owasp_dep_check(self, scan_path, experimental=False):
        """Run OWASP Dependency-Check.

        Scans `scan_path` with the Dependency-Check CLI (using the cached
        vulnerability DB from S3, regenerating it when missing), parses the
        XML report and converts it to the task result format.

        :param scan_path: str, path to the artifact/directory to scan
        :param experimental: bool, pass --enableExperimental to the scanner
        :return: dict with 'summary' (CVE ids), 'status' and 'details' keys
        :raises FatalTaskError: when the scan or report parsing fails
        """
        def _clean_dep_check_tmp():
            # Dependency-Check leaves dctemp* working dirs behind; purge them
            for dcdir in glob(os.path.join(gettempdir(), 'dctemp*')):
                rmtree(dcdir)

        s3 = StoragePool.get_connected_storage('S3VulnDB')
        depcheck = configuration.dependency_check_script_path
        with TemporaryDirectory() as temp_data_dir:
            if not s3.retrieve_depcheck_db_if_exists(temp_data_dir):
                self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
                self.update_depcheck_db_on_s3()
                s3.retrieve_depcheck_db_if_exists(temp_data_dir)

            report_path = os.path.join(temp_data_dir, 'report.xml')
            command = [depcheck,
                       '--noupdate',
                       '--format', 'XML',
                       '--project', 'CVEcheckerTask',
                       '--data', temp_data_dir,
                       '--scan', scan_path,
                       '--out', report_path]
            if experimental:
                command.extend(['--enableExperimental'])
            # apply all configured false-positive suppression rules
            for suppress_xml in glob(os.path.join(os.environ['OWASP_DEP_CHECK_SUPPRESS_PATH'],
                                                  '*.xml')):
                command.extend(['--suppress', suppress_xml])

            output = []
            old_java_opts = os.getenv('JAVA_OPTS', '')
            try:
                self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities' %
                               scan_path)
                # cap the scanner's JVM heap so the worker is not OOM-killed
                os.environ['JAVA_OPTS'] = CVEcheckerTask.dependency_check_jvm_mem_limit
                output = TimedCommand.get_command_output(command,
                                                         graceful=False,
                                                         timeout=600)  # 10 minutes
                with open(report_path) as r:
                    report_dict = anymarkup.parse(r.read())
            except (TaskError, FileNotFoundError) as e:
                _clean_dep_check_tmp()
                for line in output:
                    self.log.warning(line)
                self.log.exception(str(e))
                raise FatalTaskError('OWASP Dependency-Check scan failed') from e
            finally:
                # always restore the original JVM options
                os.environ['JAVA_OPTS'] = old_java_opts
            _clean_dep_check_tmp()

        results = []
        dependencies = report_dict.get('analysis', {}).get('dependencies')  # value can be None
        dependencies = dependencies.get('dependency', []) if dependencies else []
        # anymarkup collapses single-element lists; normalize back to a list
        if not isinstance(dependencies, list):
            dependencies = [dependencies]
        for dependency in dependencies:
            vulnerabilities = dependency.get('vulnerabilities')  # value can be None
            vulnerabilities = vulnerabilities.get('vulnerability', []) if vulnerabilities else []
            if not isinstance(vulnerabilities, list):
                vulnerabilities = [vulnerabilities]
            for vulnerability in vulnerabilities:
                # build the CVSS v2 vector from the first letter of each metric
                av = vulnerability.get('cvssAccessVector')
                av = av[0] if av else '?'
                ac = vulnerability.get('cvssAccessComplexity')
                ac = ac[0] if ac else '?'
                # NOTE(review): 'cvssAuthenticationr' looks like a typo for
                # 'cvssAuthentication' — confirm against the report schema
                au = vulnerability.get('cvssAuthenticationr')
                au = au[0] if au else '?'
                c = vulnerability.get('cvssConfidentialImpact')
                c = c[0] if c else '?'
                i = vulnerability.get('cvssIntegrityImpact')
                i = i[0] if i else '?'
                a = vulnerability.get('cvssAvailabilityImpact')
                a = a[0] if a else '?'
                vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{Integrity}/A:{A}".\
                    format(AV=av, AC=ac, Au=au, C=c, Integrity=i, A=a)
                result = {
                    'cvss': {
                        'score': vulnerability.get('cvssScore'),
                        'vector': vector
                    }
                }
                references = vulnerability.get('references', {}).get('reference', [])
                if not isinstance(references, list):
                    references = [references]
                result['references'] = [r.get('url') for r in references]
                for field in ['severity', 'description']:
                    result[field] = vulnerability.get(field)
                result['id'] = vulnerability.get('name')
                results.append(result)

        return {'summary': [r['id'] for r in results],
                'status': 'success',
                'details': results}

    @staticmethod
    def update_victims_cve_db_on_s3():
        """Refresh the Victims CVE DB stored on S3 from the upstream git repo."""
        storage = StoragePool.get_connected_storage('S3VulnDB')
        with TemporaryDirectory() as checkout_dir:
            # A shallow clone is enough; only the latest DB content is needed.
            Git.clone('https://github.com/victims/victims-cve-db.git',
                      checkout_dir, depth="1")
            storage.store_victims_db(checkout_dir)

    def _run_victims_cve_db_cli(self, arguments):
        """Run Victims CVE DB CLI.

        Ensures a local copy of the Victims CVE DB (fetched from S3, seeding
        S3 from github first when missing) and searches it for the given
        java artifact.

        :param arguments: dict with at least 'name' and 'version' keys
        :return: parsed JSON output of the CLI, or [] on failure
        """
        s3 = StoragePool.get_connected_storage('S3VulnDB')
        results = []

        with TemporaryDirectory() as victims_db_dir:
            if not s3.retrieve_victims_db_if_exists(victims_db_dir):
                self.log.debug('No Victims CVE DB found on S3, cloning from github')
                self.update_victims_cve_db_on_s3()
                s3.retrieve_victims_db_if_exists(victims_db_dir)

            cli_path = os.path.join(victims_db_dir, 'victims-cve-db-cli.py')
            search_command = [cli_path, 'search',
                              '--ecosystem', 'java',
                              '--name', arguments['name'],
                              '--version', arguments['version']]
            try:
                results = TimedCommand.get_command_output(search_command,
                                                          graceful=False,
                                                          is_json=True,
                                                          timeout=60)  # 1 minute
            except TaskError as e:
                self.log.exception(e)

        return results

    def _maven_scan(self, arguments):
        """Run OWASP dependency-check & Victims CVE DB CLI.

        :param arguments: dict describing the analysed maven artifact
        :return: merged scan results
        """
        jar_path = ObjectCache.get_from_dict(arguments).get_source_tarball()
        owasp_results = self._run_owasp_dep_check(jar_path, experimental=False)
        if owasp_results.get('status') != 'success':
            return owasp_results

        # Enrich OWASP results with Victims CVE DB entries, skipping
        # vulnerabilities dependency-check already reported.
        for entry in self._run_victims_cve_db_cli(arguments):
            entry = self._filter_victims_db_entry(entry)
            if entry and entry['id'] not in owasp_results['summary']:
                owasp_results['summary'].append(entry['id'])
                owasp_results['details'].append(entry)
        return owasp_results

    def _python_scan(self, arguments):
        """Run OWASP dependency-check experimental analyzer for Python artifacts.

        https://jeremylong.github.io/DependencyCheck/analyzers/python.html

        :param arguments: dictionary with task arguments ('name' is used here)
        :return: dict with scan results
        :raises FatalTaskError: when no file usable by dependency-check is found
        """
        extracted_tarball = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
        # depcheck needs to be pointed to a specific file, we can't just scan whole directory
        egg_info = pkg_info = metadata = None
        for root, _, files in os.walk(extracted_tarball):
            if root.endswith('.egg-info') or root.endswith('.dist-info'):
                egg_info = root
            if 'PKG-INFO' in files:
                pkg_info = os.path.join(root, 'PKG-INFO')
            if 'METADATA' in files:
                metadata = os.path.join(root, 'METADATA')
        scan_path = egg_info or pkg_info or metadata
        if pkg_info and not egg_info:
            # Work-around for dependency-check ignoring PKG-INFO outside .dist-info/
            # https://github.com/jeremylong/DependencyCheck/issues/896
            egg_info_dir = os.path.join(extracted_tarball, arguments['name'] + '.egg-info')
            try:
                os.mkdir(egg_info_dir)
                copy(pkg_info, egg_info_dir)
                scan_path = egg_info_dir
            # `os.error` is just a legacy alias of OSError; catch the canonical
            # name. On failure we fall back to scanning the PKG-INFO path.
            except OSError:
                self.log.warning('Failed to copy %s to %s', pkg_info, egg_info_dir)

        if not scan_path:
            raise FatalTaskError('File types not supported by OWASP dependency-check')

        return self._run_owasp_dep_check(scan_path, experimental=True)

    def _nuget_scan(self, arguments):
        """Collect known vulnerability info for given nuget package (via OSS Index)."""
        return self._query_ossindex(arguments)

    def execute(self, arguments):
        """Task code.

        Dispatches the scan to the ecosystem-specific scanner.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        scanners = {
            'maven': self._maven_scan,
            'npm': self._npm_scan,
            'pypi': self._python_scan,
            'nuget': self._nuget_scan,
        }
        scanner = scanners.get(arguments['ecosystem'])
        if scanner is None:
            raise RequestError('Unsupported ecosystem')
        return scanner(arguments)
class CodeMetricsTask(BaseTask):
    """Compute various code metrics for a project."""

    _analysis_name = 'code_metrics'
    schema_ref = SchemaRef(_analysis_name, '1-0-0')
    # Hard time limit (seconds) for every external analyzer invocation.
    _CLI_TIMEOUT = 300

    def _run_analyzer(self, command, json_output=True):
        """Run command (analyzer), if a JSON output is expected, parse it.

        :param command: command to be run (command with argument vector as array)
        :param json_output: True if output should be parsed as JSON
        :return: status, output, error triplet
        """
        self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
            timeout=self._CLI_TIMEOUT, cmd=command))
        cmd = TimedCommand(command)
        status, output, error = cmd.run(timeout=self._CLI_TIMEOUT)
        self.log.debug("status: %d, output: %s, error: %s", status, output,
                       error)

        if status != 0:
            self.log.warning(
                "Executing command failed, return value: %d, stderr: '%s' ",
                status, error)

        # Some tools such as complexity-report write zero bytes to output (they
        # are propagated from sources like for npm/glob/7.0.3). This caused
        # failures when pushing results to Postgres as Postgres cannot store
        # null bytes in results. Let's be safe here.
        output = [line.replace('\\u0000', '\\\\0') for line in output]

        if json_output:
            output = json.loads("".join(output)) if output else {}

        return status, output, error

    def _get_generic_result(self, source_path):
        """Get core result of CodeMetricsTask task that is based on cloc tool.

        This output is later enriched with output of tools based on languages
        that were found by cloc.

        :param source_path: path to sources where analyzed artefact resists
        :return: tuple where generic information with ecosystem specific dict
        :raises RuntimeError: when running cloc fails
        """
        command = ['cloc', '--json', source_path]
        status, output, error = self._run_analyzer(command)

        if status != 0:
            # Let the whole task fail
            raise RuntimeError("Running cloc command failed: '%s'" % error)

        # cloc places generic summary here, we will maintain it in top level so
        # remove misleading key
        header = {
            'total_files': output['header'].pop('n_files'),
            'total_lines': output['header'].pop('n_lines')
        }
        output.pop('header')

        if 'SUM' in output:
            header['blank_lines'] = output['SUM']['blank']
            header['comment_lines'] = output['SUM']['comment']
            header['code_lines'] = output['SUM']['code']
            output.pop('SUM', None)

        # rename to be more precise with naming
        wanted_keys = (('blank', 'blank_lines'), ('code', 'code_lines'),
                       ('comment', 'comment_lines'), ('nFiles', 'files_count'))
        for key in output.keys():
            # filter only language-specific results, leave statistics untouched
            if isinstance(output[key], dict):
                output[key] = DataNormalizer.transform_keys(
                    output[key], wanted_keys)

        return header, output

    @staticmethod
    def _normalize_complexity_report_output(output, source_path):
        """Normalize complexity_report output.

        For metrics meaning see:
        https://github.com/escomplex/escomplex/blob/master/README.md#metrics

        :param output: output dict to be normalized
        :param source_path: path to sources that was used
        :return: normalized output
        """
        wanted_keys = (('maintainability', 'project_maintainability'),
                       ('changeCost', 'cost_change'),
                       ('cyclomatic', 'average_cyclomatic_complexity'),
                       ('effort', 'average_halstead_effort'),
                       ('firstOrderDensity', 'first_order_density'),
                       ('loc', 'average_function_lines_of_code'),
                       ('params', 'average_function_parameters_count'),
                       ('reports', 'modules'))
        output = DataNormalizer.transform_keys(output, wanted_keys)

        wanted_module_keys = (('maintainability', 'module_maintainability'),
                              ('dependencies', ),
                              ('loc', 'average_function_lines_of_code'),
                              ('path', ),
                              ('params', 'average_function_parameters_count'),
                              ('functions', ))

        # +1 also strips the path separator following the source path prefix
        source_path_len = len(source_path) + 1
        for idx, module in enumerate(output.get('modules', [])):
            output['modules'][idx] = DataNormalizer.transform_keys(
                module, wanted_module_keys)

            if 'path' in module:
                # report module paths relative to the scanned sources
                output['modules'][idx]['path'] = module['path'][
                    source_path_len:]

            # 'functions' can be missing for a module; default to an empty list
            # instead of raising TypeError on enumerate(None)
            for function in module.get('functions', []):
                if 'cyclomaticDensity' in function:
                    function['cyclomatic_density'] = function.pop(
                        'cyclomaticDensity')

        return output

    @staticmethod
    def _normalize_javancss_output(output):
        """Parse and normalize JavaNCSS ASCII output.

        :param output: output dict to be normalized
        :return: normalized output
        """
        output = output.get('javancss', {})
        result = {'functions': {}, 'objects': {}, 'packages': {}}

        # The output of JavaNCSS is an XML, which is parsed using anymarkup.
        # This can introduce some pitfalls here if there is found exactly one
        # item of a type. E.g.:
        #
        #  <functions>
        #    <function>...<function/>
        #  <functions>
        #
        # Is parsed as object 'functions' containing *one object* 'function', whereas:
        #
        #  <functions>
        #    <function>...<function/>
        #    <function>...<function/>
        #  <functions>
        #
        # Is parsed as object 'functions' containing a *list of objects*
        # 'function'. Thus the isinstance(.., list) checks.

        # Parse functions section
        if 'functions' in output:
            functions = output['functions']

            wanted_function_keys = (('ccn', 'cyclomatic_complexity'),
                                    ('javadocs', ), ('name', ))

            result['functions']['function'] = []
            if 'function' in functions:
                if not isinstance(functions['function'], list):
                    functions['function'] = [functions['function']]

                for function in functions['function']:
                    result['functions']['function'].append(
                        DataNormalizer.transform_keys(function,
                                                      wanted_function_keys))

            function_averages = functions.get('function_averages', {})

            result['functions'][
                'average_cyclomatic_complexity'] = function_averages.get('ccn')
            result['functions']['average_javadocs'] = function_averages.get(
                'javadocs')

        # Parse objects section
        if 'objects' in output:
            objects = output['objects']

            wanted_objects_keys = (('classes', ), ('functions', ), ('name', ),
                                   ('javadocs', ))

            result['objects']['object'] = []
            if 'object' in objects:
                if not isinstance(objects['object'], list):
                    objects['object'] = [objects['object']]

                for obj in objects['object']:
                    result['objects']['object'].append(
                        DataNormalizer.transform_keys(obj,
                                                      wanted_objects_keys))

            object_averages = objects.get('averages', {})

            result['objects']['average_classes'] = object_averages.get(
                'classes')
            result['objects']['average_functions'] = object_averages.get(
                'functions')
            result['objects']['average_javadocs'] = object_averages.get(
                'javadocs')

        # Parse packages section
        if 'packages' in output:
            packages = output['packages']

            packages_total = packages.get('total', {})

            result['packages']['classes'] = packages_total.get('classes')
            result['packages']['functions'] = packages_total.get('functions')
            result['packages']['javadoc_lines'] = packages_total.get(
                'javadoc_lines')
            result['packages']['javadocs'] = packages_total.get('javadocs')
            result['packages']['multi_comment_lines'] = packages_total.get(
                'multi_comment_lines')
            result['packages']['single_comment_lines'] = packages_total.get(
                'single_comment_lines')

        return result

    def _normalize_mccabe_output(self, output):
        """Parse mccabe tool output lines into per-function complexity records.

        :param output: iterable of lines produced by the mccabe tool
        :return: list of {'name': ..., 'complexity': ...} dicts
        """
        result = []
        for line in output:
            # NOTE: due to the way print works in python 2 vs python 3, the mccabe under
            #  python 2 returns `(<coords> <name> <complexity>)`, while the python 3
            #  version returns the same without the brackets
            coords, func_name, complexity = line.split()
            result.append({
                'name': func_name.strip("'"),
                'complexity': int(complexity.strip(')'))
            })

        return result

    def complexity_report(self, source_path):
        """Run complexity_report tool https://www.npmjs.com/package/complexity-report .

        :param source_path: path to source codes
        :return: normalized output, {} on failure
        """
        command = ['cr', '--format=json', source_path]
        status, output, error = self._run_analyzer(command)

        if status != 0:
            # fix: log message typo ("Runing" -> "Running")
            self.log.warning("Running complexity report tool failed: %s", error)
            return {}

        if output:
            output = self._normalize_complexity_report_output(
                output, source_path)
        return output

    def javancss(self, source_path):
        """Run JavaNCSS tool http://www.kclee.de/clemens/java/javancss .

        :param source_path: path to source codes
        :return: normalized output
        """
        javancss_path = os.path.join(self.configuration.JAVANCSS_PATH, 'bin',
                                     'javancss')
        command = [javancss_path, '-all', '-xml', source_path]
        status, output, error = self._run_analyzer(command, json_output=False)

        if status != 0:
            self.log.warning("JavaNCSS tool reported some errors: %s", error)

        if output:
            # JavaNCSS emits XML; parse it before normalizing
            output = anymarkup.parse("".join(output))
            output = self._normalize_javancss_output(output)

        return output

    def python_mccabe(self, source_path):
        """Run mccabe tool https://pypi.python.org/pypi/mccabe .

        :param source_path: path to source codes
        :return: normalized output
        """
        result = {'files': []}
        # we'll compute total average cyclomatic complexity manually based as
        #  <total complexity>/<total number of functions>
        command = ['python3', '-m', 'mccabe']

        # mccabe has to be run on individual files, doesn't work recursively on directories
        for root, _, files in os.walk(source_path):
            for f in files:
                if f.endswith('.py'):
                    to_run = command + [os.path.join(root, f)]
                    status, output, error = self._run_analyzer(
                        to_run, json_output=False)
                    if status != 0:
                        self.log.info(
                            'Analyzing with Py3 failed, trying to analyze with Py2 ...'
                        )
                        to_run[0] = 'python2'
                        status, output, error = self._run_analyzer(
                            to_run, json_output=False)
                        if status != 0:
                            self.log.error(
                                'Failed to analyze with both Py2 and Py3')
                            continue
                    normalized = self._normalize_mccabe_output(output)

                    # compute file average cyclomatic complexity, add numbers
                    #  to overall package complexity
                    f_complexity = functools.reduce(
                        lambda x, y: x + y['complexity'], normalized, 0)
                    f_functions = len(normalized)
                    f_acc = round(f_complexity /
                                  f_functions, 1) if f_functions > 0 else 0
                    result['files'].append({
                        'name':
                        os.path.join(root, f)[len(source_path):].strip('/'),
                        'functions':
                        normalized,
                        'average_cyclomatic_complexity':
                        f_acc
                    })

        return result

    # A table that carries functions that should be called based on language
    # that was found by cloc, keys has to match keys in cloc output. Each
    # handler expect one argument - path to the source where sources sit, the
    # result is a dict. When you write new analyzer handlers, make sure that
    # there are no key collisions with new ones as results are aggregated under
    # "metrics" key.
    # NOTE: these are plain (unbound) function references; execute() calls
    # them explicitly as handler(self, source_path).
    # See 'Recognized languages' section at http://cloc.sourceforge.net/
    _LANGUAGE_ANALYZER_HANDLERS = {
        "JavaScript": [
            complexity_report,
        ],
        "Ruby": [],
        "Java": [
            javancss,
        ],
        "Python": [
            python_mccabe,
        ],
        "Go": [],
        "Rust": []
    }

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        source_path = ObjectCache.get_from_dict(arguments).get_sources()
        header, language_stats = self._get_generic_result(source_path)

        for language in language_stats.keys():
            for handler in self._LANGUAGE_ANALYZER_HANDLERS.get(language, []):
                metrics_data = handler(self, source_path)
                if not metrics_data:
                    continue

                if 'metrics' not in language_stats[language]:
                    language_stats[language]['metrics'] = {}

                language_stats[language]['metrics'].update(metrics_data)

        # we don't want to have possibly unique keys and we want to avoid
        # enumerating all languages that are supported by cloc - convert a dict
        # to a list of language-specific entries
        result = {'languages': []}
        for language in language_stats.keys():
            record = language_stats.get(language)
            record['language'] = language
            result['languages'].append(record)

        return {'summary': header, 'status': 'success', 'details': result}
class DownstreamUsageTask(BaseTask):
    """Queries Red Hat's internal toolchain for downstream component usage

    - queries Anitya for downstream package names
    - uses the package name and component version to query:
      - Brew for internal SRPM and build details
      - the Pulp CDN for redistribution details
    """
    _analysis_name = 'redhat_downstream'
    schema_ref = SchemaRef(_analysis_name, '2-2-1')

    # Ecosystem backends mapped to the ecosystem names known to Anitya.
    _backend_to_anitya_ecosystem = {
        EcosystemBackend.npm: 'npm',
        EcosystemBackend.maven: 'maven',
        EcosystemBackend.pypi: 'pypi',
        EcosystemBackend.rubygems: 'rubygems',
        EcosystemBackend.nuget: 'nuget'
    }

    # Per-ecosystem prefixes used to guess downstream RPM package names.
    _ecosystem_to_prefix = {
        'npm': 'nodejs',
        'pypi': 'python',
        'rubygems': 'rubygem'
    }

    # Give CLI 10 minutes to retrieve results
    _BREWUTILS_CLI_TIMEOUT = 600

    def _get_artifact_hash(self, algorithm=None):
        """Return the artifact digest from the 'digests' parent task result.

        :param algorithm: digest key to look up; defaults to 'md5'
        :return: digest string, or None when no artifact digest is available
        """
        wr = self.parent_task_result('digests')
        if wr:
            for details in wr['details']:
                if details.get('artifact'):
                    return details[algorithm or 'md5']
        return None

    @staticmethod
    def _prefix_package_name(name, ecosystem):
        """Prepend the downstream naming prefix for given ecosystem, if there is one."""
        prefix = DownstreamUsageTask._ecosystem_to_prefix.get(ecosystem, '')
        if prefix:
            return '{p}-{n}'.format(p=prefix, n=name)

        return name

    def _fetch_anitya_project(self, ecosystem, package):
        """Query Anitya for given package.

        :param ecosystem: ecosystem name
        :param package: package name
        :return: response object, or None when Anitya could not be contacted
        :raises ValueError: when the ecosystem backend is unknown to Anitya
        """
        eco_model = self.storage.get_ecosystem(ecosystem)
        backend = self._backend_to_anitya_ecosystem.get(eco_model.backend, None)
        if backend is None:
            raise ValueError('Don\'t know how to add ecosystem {e} with backend {b} to Anitya'.
                             format(e=ecosystem, b=eco_model.backend))
        api_path = '/api/by_ecosystem/{e}/{p}/'.format(e=ecosystem, p=package)
        anitya_url = self.configuration.ANITYA_URL
        try:
            return _query_anitya_url(anitya_url, api_path)
        except (requests.HTTPError, requests.ConnectionError):
            msg = 'Failed to contact Anitya server at {}'
            self.log.exception(msg.format(self.configuration.ANITYA_URL))
        return None

    def _get_cdn_metadata(self, srpm_filename):
        """Try to retrieve Pulp CDN metadata.

        :param srpm_filename: SRPM file name to look up
        :return: metadata dict, or None on any error
        """
        try:
            pulp = Pulp()
        except ValueError as e:
            self.log.error(e)
            return None
        try:
            metadata = pulp.get_cdn_metadata_for_srpm(srpm_filename)
        except Exception as e:
            self.log.exception(e)
            return None
        return metadata

    def _add_mvn_results(self, result_summary, anitya_mvn_names, version):
        """Look up downstream rebuilds of 'version' in the RH Maven GA repo.

        Stores matched versions under 'rh_mvn_matched_versions' in
        result_summary (and appends to 'all_rhsm_product_names' on a match).
        """
        def _compare_version(downstream, upstream):
            # Versions match when they are equal after stripping the
            # downstream-only "redhat" suffix.
            dv = downstream
            if 'redhat' in dv:
                # remove ".redhat-X" or "-redhat-X" suffix
                dv = dv[:dv.find('redhat') - 1]
            if dv == upstream:
                return True
            else:
                return False

        downstream_rebuilds = []

        for name in anitya_mvn_names:
            ga = MavenCoordinates.from_str(name).to_repo_url(ga_only=True)
            metadata_url = '{repo}/{pkg}/maven-metadata.xml'.format(repo=RH_MVN_GA_REPO,
                                                                    pkg=ga)
            res = requests.get(metadata_url)
            if res.status_code != 200:
                self.log.info('Metadata for package {pkg} not found in {repo} (status {code})'.
                              format(pkg=name, repo=RH_MVN_GA_REPO, code=res.status_code))
                continue
            versions = anymarkup.parse(res.text)['metadata']['versioning']['versions']['version']
            # make sure 'versions' is a list (it's a string if there is just one version)
            if not isinstance(versions, list):
                versions = [versions]
            self.log.info('Found versions {v} for package {p}'.format(v=versions, p=name))
            for v in versions:
                if _compare_version(v, version):
                    downstream_rebuilds.append(v)

        result_summary['rh_mvn_matched_versions'] = downstream_rebuilds
        if downstream_rebuilds:
            # For now, we don't distinguish products, we just use general "Middleware"
            #  for all Maven artifacts
            result_summary['all_rhsm_product_names'].append('Middleware')

    @staticmethod
    def _is_inside_rh():
        """Returns True if running on RH network, False otherwise."""
        is_inside = False
        try:
            # OPENSHIFT_DEPLOYMENT unset or 0 is treated as running inside RH
            is_inside = int(os.environ.get("OPENSHIFT_DEPLOYMENT", 0)) == 0
        except ValueError:
            pass
        return is_inside

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: result dict with 'status', 'summary' and 'details' keys
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        eco = arguments['ecosystem']
        pkg = arguments['name']
        tool_responses = {}
        result_summary = {
            'package_names': [],
            'registered_srpms': [],
            'all_rhn_channels': [],
            'all_rhsm_content_sets': [],
            'all_rhsm_product_names': []
        }
        result_data = {'status': 'error',
                       'summary': result_summary,
                       'details': tool_responses
                       }

        # bail out early; we need access to internal services or the package is
        # from Maven ecosystem, otherwise we can't comment on downstream usage
        is_maven = Ecosystem.by_name(self.storage.session, eco).is_backed_by(EcosystemBackend.maven)
        if not self._is_inside_rh() and not is_maven:
            return result_data

        self.log.debug('Fetching {e}/{p} from Anitya'.format(e=eco, p=pkg))
        res = self._fetch_anitya_project(eco, pkg)
        anitya_rpm_names = []
        anitya_mvn_names = []
        if res is None:
            result_data['status'] = 'error'
        elif res.status_code == 200:
            self.log.debug('Retrieved {e}/{p} from Anitya'.format(e=eco, p=pkg))
            anitya_response = res.json()
            tool_responses['redhat_anitya'] = anitya_response
            # For now, we assume all downstreams are ones we care about
            for entry in anitya_response['packages']:
                if entry['distro'] == RH_RPM_DISTRO_NAME:
                    anitya_rpm_names.append(entry['package_name'])
                elif entry['distro'] == RH_MVN_DISTRO_NAME:
                    anitya_mvn_names.append(entry['package_name'])
                else:
                    self.log.warning(
                        'Unknown distro {d} for downstream package {o} (package {p}) in Anitya'.
                        format(d=entry['distro'], o=entry['package_name'], p=pkg)
                    )
            self.log.debug('Candidate RPM names from Anitya: {}'.format(anitya_rpm_names))
            self.log.debug('Candidate MVN names from Anitya: {}'.format(anitya_mvn_names))
            # TODO: Report 'partial' here and switch to 'success' at the end
            result_data['status'] = 'success'
        else:
            msg = 'Failed to find Anitya project {e}/{p}. Anitya response: {r}'
            self.log.error(msg.format(e=eco, p=pkg, r=res.text))
            result_data['status'] = 'error'

        if self._is_inside_rh():
            # we have candidate downstream name mappings, check them against Brew
            seed_names = anitya_rpm_names or [self._prefix_package_name(pkg, eco)]
            self.log.debug('Checking candidate names in Brew: {}'.format(seed_names))

            args = ['brew-utils-cli', '--version', arguments['version']]
            artifact_hash = self._get_artifact_hash(algorithm='sha256')
            if artifact_hash:
                args += ['--digest', artifact_hash]
            args += seed_names

            self.log.debug("Executing command, timeout={timeout}: {cmd}".format(
                timeout=self._BREWUTILS_CLI_TIMEOUT,
                cmd=args))
            tc = TimedCommand(args)
            status, output, error = tc.run(timeout=self._BREWUTILS_CLI_TIMEOUT)
            self.log.debug("status = %s, error = %s", status, error)
            output = ''.join(output)
            self.log.debug("output = %s", output)
            if not output:
                raise TaskError("Error running command %s" % args)
            brew = json.loads(output)

            result_summary['package_names'] = brew['packages']
            result_summary['registered_srpms'] = brew['response']['registered_srpms']
            tool_responses['brew'] = brew['response']['brew']

            # we have SRPM details, fetch details on where the RPMs are shipped
            tool_responses['pulp_cdn'] = pulp_responses = []
            rhn_channels = set()
            rhsm_content_sets = set()
            rhsm_product_names = set()
            for srpm_summary in result_summary['registered_srpms']:
                srpm_filename = "{n}-{v}-{r}.src.rpm".format(n=srpm_summary['package_name'],
                                                             v=srpm_summary['version'],
                                                             r=srpm_summary['release'])
                cdn_metadata = self._get_cdn_metadata(srpm_filename)
                if cdn_metadata is None:
                    msg = 'Error getting shipping data for {e}/{p} SRPM: {srpm}'
                    self.log.error(msg.format(e=eco, p=pkg, srpm=srpm_filename))
                    continue
                pulp_responses.append(cdn_metadata)
                srpm_summary['published_in'] = cdn_metadata['rhsm_product_names']
                rhn_channels.update(cdn_metadata['rhn_channels'])
                rhsm_content_sets.update(cdn_metadata['rhsm_content_sets'])
                rhsm_product_names.update(cdn_metadata['rhsm_product_names'])
            result_summary['all_rhn_channels'] = sorted(rhn_channels)
            result_summary['all_rhsm_content_sets'] = sorted(rhsm_content_sets)
            result_summary['all_rhsm_product_names'] = sorted(rhsm_product_names)

        self._add_mvn_results(result_summary, anitya_mvn_names, arguments['version'])

        return result_data
# Example #23
# 0
class CVEcheckerTask(BaseTask):
    """Security issues scanner.

    Looks up known vulnerabilities for an ecosystem/name/version triple:
    Victims CVE DB for maven/pypi/npm, OSS Index for nuget.  CVSS details
    are scraped from the NVD web pages.
    """

    _analysis_name = 'security_issues'
    schema_ref = SchemaRef(_analysis_name, '3-0-1')

    # JVM heap limit for tooling spawned by this task
    dependency_check_jvm_mem_limit = '-Xmx768m'

    @staticmethod
    def _parse_severity_and_score(input_tag):
        """Parse BeautifulSoup tag and return CVE's score and severity from it.

        :param input_tag: bs4 tag whose text looks like "<score> <SEVERITY>"
        :return: (float, str) tuple of score and lower-cased severity
        """
        score, severity = input_tag.text.strip().split()
        return float(score), severity.lower()

    @staticmethod
    def _parse_vector(input_tag):
        """Parse BeautifulSoup tag and return CVE vector from it.

        Takes the first whitespace-separated token of the tag text and
        strips the surrounding parentheses.
        """
        vector, *_ = input_tag.text.split()
        return vector.strip().lstrip('(').rstrip(')')

    @staticmethod
    def get_cve_impact(cve_id):
        """Get more details about cve_id from NVD.

        Scrapes https://nvd.nist.gov/vuln/detail/<cve_id>; CVSS v3.0 values
        are preferred over v2 when both are present on the page.

        :param cve_id: str, CVE identifier; falsy values yield empty defaults
        :return: (score, vector, severity) tuple
        :raises IOError: when the NVD page cannot be fetched
        """
        # TODO: reduce cyclomatic complexity
        score = 0
        vector = ''
        severity = ''
        if cve_id:
            url = "https://nvd.nist.gov/vuln/detail/{cve_id}".format(
                cve_id=cve_id)
            response = requests.get(url)
            if response.status_code != 200:
                raise IOError('Unable to reach URL: {url}'.format(url=url))

            score_v3 = score_v2 = 0
            severity_v3 = severity_v2 = vector_v3 = vector_v2 = ''
            page = BeautifulSoup(response.text, 'html.parser')
            for tag in page.find_all():
                if tag.attrs.get(
                        'data-testid') == 'vuln-cvssv3-base-score-link':
                    score_v3, severity_v3 = CVEcheckerTask._parse_severity_and_score(
                        tag)
                elif tag.attrs.get('data-testid') == 'vuln-cvssv3-vector':
                    # I am prefixing CVSS:3.0 to preserve compatibility
                    vector_v3 = "CVSS:3.0/{}".format(
                        CVEcheckerTask._parse_vector(tag))
                elif tag.attrs.get(
                        'data-testid') == 'vuln-cvssv2-base-score-link':
                    score_v2, severity_v2 = CVEcheckerTask._parse_severity_and_score(
                        tag)
                elif tag.attrs.get('data-testid') == 'vuln-cvssv2-vector':
                    vector_v2 = CVEcheckerTask._parse_vector(tag)
            # Prefer CVSS v3.0 over v2
            score = score_v3 or score_v2
            severity = severity_v3 or severity_v2
            vector = vector_v3 or vector_v2

        return score, vector, severity

    @staticmethod
    def _filter_ossindex_fields(entry):
        """Create a result record for an OSS Index entry."""
        score, vector, severity = CVEcheckerTask.get_cve_impact(
            entry.get('cve'))
        result = {
            'id': entry.get('cve') or entry.get('title'),
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                'score': score,
                'vector': vector
            },
            'severity': severity
        }
        return result

    @staticmethod
    def _filter_victims_db_entry(entry):
        """Create a result record for a Victims CVE DB entry.

        :return: dict result record, or None when the entry has no 'cve' key
        """
        if 'cve' not in entry:
            return None
        _, vector, severity = CVEcheckerTask.get_cve_impact(entry.get('cve'))
        result = {
            'id': 'CVE-' + entry['cve'],
            'description': entry.get('description'),
            'references': entry.get('references'),
            'cvss': {
                # score is taken from the DB itself, v3 preferred over v2
                'score': entry.get('cvss_v3') or entry.get('cvss_v2'),
                'vector': vector
            },
            'severity': severity,
            'attribution':
            "https://github.com/victims/victims-cve-db, CC BY-SA 4.0, modified"
        }
        return result

    @staticmethod
    def query_url(url):
        """Query url and return parsed json.

        :raises requests.HTTPError: on non-2xx responses
        """
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _query_ossindex_package(ecosystem, name):
        """Get vulnerabilities for a given package ecosystem:name from OSSIndex."""
        url = "https://ossindex.net/v2.0/package/{pm}/{package}".format(
            pm=ecosystem, package=name)
        return CVEcheckerTask.query_url(url)

    @staticmethod
    def query_ossindex_vulnerability_fromtill(ecosystem,
                                              from_time=0,
                                              till_time=-1):
        """From OSSIndex get vulnerabilities which changed between from_time and till_time.

        :param ecosystem: str, OSS Index package-manager identifier
        :param from_time: number, lower bound in seconds
        :param till_time: number, upper bound in seconds (-1 means "no upper bound")
        :return: list of packages that still have vulnerabilities after filtering
        """
        # OSS Index uses timestamp in milliseconds
        from_time = int(from_time * 1000)
        till_time = int(till_time * 1000)
        url = "https://ossindex.net/v2.0/vulnerability/pm/{pm}/fromtill/{from_time}/{till_time}".\
            format(pm=ecosystem, from_time=from_time, till_time=till_time)
        packages = []
        while url:
            response = CVEcheckerTask.query_url(url)
            for package in response.get('packages', []):
                # Sanity check:
                # the response always contains at least one entry, even if it should be empty
                # (when 'from_time' is higher than 'updated' time of all entries in db).
                # Build a filtered list instead of calling remove() while iterating
                # the same list, which silently skips the element after each removal.
                fresh = [vuln for vuln in package.get('vulnerabilities', [])
                         if int(vuln.get('updated')) >= from_time]
                if fresh:
                    package['vulnerabilities'] = fresh
                    packages.append(package)

            url = response.get('next')
        return packages

    def _query_ossindex(self, arguments):
        """Query OSS Index REST API.

        :param arguments: dict with 'ecosystem', 'name' and 'version' keys
        :return: result dict with 'summary', 'status' and 'details' keys
        """
        entries = {}
        solver = get_ecosystem_solver(self.storage.get_ecosystem(
            arguments['ecosystem']),
                                      with_parser=OSSIndexDependencyParser())
        for package in self._query_ossindex_package(arguments['ecosystem'],
                                                    arguments['name']):
            for vulnerability in package.get('vulnerabilities', []):
                for version_string in vulnerability.get('versions', []):
                    try:
                        affected_versions = solver.solve([
                            "{} {}".format(arguments['name'], version_string)
                        ],
                                                         all_versions=True)
                    except Exception:
                        self.log.exception("Failed to resolve %r for %s:%s",
                                           version_string,
                                           arguments['ecosystem'],
                                           arguments['name'])
                        continue
                    if arguments['version'] in affected_versions.get(
                            arguments['name'], []):
                        entry = self._filter_ossindex_fields(vulnerability)
                        if entry.get('id'):
                            entries[entry['id']] = entry

        return {
            'summary': list(entries.keys()),
            'status': 'success',
            'details': list(entries.values())
        }

    @staticmethod
    def update_victims_cve_db_on_s3():
        """Update Victims CVE DB on S3."""
        with VictimsDB.build_from_git() as db:
            db.store_on_s3()

    def _query_victims(self, arguments, ecosystem):
        """Check EPV with VictimsDB.

        Falls back to cloning the DB from GitHub (and caching the clone on
        S3) when no copy is available on S3 yet.
        """
        db = None
        try:
            db = VictimsDB.from_s3()
            if not db:
                self.log.debug(
                    'No Victims CVE DB found on S3, cloning from github')
                db = VictimsDB.build_from_git()
                db.store_on_s3()

            return db.get_vulnerabilities_for_epv(ecosystem, arguments['name'],
                                                  arguments['version'])
        finally:
            # always release the DB handle, even when the query raises
            if db:
                db.close()

    def _victims_scan(self, arguments, ecosystem):
        """Run Victims CVE DB CLI and collect de-duplicated results."""
        results = {'summary': [], 'status': 'success', 'details': []}
        victims_cve_db_results = self._query_victims(arguments, ecosystem)
        for vulnerability in victims_cve_db_results:
            vulnerability = self._filter_victims_db_entry(vulnerability)
            if not vulnerability:
                continue
            if vulnerability['id'] not in results['summary']:
                results['summary'].append(vulnerability['id'])
                results['details'].append(vulnerability)
        return results

    def _nuget_scan(self, arguments):
        """Get vulnerabilities info about given nuget package."""
        return self._query_ossindex(arguments)

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        :raises RequestError: for ecosystems other than maven/pypi/npm/nuget
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))

        rdb = StoragePool.get_connected_storage('BayesianPostgres')
        ecosystem = Ecosystem.by_name(rdb.session, arguments.get('ecosystem'))

        if arguments['ecosystem'] in ('maven', 'pypi', 'npm'):
            return self._victims_scan(arguments, ecosystem)
        elif arguments['ecosystem'] == 'nuget':
            return self._nuget_scan(arguments)
        else:
            raise RequestError('Unsupported ecosystem')
예제 #24
0
class GithubTask(BaseTask):
    """Collects statistics using the GitHub API and the repo landing page."""

    _analysis_name = "github_details"
    schema_ref = SchemaRef(_analysis_name, '1-0-4')
    # used for testing
    _repo_name = None
    _repo_url = None

    @classmethod
    def create_test_instance(cls, repo_name, repo_url):
        """Create a task instance preconfigured with a repo (testing only)."""
        instance = super().create_test_instance()
        # set for testing as we are not querying DB for mercator results
        instance._repo_name = repo_name
        instance._repo_url = repo_url
        return instance

    @staticmethod
    def _retry_no_cached(call, sleep_time=2, retry_count=10):
        """Deal with cached results from GitHub as PyGitHub does not check this.

        https://developer.github.com/v3/repos/statistics/#a-word-about-caching

        :param call: zero-argument callable whose result may be empty while
                     GitHub is still computing statistics
        :param sleep_time: int, seconds to sleep between attempts
        :param retry_count: int, maximum number of attempts
        :return: the first truthy result, or the last (falsy) one
        """
        result = None

        for _ in range(retry_count):
            result = call()
            if result:
                break
            time.sleep(sleep_time)

        return result

    @classmethod
    def _get_last_years_commits(cls, repo):
        """Return weekly commit totals for the last year (empty list on failure)."""
        activity = cls._retry_no_cached(repo.get_stats_commit_activity)
        if not activity:
            return []
        return [week.total for week in activity]

    @staticmethod
    def _rate_limit_exceeded(gh):
        """Return True when the client has no remaining API calls."""
        return gh.rate_limiting[0] == 0

    @classmethod
    def _get_repo_stats(cls, repo):
        """Collect contributor count plus selected raw repository properties."""
        # len(list()) is workaround for totalCount being None
        # https://github.com/PyGithub/PyGithub/issues/415
        contributors = cls._retry_no_cached(repo.get_contributors)
        stats = {
            'contributors_count':
            len(list(contributors)) if contributors is not None else 'N/A'
        }
        for prop in REPO_PROPS:
            stats[prop] = repo.raw_data.get(prop, -1)
        return stats

    def _get_repo_name(self, url):
        """Retrieve GitHub repo from a preceding Mercator scan."""
        parsed = parse_gh_repo(url)
        if not parsed:
            self.log.debug('Could not parse Github repo URL %s', url)
        else:
            self._repo_url = 'https://github.com/' + parsed
        return parsed

    def _get_topics(self):
        """Scrape the repository's topic tags from its GitHub landing page."""
        if not self._repo_url:
            return []

        pop = requests.get('{url}'.format(url=self._repo_url))
        poppage = BeautifulSoup(pop.text, 'html.parser')

        topics = []
        for link in poppage.find_all("a", class_="topic-tag"):
            topics.append(link.text.strip())

        return topics

    def execute(self, arguments):
        """Task entrypoint.

        :param arguments: dict with at least 'url' pointing at the upstream repo
        :return: dict with 'status', 'summary' and 'details' keys
        """
        result_data = {'status': 'unknown', 'summary': [], 'details': {}}
        # For testing purposes, a repo may be specified at task creation time
        if self._repo_name is None:
            # Otherwise, get the repo name from earlier Mercator scan results
            self._repo_name = self._get_repo_name(arguments['url'])
            if self._repo_name is None:
                # Not a GitHub hosted project
                return result_data

        token = self.configuration.GITHUB_TOKEN
        if not token:
            if self._rate_limit_exceeded(github.Github()):
                self.log.error(
                    "No Github API token provided (GITHUB_TOKEN env variable), "
                    "and rate limit exceeded! "
                    "Ending now to not wait endlessly")
                result_data['status'] = 'error'
                return result_data
            else:
                self.log.warning(
                    "No Github API token provided (GITHUB_TOKEN env variable), "
                    "requests will be unauthenticated, "
                    "i.e. limited to 60 per hour")
        else:
            # there might be more comma-separated tokens, randomly select one
            token = random.choice(token.split(',')).strip()

        gh = github.Github(login_or_token=token)
        try:
            repo = gh.get_repo(full_name_or_id=self._repo_name, lazy=False)
        except github.GithubException:
            # lazy %-style args instead of eager string interpolation
            self.log.error("Failed to get repo %s", self._repo_name)
            result_data['status'] = 'error'
            return result_data

        result_data['status'] = 'success'

        issues = {}
        # Get Repo Statistics
        notoriety = self._get_repo_stats(repo)
        if notoriety:
            issues.update(notoriety)
        issues['topics'] = self._get_topics()

        # Get Commit Statistics
        last_year_commits = self._get_last_years_commits(repo)
        commits = {
            'last_year_commits': {
                'sum': sum(last_year_commits),
                'weekly': last_year_commits
            }
        }
        issues.update(commits)
        result_data['details'] = issues
        return result_data
예제 #25
0
class StackAnalysesById(ResourceWithSchema):
    """Read-only endpoint returning aggregated stack-analysis results."""

    schema_ref = SchemaRef('stack_analyses', '2-1-3')

    def get(self, external_request_id):
        """Build the analysis response for the given request ID.

        :param external_request_id: str, ID assigned when the analysis was submitted
        :return: dict with status, timestamps and per-manifest results
        :raises HTTPError: 404 for an unknown ID, 202 while the analysis is
                           still in progress, 500 on internal errors
        """
        manifest_appstackid_map = {}
        try:
            results = rdb.session.query(StackAnalysisRequest)\
                                 .filter(StackAnalysisRequest.id == external_request_id)
            if results.count() <= 0:
                raise HTTPError(404, "Invalid request ID '{id}' received".format(id=external_request_id))

            row = results.first().to_dict()
            submitted_at = row["submitTime"]
            request_json = json.loads(row["requestJson"])

            # remember which manifest file maps to which appstack, so the
            # recommendation service can be queried per manifest below
            for manifest in request_json["manifest"]:
                if manifest.get('appstack_id', 0):
                    manifest_appstackid_map[manifest["filename"]] = manifest["appstack_id"]

        except SQLAlchemyError:
            raise HTTPError(500, "Error fetching data for request ID '{id}'".format(id=external_request_id))

        try:
            results = rdb.session.query(WorkerResult)\
                                 .filter(WorkerResult.external_request_id == external_request_id,
                                         WorkerResult.worker == "dependency_aggregator")
            if results.count() <= 0:
                raise HTTPError(202, "Analysis for request ID '{t}' is in progress".format(t=external_request_id))
        except SQLAlchemyError:
            raise HTTPError(500, "Worker result for request ID '{t}' doesn't exist yet".format(t=external_request_id))

        try:
            if results.count() > 0:
                result = results.first().to_dict()
                audit = result["task_result"]["_audit"]
                manifest_response = []

                # TODO: this will probably need some refactoring

                for manifest in result["task_result"]["result"]:
                    # enrich every component with latest version, dependents
                    # count and relative usage rank
                    for component in manifest["components"]:
                        component["latest_version"] = safe_get_latest_version(component["ecosystem"],
                                                                              component["name"])
                        component["dependents_count"] = get_dependents_count(component["ecosystem"],
                                                                             component["name"],
                                                                             component["version"], rdb.session)
                        rank = get_component_percentile_rank(
                            component["ecosystem"],
                            component["name"],
                            component["version"],
                            rdb.session
                        )
                        component["relative_usage"] = usage_rank2str(rank)
                    manifest_appstack_id = manifest_appstackid_map.get(manifest["manifest_name"], '')
                    if manifest_appstack_id != '':
                        url = current_app.config['BAYESIAN_ANALYTICS_URL']
                        endpoint = "{analytics_baseurl}/api/v1.0/recommendation/{appstack_id}"\
                                   .format(analytics_baseurl=url, appstack_id=manifest_appstack_id)
                        resp = requests.get(endpoint)
                        if resp.status_code == 200:
                            recommendation = resp.json()

                            # Adding URI of the stacks to the recommendation
                            if recommendation.get("input_stack", {}).get("appstack_id", "") != "":
                                uri = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}"\
                                      .format(analytics_baseurl=url,
                                              appstack_id=recommendation["input_stack"]["appstack_id"])
                                recommendation["input_stack"]["uri"] = uri

                            if recommendation.get("recommendations", {}).get("similar_stacks", "") != "":
                                for r in recommendation["recommendations"]["similar_stacks"]:
                                    if r["stack_id"] != "":
                                        r["uri"] = "{analytics_baseurl}/api/v1.0/appstack/{appstack_id}"\
                                            .format(analytics_baseurl=url, appstack_id=r["stack_id"])
                            manifest["recommendation"] = recommendation
                        else:
                            # 'warning' instead of deprecated 'warn' alias
                            current_app.logger.warning("{status}: {error}".format(status=resp.status_code,
                                                                                  error=resp.content))

                    manifest_response.append(manifest)
                response = {
                    "status": result["task_result"]["status"],
                    "submitted_at": submitted_at,
                    "started_at": audit["started_at"],
                    "finished_at": audit["ended_at"],
                    "request_id": result["external_request_id"],
                    "result": manifest_response
                }
                return response
        except Exception:
            # narrowed from a bare 'except:' which also swallowed
            # SystemExit/KeyboardInterrupt
            raise HTTPError(500, "Error creating response for request {t}".format(t=external_request_id))
 def test_next_model(self):
     """next_model() bumps the model (first) version component."""
     original = SchemaRef("example", "1-0-0")
     bumped = original.next_model()
     assert bumped == SchemaRef("example", "2-0-0")
 def test_next_revision(self):
     """next_revision() bumps the revision (second) version component."""
     original = SchemaRef("example", "1-0-0")
     bumped = original.next_revision()
     assert bumped == SchemaRef("example", "1-1-0")
 def test_next_addition(self):
     """next_addition() bumps the addition (third) version component."""
     original = SchemaRef("example", "1-0-0")
     bumped = original.next_addition()
     assert bumped == SchemaRef("example", "1-0-1")
class LicenseCheckTask(BaseTask):
    """Check licences of all files of a package."""

    _analysis_name = 'source_licenses'
    schema_ref = SchemaRef(_analysis_name, '3-0-0')

    @staticmethod
    def process_output(data):
        """Transform raw scancode output into a per-license mapping.

        :param data: dict, parsed scancode JSON output with a 'files' list
        :return: the same dict, with 'files' replaced by a 'licenses'
                 mapping of license short names to license info plus the
                 paths where each license was detected
        """
        # not interested in these
        keys_to_remove = [
            'start_line', 'end_line', 'matched_rule', 'score', 'key'
        ]
        # 'files' is a list of file paths along with info about detected licenses.
        # If there's the same license text in most files, then almost the same license info
        # accompanies each file path.
        # Therefore transform it into dict of licenses (keys) along with info about the license plus
        # paths of files where the license has been detected.
        licenses = {}
        for file in data.pop('files'):
            for _license in file['licenses']:
                # short_name becomes key
                short_name = _license.pop('short_name')
                if short_name not in licenses:
                    for key in keys_to_remove:
                        del _license[key]
                    _license['paths'] = {file['path']}
                    licenses[short_name] = _license
                else:
                    licenses[short_name]['paths'].add(file['path'])
        for license_info in licenses.values():
            license_info['paths'] = list(license_info['paths'])  # set -> list
        data['licenses'] = licenses

        # drop scancode invocation details; tolerate their absence
        data.pop('scancode_options', None)
        return data

    @staticmethod
    def run_scancode(scan_path):
        """Run the scancode CLI on scan_path and post-process its output.

        :param scan_path: str, directory to scan
        :return: result dict with 'status', 'summary' and 'details' keys
        :raises FatalTaskError: when scancode exits with a non-zero status
        """
        result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
        command = [
            path.join(configuration.SCANCODE_PATH, 'scancode'),
            # Scan for licenses
            '--license',
            # Do not return license matches with scores lower than this score
            '--license-score',
            configuration.SCANCODE_LICENSE_SCORE,
            # Files without findings are omitted
            '--only-findings',
            # Use n parallel processes
            '--processes',
            configuration.SCANCODE_PROCESSES,
            # Do not print summary or progress messages
            '--quiet',
            # Strip the root directory segment of all paths
            '--strip-root',
            # Stop scanning a file if scanning takes longer than a timeout in seconds
            '--timeout',
            configuration.SCANCODE_TIMEOUT,
            scan_path
        ]
        for ignore_pattern in configuration.SCANCODE_IGNORE:
            command += ['--ignore', '{}'.format(ignore_pattern)]
        with username():
            tc = TimedCommand(command)
            status, output, error = tc.run(is_json=True, timeout=1200)
            if status != 0:
                raise FatalTaskError(
                    "Error (%s) during running command %s: %r" %
                    (str(status), command, error))

        details = LicenseCheckTask.process_output(output)
        result_data['details'] = details
        result_data['status'] = 'success'
        result_data['summary'] = {
            'sure_licenses': list(details['licenses'].keys())
        }

        return result_data

    def execute(self, arguments):
        """Task entrypoint.

        :param arguments: dict with 'ecosystem', 'name' and 'version' keys
        :return: result dict with detected licenses
        """
        self._strict_assert(arguments.get('ecosystem'))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        eco = arguments['ecosystem']
        pkg = arguments['name']
        ver = arguments['version']

        try:
            cache_path = ObjectCache.get_from_dict(arguments).get_sources()
        except Exception:
            # only maven has a binary-jar fallback; anything else is fatal
            if arguments['ecosystem'] != 'maven':
                self.log.error(
                    'Could not get sources for package {e}/{p}/{v}'.format(
                        e=eco, p=pkg, v=ver))
                raise
            self.log.info('Could not get sources for maven package {p}/{v},'
                          'will try to run on binary jar'.format(p=pkg, v=ver))
            cache_path = ObjectCache.get_from_dict(
                arguments).get_extracted_source_tarball()

        result_data = self.run_scancode(cache_path)
        return result_data
class BlackDuckHub(object):
    """
    Hub provides access around Black Duck Hub APIs
    """

    # The authentication token is returned in a cookie with this name
    COOKIE_NAME = 'JSESSIONID'

    def __init__(self, url):
        """Initialize the hub client.

        :param url: str, base URL of the Black Duck Hub (with trailing slash,
                    as `_api` concatenates paths directly onto it)
        """
        self._url = url
        # session is established lazily via connect_session()
        self._session = None

    @property
    def url(self):
        """URL of the Hub with trailing slash, example `https://hub.blackducksoftware.com/`."""
        return self._url

    def _api(self, param):
        """Build a full API URL by appending *param* to the hub base URL.

        :param param: str, API path (and query string) to append
        :return: str, absolute URL for the API call
        """
        return "{0}{1}".format(self.url, param)

    def _api_get(self, param):
        """
        Perform a get request against the API using local `_session`

        The session's API token is sent via the JSESSIONID cookie;
        TLS certificate verification is disabled (verify=False).

        :param param: str, API path passed through `_api` to build the URL
        :return: requests.Response object for the GET request
        """
        return get(self._api(param), cookies={self.COOKIE_NAME: self._session.api_token.token},
                   verify=False)

    def connect_session(self, username, password):
        """
        Establishes a new session with the HUB using the provided credentials

        :param username: str
        :param password: str
        :return: BlackDuckSession, a session object
        :raises: BlackDuckSessionException
        """
        credentials = {
            'j_username': username,
            'j_password': password
        }
        response = post(self._api("j_spring_security_check"),
                        data=credentials,
                        verify=False)

        # a successful login answers 204 No Content with the auth cookie set
        if response.status_code != 204:
            raise BlackDuckSessionException("Black Duck authentication error")

        api_token = BlackDuckApiToken(response.cookies.get(self.COOKIE_NAME))
        self._session = BlackDuckSession(api_token)

        return self._session

    @needs_session
    def find_project(self, name):
        """
        Find a Project by Name

        :param name: str, name of the project
        :return: BlackDuckProject, found project or `None`
        :raises: BlackDuckSessionException
        """
        response = self._api_get('api/v1/projects?name=' + name)
        if response.status_code != 200:
            return None
        return BlackDuckProject(response.json())

    @needs_session
    @schema.result(SchemaRef("blackduck-project-list", "1-0-0"))
    def _list_projects_json(self):
        """Fetch the raw project-list JSON from the hub."""
        response = self._api_get('api/projects/')
        if response.status_code != 200:
            raise BlackDuckException('Unable to list projects')
        return response.json()

    def list_projects(self):
        """
        Lists all projects valid for the current session

        :return: List[BlackDuckProject], list of projects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        # look each project up by name so full project objects are returned
        return [self.find_project(project['name'])
                for project in self._list_projects_json()]

    @needs_session
    def get_releases(self, project_id):
        """
        Get all releases of the given project

        :param project_id: BlackDuckProject or str, project reference or ID
        :return: Dict[str, BlackDuckRelease], a map of version strings to release objects
        :raises: BlackDuckException, BlackDuckSessionException
        """
        if isinstance(project_id, BlackDuckProject):
            project_id = project_id.id

        response = self._api_get(
            'api/v1/projects/{id}/version-summaries'.format(id=project_id))
        if response.status_code != 200:
            raise BlackDuckException('Unable to fetch releases for ' + project_id)

        payload = response.json()
        return {item['version']: BlackDuckRelease(item, project_id)
                for item in payload['items']}

    @needs_session
    @schema.result(SchemaRef("blackduck-vulnerable-bom", "1-0-0"))
    def get_release_bom_json(self, release_id):
        """
        Get the Bill of Materials for specific release

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, the BOM JSON as a dictionary
        :raises: BlackDuckException, BlackDuckSessionException

        NOTE(review): despite the docstring, passing a plain str ID fails at
        `release.project` below (str has no `project` attribute) — confirm
        whether str support is actually exercised by callers.
        """
        release = release_id

        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id

        req = self._api_get('api/projects/{p}/versions/{i}/vulnerable-bom-components'.format(
            i=release_id,
            p=release.project))

        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch release information ' + release_id + " " +
                                     release.project)

    @needs_session
    def get_release_code_locations(self, release_id):
        """
        Get code locations for given release

        :param release_id: BlackDuckRelease or str, release reference or ID
        :return: dict, response json containing the retrieved code locations list
        :raises: BlackDuckException, BlackDuckSessionException

        NOTE(review): despite the docstring, passing a plain str ID fails at
        `release.project` below (str has no `project` attribute) — confirm
        whether str support is actually exercised by callers.
        """
        release = release_id

        if isinstance(release_id, BlackDuckRelease):
            release_id = release_id.id

        req = self._api_get('api/projects/{p}/versions/{i}/codelocations'.format(i=release_id,
                                                                                 p=release.project))

        if req.status_code == 200:
            return req.json()
        else:
            raise BlackDuckException('Unable to fetch code locations for {relid} {relproj}'.
                                     format(relid=release_id, relproj=release.project))

    @needs_session
    def get_code_location_scan_summary(self, location_id):
        """
        Get scan summary for given code location ID

        :param location_id: str
        :return: dict, the code location
        :raises: BlackDuckException, BlackDuckSessionException
        """
        response = self._api_get(
            'api/codelocations/{locid}/scan-summaries'.format(locid=location_id))

        if response.status_code != 200:
            raise BlackDuckException('Unable to fetch scan summary for code location {locid}'.
                                     format(locid=location_id))
        return response.json()