Example #1
    def setUp(self):
        self.config = Config(CONF_FILE)
        mordred = Mordred(self.config)

        task = TaskProjects(self.config)
        self.assertEqual(task.execute(), None)

        self.backends = mordred._get_repos_by_backend()
        self.backend_tasks = [TaskRawDataCollection, TaskEnrich]
        self.stopper = threading.Event()
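As a usage note, a hedged sketch of how a test method might consume these fixtures; the method below is illustrative, not part of the suite:

    def test_backend_tasks(self):
        # Illustrative only: run each task class registered in setUp against
        # every backend section discovered in the projects file.
        for backend_section in self.backends:
            for task_class in self.backend_tasks:
                task = task_class(self.config, backend_section=backend_section)
                self.assertEqual(task.execute(), None)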
Example #2
    def test__get_projects_from_url(self):
        """Test downloading projects from an URL """
        setup_http_server()

        projects_url = 'http://localhost/projects.json'
        config = Config(CONF_FILE)
        config.set_param('projects', 'projects_url', projects_url)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        projects = task.get_projects()
        self.assertTrue(URL_PROJECTS_MAIN in projects)
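For context, a hedged sketch of the kind of payload setup_http_server is assumed to serve at projects_url; the real fixture lives in the test data, and URL_PROJECTS_MAIN is expected to appear among the top-level keys of the parsed result:

    # Hypothetical projects.json body: keys are project names, values map
    # data sources to repository lists.
    projects_json = {
        "grimoire": {
            "git": ["https://github.com/grimoirelab/perceval.git"]
        }
    }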
Example #3
    def test_run_eclipse(self):
        """Test whether the Task could be run getting projects from Eclipse"""
        setup_http_server()

        # Create an empty projects file for testing
        projects_file = 'test-projects-eclipse.json'

        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)

        self.assertEqual(task.execute(), None)
        self.assertEqual(len(task.get_projects().keys()), 302)

        # Let's remove some projects to track changes
        with open(ECLIPSE_PROJECTS_FILE) as eproj:
            remove_project = 'birt'
            add_project = 'new_project'
            new_projects = task.convert_from_eclipse(json.load(eproj)['projects'])
            new_projects.pop(remove_project)
            new_projects.update({add_project: {}})
            task.set_projects(new_projects)
            self.assertEqual(sorted(task.get_projects_last_diff()),
                             sorted([add_project, remove_project]))

        remove(projects_file)
Example #4
    def test_convert_from_eclipse(self):
        """Test the conversion from eclipse projects to grimoire projects"""
        setup_http_server()

        projects_file = 'test-projects-eclipse.json'
        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        projects = task.get_projects()
        self.assertTrue(TaskProjects.GLOBAL_PROJECT in projects)

        remove(projects_file)
Example #5
    def _get_repos_by_backend(self):
        """Return a dict mapping each enabled backend section to its list of repositories."""
        output = {}
        projects = TaskProjects.get_projects()

        for backend_section in Config.get_backend_sections():
            backend = Task.get_backend(backend_section)
            for pro in projects:
                if backend in projects[pro]:
                    if backend_section not in output:
                        # copy so += below does not mutate the projects data
                        output[backend_section] = list(projects[pro][backend])
                    else:
                        output[backend_section] += projects[pro][backend]

        # a backend may appear in the projects/repos file
        # but not be enabled in the mordred config file
        enabled = {}
        for k in output:
            if k in self.conf:
                enabled[k] = output[k]

        # logger.debug('repos to be retrieved: %s ', enabled)
        return enabled
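For reference, a hedged sketch of the mapping this method returns, assuming a single project that defines git and github sources with both sections enabled in the mordred config (repository URLs hypothetical):

    # Hypothetical return value of _get_repos_by_backend():
    repos_by_backend = {
        'git': ['https://github.com/grimoirelab/perceval.git'],
        'github': ['https://github.com/grimoirelab/perceval']
    }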
Example #6
    def test_run(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)
        self.assertEqual(task.execute(), None)

        # Check that the enrichment went well
        es_collection = cfg['es_collection']['url']
        es_enrichment = cfg['es_enrichment']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
        enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION][
            'enriched_index']

        r = requests.get(raw_index + "/_search?size=0")
        raw_items = r.json()['hits']['total']
        r = requests.get(enrich_index + "/_search?size=0")
        enriched_items = r.json()['hits']['total']

        # The number of raw items is bigger because the enriched items are
        # generated from a filtered subset of the raw data:
        # https://github.com/VizGrimoire/GrimoireLib
        # --filters-raw-prefix data.files.file:grimoirelib_alch data.files.file:README.md
        # see the [git] section in tests/test-projects.json
        self.assertGreater(raw_items, enriched_items)
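One portability note on the count queries above: on Elasticsearch 7 and later, hits.total is an object rather than a plain integer, so the lookup needs one extra step. A version-tolerant sketch:

    import requests

    def index_count(index_url):
        # size=0 returns only the hit count; ES >= 7 nests it under 'value'.
        total = requests.get(index_url + "/_search?size=0").json()['hits']['total']
        return total['value'] if isinstance(total, dict) else total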
Example #8
    def test_studies(self):
        """Test whether the studies configuration works """
        config = Config(CONF_FILE)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)
        self.assertEqual(task.execute(), None)

        # Configure a wrong study
        cfg['git']['studies'] = ['bad_study']
        with self.assertRaises(RuntimeError):
            task.execute()

        # Configure no studies
        cfg['git']['studies'] = None
        self.assertEqual(task.execute(), None)

        # Configure several studies
        cfg['git']['studies'] = ["enrich_demography", "enrich_areas_of_code"]
        self.assertEqual(task.execute(), None)

        # Configure several studies, one wrong
        cfg['git']['studies'] = ["enrich_demography", "enrich_areas_of_code1"]
        with self.assertRaises(RuntimeError):
            task.execute()
Example #9
    def test_initialization(self):
        """Test whether attributes are initializated"""

        config = Config(CONF_FILE)
        task = TaskProjects(config)

        self.assertEqual(task.config, config)
Example #10
    def test_convert_from_eclipse(self):
        """Test the conversion from eclipse projects to grimoire projects"""
        setup_http_server()

        projects_file = 'test-projects-eclipse.json'
        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        projects = task.get_projects()
        self.assertTrue(TaskProjects.GLOBAL_PROJECT in projects)

        self.assertEqual(projects['birt']['github'][0], 'https://github.com/eclipse/birt')

        remove(projects_file)
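The assertion on 'birt' pins down the expected output shape; a hedged sketch of one converted entry, where only the github list is confirmed by the test and the extra key is an assumption:

    # Shape of a converted grimoire project entry; the github value is taken
    # from the assertion above, the git entry is hypothetical.
    birt_entry = {
        'github': ['https://github.com/eclipse/birt'],
        'git': ['https://git.eclipse.org/r/birt/birt.git']  # hypothetical
    }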
Example #11
    def test_run(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        backend_section = GIT_BACKEND_SECTION
        task = TaskRawDataCollection(config, backend_section=backend_section)
        # We need to load the projects
        TaskProjects(config).execute()
        self.assertEqual(task.execute(), None)
Example #12
    def test_execute_from_archive(self):
        """Test fetching data from archives"""

        # proj_file -> 'test-projects-archive.json' stored within the conf file
        conf_file = 'archive-test.cfg'
        config = Config(conf_file)

        backend_sections = [
            'askbot', 'bugzilla', 'bugzillarest', 'confluence', 'discourse',
            'dockerhub', 'gerrit', 'github', 'jenkins', 'jira', 'mediawiki',
            'meetup', 'nntp', 'phabricator', 'redmine', 'rss', 'stackexchange',
            'slack', 'telegram'
        ]

        for backend_section in backend_sections:
            task = TaskRawDataCollection(config,
                                         backend_section=backend_section)
            # We need to load the projects
            TaskProjects(config).execute()
            self.assertEqual(task.execute(), None)
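One design note: the projects only need to be loaded once, not once per backend. A slightly tighter variant of the loop above, behavior otherwise unchanged:

        # Load the projects a single time, then collect for every backend.
        TaskProjects(config).execute()
        for backend_section in backend_sections:
            task = TaskRawDataCollection(config,
                                         backend_section=backend_section)
            self.assertEqual(task.execute(), None)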
Example #13
    def execute(self):
        cfg = self.config.get_conf()

        if 'gerrit' not in cfg or 'git' not in cfg:
            logger.error("gerrit and git are needed for track items.")
            return

        # We need to track the items in all git repositories from OPNFV
        git_repos = []
        repos_raw = TaskProjects.get_repos_by_backend_section("git")
        # git://git.opnfv.org/apex -> https://git.opnfv.org/apex/plain/UPSTREAM
        for repo in repos_raw:
            repo = repo.replace("git://", "https://")
            repo += "/plain/UPSTREAM"
            git_repos.append(repo)

        project = cfg['track_items']['project']
        elastic_url_enrich = cfg['es_enrichment']['url']

        # The raw data comes from upstream project
        elastic_url_raw = cfg['track_items']['upstream_raw_es_url']
        index_gerrit_raw = cfg['track_items']['raw_index_gerrit']
        index_git_raw = cfg['track_items']['raw_index_git']

        index_gerrit_enrich = cfg['gerrit']['enriched_index']
        index_git_enrich = cfg['git']['enriched_index']

        db_config = {
            "database": cfg['sortinghat']['database'],
            "user": cfg['sortinghat']['user'],
            "password": cfg['sortinghat']['password'],
            "host": cfg['sortinghat']['host']
        }

        logger.debug("Importing track items from %s ", git_repos)

        #
        # Gerrit Reviews
        #
        gerrit_uris = []
        for git_repo in git_repos:
            gerrit_uris += fetch_track_items(git_repo, self.ITEMS_DATA_SOURCE)

        gerrit_numbers = get_gerrit_numbers(gerrit_uris)
        logger.info("Total gerrit track items to be imported: %i", len(gerrit_numbers))
        enriched_items = enrich_gerrit_items(elastic_url_raw,
                                             index_gerrit_raw, gerrit_numbers,
                                             project, db_config)
        logger.info("Total gerrit track items enriched: %i", len(enriched_items))
        elastic = ElasticSearch(elastic_url_enrich, index_gerrit_enrich)
        elastic.bulk_upload(enriched_items, "uuid")

        #
        # Git Commits
        #
        commits_sha = get_commits_from_gerrit(elastic_url_raw,
                                              index_gerrit_raw, gerrit_numbers)
        logger.info("Total git track items to be checked: %i", len(commits_sha))
        enriched_items = enrich_git_items(elastic_url_raw,
                                          index_git_raw, commits_sha,
                                          project, db_config)
        logger.info("Total git track items enriched: %i", len(enriched_items))
        elastic = ElasticSearch(elastic_url_enrich, index_git_enrich)
        elastic.bulk_upload(enriched_items, "uuid")
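For reference, the configuration keys this task reads, gathered from the method above into one hedged sketch; every value is a placeholder, not a real setting:

    cfg = {
        'track_items': {
            'project': 'opnfv',                                 # hypothetical
            'upstream_raw_es_url': 'http://upstream-es.example.com',
            'raw_index_gerrit': 'gerrit_raw',
            'raw_index_git': 'git_raw'
        },
        'gerrit': {'enriched_index': 'gerrit_enriched'},
        'git': {'enriched_index': 'git_enriched'},
        'es_enrichment': {'url': 'http://localhost:9200'},
        'sortinghat': {'database': 'shdb', 'user': 'root',
                       'password': '', 'host': 'localhost'}
    }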
Example #15
    def __enrich_items(self):

        time_start = time.time()

        # logger.info('%s starts for %s ', 'enrichment', self.backend_section)
        logger.info('[%s] enrichment starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in the projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No enrich repositories for %s", self.backend_section)

        for repo in repos:
            # First process p2o params from repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')
            filters_raw_prefix = p2o_args.get('filters-raw-prefix')
            jenkins_rename_file = p2o_args.get('jenkins-rename-file')
            url = p2o_args['url']
            # Second process perceval params from repo
            backend_args = self._compose_perceval_params(self.backend_section, url)
            studies_args = None

            if 'studies' in self.conf[self.backend_section] and self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            try:
                es_col_url = self._get_collection_url()
                logger.debug('[%s] enrichment starts for %s', self.backend_section, repo)
                backend = self.get_backend(self.backend_section)
                enrich_backend(es_col_url, self.clean, backend, backend_args,
                               cfg[self.backend_section]['raw_index'],
                               cfg[self.backend_section]['enriched_index'],
                               None,  # projects_db is deprecated
                               cfg['projects']['projects_file'],
                               cfg['sortinghat']['database'],
                               no_incremental, only_identities,
                               github_token,
                               False,  # studies are executed in its own Task
                               only_studies,
                               cfg['es_enrichment']['url'],
                               None,  # args.events_enrich
                               cfg['sortinghat']['user'],
                               cfg['sortinghat']['password'],
                               cfg['sortinghat']['host'],
                               None,  # args.refresh_projects,
                               None,  # args.refresh_identities,
                               author_id=None,
                               author_uuid=None,
                               filter_raw=filter_raw,
                               filters_raw_prefix=filters_raw_prefix,
                               jenkins_rename_file=jenkins_rename_file,
                               unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                               pair_programming=pair_programming,
                               studies_args=studies_args)
            except Exception as ex:
                logger.error("Something went wrong producing enriched data for %s . "
                             "Using the backend_args: %s ", self.backend_section, str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

            # Let's try to create the aliases for the enriched index
            if not self.enrich_aliases:
                logger.debug("Creating aliases after enrich")
                task_aliases = TaskPanelsAliases(self.config)
                task_aliases.set_backend_section(self.backend_section)
                task_aliases.execute()
                logger.debug("Done creating aliases after enrich")
                self.enrich_aliases = True

        spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
        logger.info('[%s] enrichment finished in %s', self.backend_section, spent_time)
        print("Enrichment for {}: finished after {} hours".format(self.backend_section,
                                                                  spent_time))
Example #16
    def execute(self):
        def check_arthur_task(repo, backend_args):
            """ Check if a task exists in arthur and if not, create it """
            arthur_repo_json = self.__create_arthur_json(repo, backend_args)
            logger.debug('JSON config for arthur %s',
                         json.dumps(arthur_repo_json, indent=True))

            # First check whether the task already exists
            try:
                r = requests.post(self.arthur_url + "/tasks")
            except requests.exceptions.ConnectionError:
                logger.error("Cannot connect to %s", self.arthur_url)
                raise RuntimeError("Cannot connect to " + self.arthur_url)

            task_ids = [task['task_id'] for task in r.json()['tasks']]
            new_task_ids = [
                task['task_id'] for task in arthur_repo_json['tasks']
            ]
            # TODO: if a task already exists, maybe we should delete and re-add it
            already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
            if already_tasks:
                logger.warning(
                    "Tasks not added to arthur because there are already existing tasks %s",
                    already_tasks)
            else:
                r = requests.post(self.arthur_url + "/add",
                                  json=arthur_repo_json)
                r.raise_for_status()
                logger.info('[%s] collection configured in arthur for %s',
                            self.backend_section, repo)

        def collect_arthur_items(repo):
            aitems = self.__feed_backend_arthur(repo)
            if not aitems:
                return
            connector = get_connector_from_name(self.backend_section)
            klass = connector[1]  # Ocean backend for the connector
            ocean_backend = klass(None)
            es_col_url = self._get_collection_url()
            es_index = self.conf[self.backend_section]['raw_index']
            clean = False
            elastic_ocean = get_elastic(es_col_url, es_index, clean,
                                        ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)
            ocean_backend.feed(arthur_items=aitems)

        cfg = self.config.get_conf()

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        logger.info('Programming arthur for [%s] raw data collection',
                    self.backend_section)
        clean = False

        fetch_archive = False
        if ('fetch-archive' in self.conf[self.backend_section]
                and self.conf[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in the projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            # If the repo already exists, don't try to add it to arthur
            tag = self.backend_tag(repo)
            if tag not in self.arthur_items:
                self.arthur_items[tag] = []
                p2o_args = self._compose_p2o_params(self.backend_section, repo)
                filter_raw = p2o_args.get('filter-raw')
                if filter_raw:
                    # If filter-raw exists the goal is to enrich already collected
                    # data, so don't collect anything
                    logging.warning("Not collecting filter raw repository: %s",
                                    repo)
                    continue
                backend_args = self._compose_perceval_params(
                    self.backend_section, repo)
                logger.debug(backend_args)

                check_arthur_task(repo, backend_args)

            collect_arthur_items(repo)
Example #17
    def test_run(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)
        self.assertEqual(len(task.get_projects().keys()), 1)
Example #18
    def execute(self):
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return

        time_start = time.time()
        logger.info('[%s] raw data collection starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in the projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')

            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.debug('[%s] collection starts for %s', self.backend_section,
                         repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron
            try:
                feed_backend(es_col_url, clean, fetch_archive, backend,
                             backend_args, cfg[ds]['raw_index'],
                             cfg[ds]['enriched_index'], project)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s", ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)

        spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
        logger.info('[%s] Data collection finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))
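Taken together, the examples above follow a single pipeline: load the projects, collect raw data, then enrich it. A minimal end-to-end sketch under the same assumptions the tests make (CONF_FILE and the GIT_BACKEND_SECTION constant):

    config = Config(CONF_FILE)
    # Projects must be loaded first so later tasks can resolve repositories.
    TaskProjects(config).execute()
    TaskRawDataCollection(config, backend_section=GIT_BACKEND_SECTION).execute()
    TaskEnrich(config, backend_section=GIT_BACKEND_SECTION).execute()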