예제 #1
0
    def __load_studies(self):
        """Collect the configuration of the studies enabled for this backend section.

        Reads the ``studies`` entry of ``self.conf[self.backend_section]``,
        ignores names that are empty or only whitespace, and pairs every
        remaining study with its own section of ``self.conf``.

        :returns: a list with one dict per study, holding the keys ``name``
            (the study section name), ``type`` (the part of the name before
            the first ``:``) and ``params`` (the study section taken from the
            configuration); ``None`` when no study is left after filtering
        :raises DataEnrichmentError: when a study has no section of its own
            in the configuration
        """
        studies = [
            study for study in self.conf[self.backend_section]['studies']
            if study.strip()
        ]
        if not studies:
            # Lazy %-args: the message is only rendered if DEBUG is enabled.
            logger.debug('No studies for %s', self.backend_section)
            return None

        studies_args = []

        for study in studies:
            if study not in self.conf:
                # The same text is logged and carried by the exception.
                msg = 'Missing config for study %s:' % study
                logger.error(msg)
                raise DataEnrichmentError(msg)

            studies_args.append({
                "name": study,
                "type": study.split(":")[0],
                "params": self.conf[study]
            })

        return studies_args
예제 #2
0
    def __enrich_items(self):
        """Enrich the raw items of every repository of this backend section.

        Applies the optional ``scroll_size`` and ``bulk_size`` values of the
        ``general`` configuration section, then calls ``enrich_backend`` once
        per repository returned by
        ``TaskProjects.get_repos_by_backend_section``. After a repository has
        been enriched, the panel aliases are created through
        ``TaskPanelsAliases`` unless ``self.enrich_aliases`` is already set.
        Start and end of the phase are both logged and printed.

        :raises DataEnrichmentError: when anything inside the enrichment step
            of a repository fails; the original exception is attached as the
            cause
        """
        time_start = time.time()

        logger.info('[%s] enrichment starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        # Optional tuning values are set directly on ElasticItems /
        # ElasticSearch, not on an instance.
        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        only_studies = False
        only_identities = False

        # repos could change between executions because changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No enrich repositories for %s",
                           self.backend_section)

        for repo in repos:
            # First process p2o params from repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None
            filters_raw_prefix = p2o_args[
                'filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None
            jenkins_rename_file = p2o_args[
                'jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second process perceval params from repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            try:
                es_col_url = self._get_collection_url()
                logger.debug('[%s] enrichment starts for %s',
                             self.backend_section, repo)
                backend = self.get_backend(self.backend_section)
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    cfg['sortinghat']['database'],
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in its own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    cfg['sortinghat']['user'],
                    cfg['sortinghat']['password'],
                    cfg['sortinghat']['host'],
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    filters_raw_prefix=filters_raw_prefix,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                    pair_programming=pair_programming,
                    studies_args=studies_args)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s . "
                    "Using the backend_args: %s ", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                # Chain the original error so its traceback is not lost.
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section) from ex

            # Let's try to create the aliases for the enriched index
            if not self.enrich_aliases:
                logger.debug("Creating aliases after enrich")
                task_aliases = TaskPanelsAliases(self.config)
                task_aliases.set_backend_section(self.backend_section)
                task_aliases.execute()
                logger.debug("Done creating aliases after enrich")
                self.enrich_aliases = True

        # Build HH:MM:SS by hand: going through time.gmtime() would wrap the
        # hour count back to 00 for a run lasting 24 hours or more.
        elapsed = int(time.time() - time_start)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        spent_time = "%02d:%02d:%02d" % (hours, minutes, seconds)
        logger.info('[%s] enrichment finished in %s', self.backend_section,
                    spent_time)
        print("Enrichment for {}: finished after {} hours".format(
            self.backend_section, spent_time))
예제 #3
0
    def __enrich_items(self):
        """Enrich the raw items of every repository of this backend section.

        Applies the optional ``scroll_size`` and ``bulk_size`` values of the
        ``general`` configuration section. It then reads the
        ``metadata__timestamp`` of the last item in the enriched index and
        strips its timezone. Finally it calls ``enrich_backend`` once per
        repository returned by
        ``TaskProjects.get_repos_by_backend_section``, passing that same
        timestamp as ``last_enrich_date`` on every call.

        :raises DataEnrichmentError: when the enrichment step of a repository
            fails; the original exception is attached as the cause
        """
        time_start = datetime.now()

        logger.info('[%s] enrichment phase starts', self.backend_section)

        cfg = self.config.get_conf()

        # Optional tuning values are set directly on ElasticItems /
        # ElasticSearch, not on an instance.
        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        # not used due to https://github.com/chaoss/grimoirelab-elk/pull/773
        github_token = None
        pair_programming = False
        node_regex = None
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
            node_regex = cfg['jenkins']['node_regex']
        only_studies = False
        only_identities = False

        # repos could change between executions because changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section,
                                                          raw=False)

        if not repos:
            logger.warning("No enrich repositories for %s",
                           self.backend_section)

        # Get the metadata__timestamp value of the last item inserted in the enriched index before
        # looping over the repos which data is stored in the same index. This is needed to make sure
        # that the incremental enrichment works for data sources that are collected globally but only
        # partially enriched.
        elastic_enrich = get_elastic(
            cfg['es_enrichment']['url'],
            cfg[self.backend_section]['enriched_index'])
        last_enrich_date = elastic_enrich.get_last_item_field(
            "metadata__timestamp")
        if last_enrich_date:
            last_enrich_date = last_enrich_date.replace(tzinfo=None)

        for repo in repos:
            # Split the projects entry into the repository and its labels
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            # First process p2o params from repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None
            jenkins_rename_file = p2o_args[
                'jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second process perceval params from repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            backend = self.get_backend(self.backend_section)
            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            logger.info('[%s] enrichment starts for %s', self.backend_section,
                        self.anonymize_url(repo))
            es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                es_col_url = self._get_collection_url()
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    self.backend_section,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    self.db_sh,
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in its own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    self.db_user,
                    self.db_password,
                    self.db_host,
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=self.db_unaffiliate_group,
                    pair_programming=pair_programming,
                    node_regex=node_regex,
                    studies_args=studies_args,
                    es_enrich_aliases=es_enrich_aliases,
                    last_enrich_date=last_enrich_date,
                    projects_json_repo=repo,
                    repo_labels=repo_labels)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s . "
                    "Using the backend_args: %s ", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                # Chain the original error so its traceback is not lost.
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section) from ex

            logger.info('[%s] enrichment finished for %s',
                        self.backend_section, self.anonymize_url(repo))

        # str(timedelta) ends in ".ffffff" when microseconds are non-zero;
        # keep only the part before the dot.
        spent_time = str(datetime.now() - time_start).split('.')[0]
        logger.info('[%s] enrichment phase finished in %s',
                    self.backend_section, spent_time)
    def __enrich_items(self):
        """Enrich the raw items of every repository of this backend section.

        Applies the optional ``scroll_size`` and ``bulk_size`` values of the
        ``general`` configuration section. It then reads the
        ``metadata__timestamp`` of the last item in the enriched index and
        drops its seconds, microseconds and timezone. Finally it calls
        ``enrich_backend`` once per repository returned by
        ``TaskProjects.get_repos_by_backend_section``, passing that same
        timestamp as ``last_enrich_date`` on every call. Start and end of the
        phase are both logged and printed.

        :raises DataEnrichmentError: when the enrichment step of a repository
            fails; the original exception is attached as the cause
        """
        time_start = time.time()

        logger.info('[%s] enrichment phase starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        # Optional tuning values are set directly on ElasticItems /
        # ElasticSearch, not on an instance.
        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        node_regex = None
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
            node_regex = cfg['jenkins']['node_regex']
        only_studies = False
        only_identities = False

        # repos could change between executions because changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section, raw=False)

        if not repos:
            logger.warning("No enrich repositories for %s", self.backend_section)

        # Get the metadata__timestamp value of the last item inserted in the enriched index before
        # looping over the repos which data is stored in the same index. This is needed to make sure
        # that the incremental enrichment works for data sources that are collected globally but only
        # partially enriched.
        elastic_enrich = get_elastic(cfg['es_enrichment']['url'], cfg[self.backend_section]['enriched_index'])
        last_enrich_date = elastic_enrich.get_last_item_field("metadata__timestamp")
        if last_enrich_date:
            last_enrich_date = last_enrich_date.replace(second=0, microsecond=0, tzinfo=None)

        for repo in repos:
            # First process p2o params from repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
            filters_raw_prefix = p2o_args['filter-raw-prefix'] if 'filter-raw-prefix' in p2o_args else None
            jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second process perceval params from repo
            backend_args = self._compose_perceval_params(self.backend_section, url)
            studies_args = None

            backend = self.get_backend(self.backend_section)
            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            logger.info('[%s] enrichment starts for %s', self.backend_section, repo)
            es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                es_col_url = self._get_collection_url()
                enrich_backend(es_col_url, self.clean, backend, backend_args,
                               self.backend_section,
                               cfg[self.backend_section]['raw_index'],
                               cfg[self.backend_section]['enriched_index'],
                               None,  # projects_db is deprecated
                               cfg['projects']['projects_file'],
                               cfg['sortinghat']['database'],
                               no_incremental, only_identities,
                               github_token,
                               False,  # studies are executed in its own Task
                               only_studies,
                               cfg['es_enrichment']['url'],
                               None,  # args.events_enrich
                               cfg['sortinghat']['user'],
                               cfg['sortinghat']['password'],
                               cfg['sortinghat']['host'],
                               None,  # args.refresh_projects,
                               None,  # args.refresh_identities,
                               author_id=None,
                               author_uuid=None,
                               filter_raw=filter_raw,
                               filters_raw_prefix=filters_raw_prefix,
                               jenkins_rename_file=jenkins_rename_file,
                               unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                               pair_programming=pair_programming,
                               node_regex=node_regex,
                               studies_args=studies_args,
                               es_enrich_aliases=es_enrich_aliases,
                               last_enrich_date=last_enrich_date)
            except Exception as ex:
                logger.error("Something went wrong producing enriched data for %s . "
                             "Using the backend_args: %s ", self.backend_section, str(backend_args))
                logger.error("Exception: %s", ex)
                # Chain the original error so its traceback is not lost.
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' + self.backend_section) from ex

            logger.info('[%s] enrichment finished for %s', self.backend_section, repo)

        # Build HH:MM:SS by hand: going through time.gmtime() would wrap the
        # hour count back to 00 for a run lasting 24 hours or more.
        elapsed = int(time.time() - time_start)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        spent_time = "%02d:%02d:%02d" % (hours, minutes, seconds)
        logger.info('[%s] enrichment phase finished in %s', self.backend_section, spent_time)
        print("Enrichment for {}: finished after {} hours".format(self.backend_section,
                                                                  spent_time))