# Context assumed: these are methods of sirmordred's TaskEnrich task (class
# body omitted here); the import paths below follow the usual layout of the
# sirmordred / grimoire_elk projects and may need adjusting to your tree.
import logging
import time
from datetime import datetime

from grimoire_elk.elastic import ElasticSearch
from grimoire_elk.elastic_items import ElasticItems
from grimoire_elk.elk import enrich_backend
from grimoire_elk.utils import get_elastic

from sirmordred.error import DataEnrichmentError
from sirmordred.task_panels import TaskPanelsAliases
from sirmordred.task_projects import TaskProjects

logger = logging.getLogger(__name__)


def __load_studies(self):
    studies = [study for study in self.conf[self.backend_section]['studies'] if study.strip() != ""]
    if not studies:
        logger.debug('No studies for %s', self.backend_section)
        return None

    studies_args = []

    for study in studies:
        if study not in self.conf:
            msg = 'Missing config for study %s' % study
            logger.error(msg)
            raise DataEnrichmentError(msg)

        study_params = self.conf[study]
        studies_args.append({"name": study,
                             "type": study.split(":")[0],
                             "params": study_params})

    return studies_args
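
# A minimal sketch of the configuration shape __load_studies() expects,
# assuming a setup.cfg-style file parsed into self.conf (the study names and
# parameters are illustrative, not from a real deployment):
#
#   [git]
#   studies = [enrich_demography:git, enrich_onion:git]
#
#   [enrich_demography:git]
#   date_field = grimoire_creation_date
#
# For that input the method would return:
#
#   [{"name": "enrich_demography:git", "type": "enrich_demography", "params": {...}},
#    {"name": "enrich_onion:git", "type": "enrich_onion", "params": {...}}]
#
# "type" is the study name up to the first ':', so several configured
# instances of the same study can share one implementation.
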
def __enrich_items(self):
    time_start = time.time()

    logger.info('[%s] enrichment starts', self.backend_section)
    print("Enrichment for {}: starting...".format(self.backend_section))

    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    no_incremental = False
    github_token = None
    pair_programming = False
    if 'github' in cfg and 'backend_token' in cfg['github']:
        github_token = cfg['github']['backend_token']
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']
    only_studies = False
    only_identities = False

    # repos could change between executions because of changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    for repo in repos:
        # First, process p2o params from the repo
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        filters_raw_prefix = p2o_args['filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None
        jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
        url = p2o_args['url']
        # Second, process perceval params from the repo
        backend_args = self._compose_perceval_params(self.backend_section, url)

        studies_args = None
        if 'studies' in self.conf[self.backend_section] and \
                self.conf[self.backend_section]['studies']:
            studies_args = self.__load_studies()

        try:
            es_col_url = self._get_collection_url()
            logger.debug('[%s] enrichment starts for %s', self.backend_section, repo)
            backend = self.get_backend(self.backend_section)
            enrich_backend(es_col_url, self.clean, backend, backend_args,
                           cfg[self.backend_section]['raw_index'],
                           cfg[self.backend_section]['enriched_index'],
                           None,  # projects_db is deprecated
                           cfg['projects']['projects_file'],
                           cfg['sortinghat']['database'],
                           no_incremental, only_identities,
                           github_token,
                           False,  # studies are executed in their own Task
                           only_studies,
                           cfg['es_enrichment']['url'],
                           None,  # args.events_enrich
                           cfg['sortinghat']['user'],
                           cfg['sortinghat']['password'],
                           cfg['sortinghat']['host'],
                           None,  # args.refresh_projects
                           None,  # args.refresh_identities
                           author_id=None,
                           author_uuid=None,
                           filter_raw=filter_raw,
                           filters_raw_prefix=filters_raw_prefix,
                           jenkins_rename_file=jenkins_rename_file,
                           unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                           pair_programming=pair_programming,
                           studies_args=studies_args)
        except Exception as ex:
            logger.error("Something went wrong producing enriched data for %s. "
                         "Using the backend_args: %s", self.backend_section, str(backend_args))
            logger.error("Exception: %s", ex)
            raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

    # Try to create the aliases for the enriched index
    if not self.enrich_aliases:
        logger.debug("Creating aliases after enrich")
        task_aliases = TaskPanelsAliases(self.config)
        task_aliases.set_backend_section(self.backend_section)
        task_aliases.execute()
        logger.debug("Done creating aliases after enrich")
        self.enrich_aliases = True

    spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
    logger.info('[%s] enrichment finished in %s', self.backend_section, spent_time)
    print("Enrichment for {}: finished after {} (HH:MM:SS)".format(self.backend_section, spent_time))
def __enrich_items(self):
    time_start = datetime.now()

    logger.info('[%s] enrichment phase starts', self.backend_section)

    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    no_incremental = False  # not used due to https://github.com/chaoss/grimoirelab-elk/pull/773
    github_token = None
    pair_programming = False
    node_regex = None
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']
    if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
        node_regex = cfg['jenkins']['node_regex']
    only_studies = False
    only_identities = False

    # repos could change between executions because of changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section, raw=False)
    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    # Get the metadata__timestamp value of the last item inserted in the enriched index before
    # looping over the repos whose data is stored in the same index. This is needed to make sure
    # that the incremental enrichment works for data sources that are collected globally but only
    # partially enriched.
    elastic_enrich = get_elastic(cfg['es_enrichment']['url'], cfg[self.backend_section]['enriched_index'])
    last_enrich_date = elastic_enrich.get_last_item_field("metadata__timestamp")
    if last_enrich_date:
        last_enrich_date = last_enrich_date.replace(tzinfo=None)

    for repo in repos:
        repo, repo_labels = self._extract_repo_labels(self.backend_section, repo)
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
        url = p2o_args['url']
        # Second, process perceval params from the repo
        backend_args = self._compose_perceval_params(self.backend_section, url)

        studies_args = None
        backend = self.get_backend(self.backend_section)
        if 'studies' in self.conf[self.backend_section] and \
                self.conf[self.backend_section]['studies']:
            studies_args = self.__load_studies()

        logger.info('[%s] enrichment starts for %s', self.backend_section, self.anonymize_url(repo))
        es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

        try:
            es_col_url = self._get_collection_url()
            enrich_backend(es_col_url, self.clean, backend, backend_args,
                           self.backend_section,
                           cfg[self.backend_section]['raw_index'],
                           cfg[self.backend_section]['enriched_index'],
                           None,  # projects_db is deprecated
                           cfg['projects']['projects_file'],
                           self.db_sh,
                           no_incremental, only_identities,
                           github_token,
                           False,  # studies are executed in their own Task
                           only_studies,
                           cfg['es_enrichment']['url'],
                           None,  # args.events_enrich
                           self.db_user,
                           self.db_password,
                           self.db_host,
                           None,  # args.refresh_projects
                           None,  # args.refresh_identities
                           author_id=None,
                           author_uuid=None,
                           filter_raw=filter_raw,
                           jenkins_rename_file=jenkins_rename_file,
                           unaffiliated_group=self.db_unaffiliate_group,
                           pair_programming=pair_programming,
                           node_regex=node_regex,
                           studies_args=studies_args,
                           es_enrich_aliases=es_enrich_aliases,
                           last_enrich_date=last_enrich_date,
                           projects_json_repo=repo,
                           repo_labels=repo_labels)
        except Exception as ex:
            logger.error("Something went wrong producing enriched data for %s. "
                         "Using the backend_args: %s", self.backend_section, str(backend_args))
            logger.error("Exception: %s", ex)
            raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

        logger.info('[%s] enrichment finished for %s', self.backend_section, self.anonymize_url(repo))

    spent_time = str(datetime.now() - time_start).split('.')[0]
    logger.info('[%s] enrichment phase finished in %s', self.backend_section, spent_time)
def __enrich_items(self):
    time_start = time.time()

    logger.info('[%s] enrichment phase starts', self.backend_section)
    print("Enrichment for {}: starting...".format(self.backend_section))

    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    no_incremental = False
    github_token = None
    pair_programming = False
    node_regex = None
    if 'github' in cfg and 'backend_token' in cfg['github']:
        github_token = cfg['github']['backend_token']
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']
    if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
        node_regex = cfg['jenkins']['node_regex']
    only_studies = False
    only_identities = False

    # repos could change between executions because of changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section, raw=False)
    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    # Get the metadata__timestamp value of the last item inserted in the enriched index before
    # looping over the repos whose data is stored in the same index. This is needed to make sure
    # that the incremental enrichment works for data sources that are collected globally but only
    # partially enriched.
    elastic_enrich = get_elastic(cfg['es_enrichment']['url'], cfg[self.backend_section]['enriched_index'])
    last_enrich_date = elastic_enrich.get_last_item_field("metadata__timestamp")
    if last_enrich_date:
        last_enrich_date = last_enrich_date.replace(second=0, microsecond=0, tzinfo=None)

    for repo in repos:
        # First, process p2o params from the repo
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        filters_raw_prefix = p2o_args['filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None
        jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
        url = p2o_args['url']
        # Second, process perceval params from the repo
        backend_args = self._compose_perceval_params(self.backend_section, url)

        studies_args = None
        backend = self.get_backend(self.backend_section)
        if 'studies' in self.conf[self.backend_section] and \
                self.conf[self.backend_section]['studies']:
            studies_args = self.__load_studies()

        logger.info('[%s] enrichment starts for %s', self.backend_section, repo)
        es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

        try:
            es_col_url = self._get_collection_url()
            enrich_backend(es_col_url, self.clean, backend, backend_args,
                           self.backend_section,
                           cfg[self.backend_section]['raw_index'],
                           cfg[self.backend_section]['enriched_index'],
                           None,  # projects_db is deprecated
                           cfg['projects']['projects_file'],
                           cfg['sortinghat']['database'],
                           no_incremental, only_identities,
                           github_token,
                           False,  # studies are executed in their own Task
                           only_studies,
                           cfg['es_enrichment']['url'],
                           None,  # args.events_enrich
                           cfg['sortinghat']['user'],
                           cfg['sortinghat']['password'],
                           cfg['sortinghat']['host'],
                           None,  # args.refresh_projects
                           None,  # args.refresh_identities
                           author_id=None,
                           author_uuid=None,
                           filter_raw=filter_raw,
                           filters_raw_prefix=filters_raw_prefix,
                           jenkins_rename_file=jenkins_rename_file,
                           unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                           pair_programming=pair_programming,
                           node_regex=node_regex,
                           studies_args=studies_args,
                           es_enrich_aliases=es_enrich_aliases,
                           last_enrich_date=last_enrich_date)
        except Exception as ex:
            logger.error("Something went wrong producing enriched data for %s. "
                         "Using the backend_args: %s", self.backend_section, str(backend_args))
            logger.error("Exception: %s", ex)
            raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

        logger.info('[%s] enrichment finished for %s', self.backend_section, repo)

    spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
    logger.info('[%s] enrichment phase finished in %s', self.backend_section, spent_time)
    print("Enrichment for {}: finished after {} (HH:MM:SS)".format(self.backend_section, spent_time))
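
# The second/microsecond truncation of last_enrich_date above rounds the
# incremental cutoff down to the start of the minute, re-enriching up to a
# minute of already-processed items rather than risking a gap. Stdlib-only
# illustration:
#
#   >>> from datetime import datetime, timezone
#   >>> d = datetime(2020, 5, 4, 10, 30, 59, 123456, tzinfo=timezone.utc)
#   >>> d.replace(second=0, microsecond=0, tzinfo=None)
#   datetime.datetime(2020, 5, 4, 10, 30)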