def execute(self):
    errors = []

    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    if 'collect' in cfg[self.backend_section] and not cfg[self.backend_section]['collect']:
        logger.info('%s collect disabled', self.backend_section)
        return errors

    t2 = time.time()
    logger.info('[%s] collection phase starts', self.backend_section)
    print("Collection for {}: starting...".format(self.backend_section))

    clean = False
    fetch_archive = False
    if 'fetch-archive' in cfg[self.backend_section] and cfg[self.backend_section]['fetch-archive']:
        fetch_archive = True

    # Repos could change between executions because of changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        repo, repo_labels = self._extract_repo_labels(self.backend_section, repo)
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args.get('filter-raw')

        if filter_raw:
            # If filter-raw exists it means that there is an equivalent URL
            # in the `unknown` section of the projects.json. Thus the URL with
            # filter-raw is ignored in the collection phase, while the URL
            # in `unknown` is considered in this phase.
            logger.warning("Not collecting filter raw repository: %s", repo)
            continue

        url = p2o_args['url']
        backend_args = self._compose_perceval_params(self.backend_section, repo)
        logger.debug(backend_args)

        logger.info('[%s] collection starts for %s', self.backend_section, repo)
        es_col_url = self._get_collection_url()
        ds = self.backend_section
        backend = self.get_backend(self.backend_section)
        project = None  # just used for github in cauldron
        es_aliases = self.select_aliases(cfg, self.backend_section)

        try:
            error_msg = feed_backend(es_col_url, clean, fetch_archive, backend, backend_args,
                                     cfg[ds]['raw_index'], cfg[ds]['enriched_index'], project,
                                     es_aliases=es_aliases, projects_json_repo=repo,
                                     repo_labels=repo_labels)
            # Keep a per-repo result so callers can report partial failures;
            # error_msg holds the failure reason, if any.
            error = {'backend': backend, 'repo': repo, 'error': error_msg}
            errors.append(error)
        except Exception:
            logger.error("Something went wrong collecting data from this %s repo: %s. "
                         "Using the backend_args: %s", ds, url, str(backend_args))
            traceback.print_exc()
            raise DataCollectionError('Failed to collect data from %s' % url)

        logger.info('[%s] collection finished for %s', self.backend_section, repo)

    t3 = time.time()
    spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
    logger.info('[%s] collection phase finished in %s', self.backend_section, spent_time)
    print("Collection for {}: finished after {} (HH:MM:SS)".format(self.backend_section, spent_time))

    self.retain_data(cfg['general']['retention_time'],
                     self.conf['es_collection']['url'],
                     self.conf[self.backend_section]['raw_index'])

    return errors
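
# Usage sketch (illustrative, not part of the module above): a minimal
# driver modeled on micro-mordred's raw-collection flow. The config path
# 'setup.cfg' and the 'git' backend section are assumptions for the example;
# any mordred config with matching [git] and [es_collection] sections would do.
if __name__ == '__main__':
    from sirmordred.config import Config
    from sirmordred.task_collection import TaskRawDataCollection
    from sirmordred.task_projects import TaskProjects

    config = Config('setup.cfg')    # hypothetical path to a mordred config file
    TaskProjects(config).execute()  # load projects.json so repos can be resolved
    task = TaskRawDataCollection(config, backend_section='git')
    for result in task.execute():   # one result dict per collected repo
        if result['error']:         # 'error' carries the failure reason, if any
            print("Collection error in {}: {}".format(result['repo'], result['error']))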