def _compose_arthur_params(self, backend_section, repo):
    # Params for the backends must be in a dictionary for arthur
    params = {}

    backend = self.get_backend(backend_section)
    connector = get_connector_from_name(backend)
    ocean = connector[1]

    # First add the params from the URL, which is backend specific
    params.update(ocean.get_arthur_params_from_url(repo))

    # Now add the backend params included in the config file
    for p in self.conf[backend_section]:
        if p in self.NO_BACKEND_FIELDS:
            # These params are not for the perceval backend
            continue
        if self.conf[backend_section][p]:
            # A '-' in the command-line param is converted to '_' in the Python variable
            p_ = p.replace("-", "_")
            if p in self.PARAMS_WITH_SPACES:
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                list_params = self.conf[backend_section][p].split()
                params[p_] = list_params
            else:
                params[p_] = self.conf[backend_section][p]

    return params
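# A hedged, illustrative sketch (not in the original module): for a git repo
# and a hypothetical 'latest-items' option in the [git] config section, the
# dict handed to arthur would look roughly like:
#
#   self._compose_arthur_params('git', 'https://github.com/grimoirelab/arthur.git')
#   # -> {'uri': 'https://github.com/grimoirelab/arthur.git', 'latest_items': True}
#
# i.e. URL-derived params first, then config options with '-' mapped to '_'.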
def _compose_perceval_params(self, backend_section, repo):
    backend = self.get_backend(backend_section)
    connector = get_connector_from_name(backend)
    ocean = connector[1]

    # First add the params from the URL, which is backend specific
    params = ocean.get_perceval_params_from_url(repo)

    # Now add the backend params included in the config file
    for p in self.conf[backend_section]:
        if p in self.NO_BACKEND_FIELDS:
            # These params are not for the perceval backend
            continue
        section_param = self.conf[backend_section][p]
        if not section_param:
            logger.warning("Empty section %s", p)
            continue
        # If the param is boolean, no value must be added
        if isinstance(section_param, bool):
            params.append("--" + p)
        elif isinstance(section_param, list):
            # '--blacklist-jobs', 'a', 'b', 'c'
            # 'a', 'b', 'c' must be added as items in the list
            params.append("--" + p)
            params += section_param
        else:
            params.append("--" + p)
            params.append(str(section_param))

    return params
def _compose_perceval_params(self, backend_section, repo):
    backend = self.get_backend(backend_section)
    connector = get_connector_from_name(backend)
    ocean = connector[1]

    # First add the params from the URL, which is backend specific
    params = ocean.get_perceval_params_from_url(repo)

    # Now add the backend params included in the config file
    for p in self.conf[backend_section]:
        if p in self.ES_INDEX_FIELDS:
            # These params are not for the perceval backend
            continue
        section_param = self.conf[backend_section][p]
        if not section_param:
            # Don't add the flag for unset or falsy params
            continue
        params.append("--" + p)
        # If the param is boolean, no value must be added
        if not isinstance(section_param, bool):
            if isinstance(section_param, list):
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                params += section_param
            else:
                params.append(section_param)

    return params
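# Illustrative sketch (assumed values, not from the original source): given a
# config section like {'no-archive': True, 'blacklist-jobs': ['a', 'b', 'c']},
# the composed perceval params form a flat argv-style list appended to the
# URL-derived params, e.g.:
#
#   ['https://example.org/repo', '--no-archive', '--blacklist-jobs', 'a', 'b', 'c']
#
# Booleans contribute only their flag; lists contribute one item per element.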
def _get_enrich_backend(self):
    db_projects_map = None
    json_projects_map = None
    clean = False

    connector = get_connector_from_name(self.get_backend(self.backend_section))

    if 'projects_file' in self.conf['projects']:
        json_projects_map = self.conf['projects']['projects_file']

    enrich_backend = connector[2](self.db_sh, db_projects_map,
                                  json_projects_map, self.db_user,
                                  self.db_password, self.db_host)
    elastic_enrich = get_elastic(self.conf['es_enrichment']['url'],
                                 self.conf[self.backend_section]['enriched_index'],
                                 clean, enrich_backend)
    enrich_backend.set_elastic(elastic_enrich)

    if 'github' in self.conf.keys() and \
       'backend_token' in self.conf['github'].keys() and \
       self.get_backend(self.backend_section) == "git":
        gh_token = self.conf['github']['backend_token']
        enrich_backend.set_github_token(gh_token)

    if 'unaffiliated_group' in self.conf['sortinghat']:
        enrich_backend.unaffiliated_group = self.conf['sortinghat']['unaffiliated_group']

    return enrich_backend
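# For reference (inferred from how connectors are indexed throughout this
# code, not from a documented spec): get_connector_from_name returns a
# sequence of related classes per data source, apparently laid out as:
#
#   connector[0]  # perceval backend class, e.g. Git
#   connector[1]  # ocean backend class, feeding the raw index
#   connector[2]  # enrich backend class, feeding the enriched index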
def compose_perceval_params(self, backend_name, repo):
    # Params that are lists separated by white space
    list_params_spaces = ['blacklist-jobs']

    connector = get_connector_from_name(backend_name)
    ocean = connector[1]

    # First add the params from the URL, which is backend specific
    params = ocean.get_perceval_params_from_url(repo)

    # Now add the backend params included in the config file
    for p in self.conf[backend_name]:
        if p in self.ES_INDEX_FIELDS:
            # These params are not for the perceval backend
            continue
        if not self.conf[backend_name][p]:
            # Don't add the flag for unset or falsy params
            continue
        params.append("--" + p)
        # If the param is boolean, no value must be added
        if not isinstance(self.conf[backend_name][p], bool):
            if p in list_params_spaces:
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                params += self.conf[backend_name][p].split()
            else:
                params.append(self.conf[backend_name][p])

    return params
def compose_p2o_params(self, backend_name, repo):
    # Get the p2o params included in the projects list
    connector = get_connector_from_name(backend_name)
    ocean = connector[1]

    # The p2o params from the URL are backend specific
    params = ocean.get_p2o_params_from_url(repo)

    return params
def _extract_repo_labels(self, backend_section, repo):
    """Extract the labels declared in the repositories within the projects.json,
    and remove them to avoid breaking already existing functionality.

    :param backend_section: name of the backend section
    :param repo: repo url in projects.json
    """
    backend = self.get_backend(backend_section)
    connector = get_connector_from_name(backend)
    ocean = connector[1]

    processed_repo, labels_lst = ocean.extract_repo_labels(repo)

    return processed_repo, labels_lst
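# Hedged example (the exact annotation syntax is defined by the ocean
# backend's extract_repo_labels, assumed here): a projects.json entry such as
#
#   'https://github.com/org/repo --labels=[core, community]'
#
# would be returned as ('https://github.com/org/repo', ['core', 'community']),
# so downstream code keeps operating on the bare repo URL.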
def collect_arthur_items(self, repo):
    aitems = self.__feed_backend_arthur(repo)
    if not aitems:
        return

    connector = get_connector_from_name(self.backend_section)
    klass = connector[1]  # Ocean backend for the connector
    ocean_backend = klass(None)

    es_col_url = self._get_collection_url()
    es_index = self.conf[self.backend_section]['raw_index']
    clean = False
    elastic_ocean = get_elastic(es_col_url, es_index, clean, ocean_backend)
    ocean_backend.set_elastic(elastic_ocean)
    ocean_backend.feed(arthur_items=aitems)
def _get_enrich_backend(self):
    db_projects_map = None
    json_projects_map = None
    clean = False

    connector = get_connector_from_name(self.get_backend(self.backend_section))

    if 'projects_file' in self.conf['projects']:
        json_projects_map = self.conf['projects']['projects_file']

    enrich_backend = connector[2](self.db_sh, db_projects_map,
                                  json_projects_map, self.db_user,
                                  self.db_password, self.db_host)
    elastic_enrich = get_elastic(self.conf['es_enrichment']['url'],
                                 self.conf[self.backend_section]['enriched_index'],
                                 clean, enrich_backend)
    enrich_backend.set_elastic(elastic_enrich)

    if self.db_unaffiliate_group:
        enrich_backend.unaffiliated_group = self.db_unaffiliate_group

    return enrich_backend
def get_enrich_backend(self):
    db_projects_map = None
    json_projects_map = None
    clean = False

    connector = get_connector_from_name(self.backend_name)
    enrich_backend = connector[2](self.db_sh, db_projects_map,
                                  json_projects_map, self.db_user,
                                  self.db_password, self.db_host)
    elastic_enrich = get_elastic(self.conf['es_enrichment'],
                                 self.conf[self.backend_name]['enriched_index'],
                                 clean, enrich_backend)
    enrich_backend.set_elastic(elastic_enrich)

    if 'github' in self.conf.keys() and \
       'backend_token' in self.conf['github'].keys() and \
       self.backend_name == "git":
        gh_token = self.conf['github']['backend_token']
        enrich_backend.set_github_token(gh_token)

    return enrich_backend
def __create_arthur_json(self, repo, backend_args):
    """ Create the JSON for configuring arthur to collect data

    https://github.com/grimoirelab/arthur#adding-tasks
    Sample for git:

    {
    "tasks": [
        {
            "task_id": "arthur.git",
            "backend": "git",
            "backend_args": {
                "gitpath": "/tmp/arthur_git/",
                "uri": "https://github.com/grimoirelab/arthur.git"
            },
            "category": "commit",
            "archive_args": {
                "archive_path": "/tmp/test_archives",
                "fetch_from_archive": false,
                "archive_after": null
            },
            "scheduler_args": {
                "delay": 10
            }
        }
    ]
    }
    """

    backend_args = self._compose_arthur_params(self.backend_section, repo)
    if self.backend_section == 'git':
        backend_args['gitpath'] = os.path.join(self.REPOSITORY_DIR, repo)
    backend_args['tag'] = self.backend_tag(repo)

    ajson = {"tasks": [{}]}
    # This is the perceval tag
    ajson["tasks"][0]['task_id'] = self.backend_tag(repo)
    ajson["tasks"][0]['backend'] = self.backend_section.split(":")[0]
    ajson["tasks"][0]['backend_args'] = backend_args
    ajson["tasks"][0]['category'] = backend_args['category']
    ajson["tasks"][0]['archive'] = {}
    ajson["tasks"][0]['scheduler'] = {"delay": self.ARTHUR_TASK_DELAY}

    # from-date or offset param must be added
    es_col_url = self._get_collection_url()
    es_index = self.conf[self.backend_section]['raw_index']
    # Get the last activity for the data source
    es = ElasticSearch(es_col_url, es_index)
    connector = get_connector_from_name(self.backend_section)
    klass = connector[0]  # Backend for the connector
    signature = inspect.signature(klass.fetch)

    last_activity = None
    filter_ = {"name": "tag", "value": backend_args['tag']}
    if 'from_date' in signature.parameters:
        last_activity = es.get_last_item_field('metadata__updated_on', [filter_])
        if last_activity:
            ajson["tasks"][0]['backend_args']['from_date'] = last_activity.isoformat()
    elif 'offset' in signature.parameters:
        last_activity = es.get_last_item_field('offset', [filter_])
        if last_activity:
            ajson["tasks"][0]['backend_args']['offset'] = last_activity

    if last_activity:
        logging.info("Getting raw item with arthur since %s", last_activity)

    return ajson
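# A note on the signature inspection in __create_arthur_json (hedged reading
# of the code above, not a perceval spec): backends are incremental in one of
# two styles. Time-based backends expose a 'from_date' parameter in fetch(),
# so collection resumes from the newest 'metadata__updated_on' in the raw
# index; sequence-based backends expose 'offset' instead and resume from the
# last stored offset. The minimal check used is:
#
#   'from_date' in inspect.signature(klass.fetch).parameters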
if __name__ == '__main__':
    app_init = datetime.now()

    args = get_params()

    config_logging(args.debug)

    if args.index is None:
        # Extract identities from all indexes
        pass
    else:
        logging.info("Extracting identities from: %s", args.index)
        perceval_params = get_perceval_params(args.elastic_url, args.index)
        backend_name = perceval_params['backend']
        connector = get_connector_from_name(backend_name)
        perceval_backend_class = connector[0]
        ocean_backend_class = connector[1]
        perceval_backend = perceval_backend_class(**perceval_params)
        obackend = ocean_backend_class(perceval_backend, incremental=False)
        obackend.set_elastic(get_elastic(args.elastic_url, args.index))

        identities = get_identities(obackend)
        # Add the identities to Sorting Hat
        SortingHat.add_identities(identities, backend_name)

        print("Total identities processed: %i" % len(identities))
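# Hypothetical invocation (option names assumed from the attributes read off
# get_params() above, not taken from the real CLI definition):
#
#   python3 script.py --elastic_url http://localhost:9200 --index git_raw --debug
#
# which extracts the identities found in the 'git_raw' raw index and loads
# them into SortingHat.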
def execute(self):
    cfg = self.config.get_conf()

    if 'collect' in cfg[self.backend_section] and not cfg[self.backend_section]['collect']:
        logging.info('%s collect disabled', self.backend_section)
        return

    logger.info('Programming arthur for [%s] raw data collection', self.backend_section)
    clean = False

    fetch_cache = False
    if ('fetch-cache' in self.conf[self.backend_section] and
            self.conf[self.backend_section]['fetch-cache']):
        fetch_cache = True

    # repos could change between executions because of changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args.get('filter-raw')
        if filter_raw:
            # If filter-raw exists the goal is to enrich already collected
            # data, so don't collect anything
            logging.warning("Not collecting filter raw repository: %s", repo)
            continue

        url = p2o_args['url']
        backend_args = self._compose_perceval_params(self.backend_section, repo)
        logger.debug(backend_args)

        arthur_repo_json = self.__create_arthur_json(repo, backend_args)
        logger.debug('JSON config for arthur %s',
                     json.dumps(arthur_repo_json, indent=True))

        # First check if the task already exists
        try:
            r = requests.post(self.arthur_url + "/tasks")
        except requests.exceptions.ConnectionError:
            logging.error("Can not connect to %s", self.arthur_url)
            return

        task_ids = [task['task_id'] for task in r.json()['tasks']]
        new_task_ids = [task['task_id'] for task in arthur_repo_json['tasks']]
        # TODO: if a task already exists maybe we should delete and re-add it
        already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
        if len(already_tasks) > 0:
            logger.warning("Tasks not added to arthur because they already exist: %s",
                           already_tasks)
        else:
            r = requests.post(self.arthur_url + "/add", json=arthur_repo_json)
            r.raise_for_status()
            logger.info('[%s] collection configured in arthur for %s',
                        self.backend_section, repo)

        # Try to collect existing items from Redis
        aitems = self.__feed_backend_arthur(repo)

        connector = get_connector_from_name(self.backend_section)
        klass = connector[1]  # Ocean backend for the connector
        ocean_backend = klass(None)
        es_col_url = self._get_collection_url()
        es_index = self.conf[self.backend_section]['raw_index']
        clean = False
        elastic_ocean = get_elastic(es_col_url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.feed(arthur_items=aitems)
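# For context (inferred from the requests calls above rather than from the
# arthur documentation): execute() assumes arthur's REST API answers
# POST /tasks with a body shaped like {"tasks": [{"task_id": ...}, ...]} and
# registers new tasks via POST /add, taking the JSON built by
# __create_arthur_json.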