def setUp(self):
    self.config = Config(CONF_FILE)
    mordred = Mordred(self.config)

    task = TaskProjects(self.config)
    self.assertEqual(task.execute(), None)

    self.backends = mordred._get_repos_by_backend()
    self.backend_tasks = [TaskRawDataCollection, TaskEnrich]
    self.stopper = threading.Event()
def test__get_projects_from_url(self):
    """Test downloading projects from a URL"""
    setup_http_server()

    projects_url = 'http://localhost/projects.json'
    config = Config(CONF_FILE)
    config.set_param('projects', 'projects_url', projects_url)
    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)

    projects = task.get_projects()
    self.assertTrue(URL_PROJECTS_MAIN in projects)
def test_run_eclipse(self):
    """Test whether the Task could be run getting projects from Eclipse"""
    setup_http_server()

    # Create an empty projects file for testing
    projects_file = 'test-projects-eclipse.json'
    config = Config(CONF_FILE)
    config.set_param('projects', 'load_eclipse', True)
    config.set_param('projects', 'projects_file', projects_file)

    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)
    self.assertEqual(len(task.get_projects().keys()), 302)

    # Let's remove some projects to track changes
    with open(ECLIPSE_PROJECTS_FILE) as eproj:
        remove_project = 'birt'
        add_project = 'new_project'
        new_projects = task.convert_from_eclipse(json.load(eproj)['projects'])
        new_projects.pop(remove_project)
        new_projects.update({add_project: {}})
        task.set_projects(new_projects)
        # Use sorted() so the lists themselves are compared
        # (list.sort() returns None)
        self.assertEqual(sorted(task.get_projects_last_diff()),
                         sorted([add_project, remove_project]))

    remove(projects_file)
def _get_repos_by_backend(self):
    #
    # return dict with backend and list of repositories
    #
    output = {}
    projects = TaskProjects.get_projects()

    for backend_section in Config.get_backend_sections():
        for pro in projects:
            backend = Task.get_backend(backend_section)
            if backend in projects[pro]:
                if backend_section not in output:
                    output[backend_section] = projects[pro][backend]
                else:
                    output[backend_section] += projects[pro][backend]

    # backend could be in project/repo file but not enabled in
    # mordred conf file
    enabled = {}
    for k in output:
        if k in self.conf:
            enabled[k] = output[k]

    # logger.debug('repos to be retrieved: %s ', enabled)

    return enabled
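# Illustrative sketch (hypothetical data, not part of the module): the projects
# dict consumed by _get_repos_by_backend() maps project names to backends and
# their repository lists; the method groups those repositories by backend
# section and keeps only the sections enabled in the mordred config.
_example_projects = {
    "grimoire": {  # hypothetical project name
        "git": ["https://github.com/example/repo.git"],
        "github": ["https://github.com/example/repo"]
    }
}
# With only the [git] section enabled in the mordred config, the expected
# result would be:
# {"git": ["https://github.com/example/repo.git"]}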
def test_run(self):
    """Test whether the Task could be run"""
    config = Config(CONF_FILE)
    cfg = config.get_conf()

    # We need to load the projects
    TaskProjects(config).execute()

    backend_section = GIT_BACKEND_SECTION
    task = TaskEnrich(config, backend_section=backend_section)
    self.assertEqual(task.execute(), None)

    # Check that the enrichment went well
    es_collection = cfg['es_collection']['url']
    es_enrichment = cfg['es_enrichment']['url']
    raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
    enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION]['enriched_index']

    r = requests.get(raw_index + "/_search?size=0")
    raw_items = r.json()['hits']['total']
    r = requests.get(enrich_index + "/_search?size=0")
    enriched_items = r.json()['hits']['total']

    # the number of raw items is bigger since the enriched items are generated based on:
    # https://github.com/VizGrimoire/GrimoireLib
    # --filters-raw-prefix data.files.file:grimoirelib_alch data.files.file:README.md
    # see [git] section in tests/test-projects.json
    self.assertGreater(raw_items, enriched_items)
def test_studies(self):
    """Test whether the studies configuration works"""
    config = Config(CONF_FILE)
    cfg = config.get_conf()

    # We need to load the projects
    TaskProjects(config).execute()

    backend_section = GIT_BACKEND_SECTION
    task = TaskEnrich(config, backend_section=backend_section)
    self.assertEqual(task.execute(), None)

    # Configure a wrong study
    cfg['git']['studies'] = ['bad_study']
    with self.assertRaises(RuntimeError):
        self.assertEqual(task.execute(), None)

    # Configure no studies
    cfg['git']['studies'] = None
    self.assertEqual(task.execute(), None)

    # Configure several studies
    cfg['git']['studies'] = ["enrich_demography", "enrich_areas_of_code"]
    self.assertEqual(task.execute(), None)

    # Configure several studies, one wrong
    cfg['git']['studies'] = ["enrich_demography", "enrich_areas_of_code1"]
    with self.assertRaises(RuntimeError):
        self.assertEqual(task.execute(), None)
def test_initialization(self):
    """Test whether attributes are initialized"""
    config = Config(CONF_FILE)
    task = TaskProjects(config)
    self.assertEqual(task.config, config)
def test_convert_from_eclipse(self):
    """Test the conversion from eclipse projects to grimoire projects"""
    setup_http_server()

    projects_file = 'test-projects-eclipse.json'
    config = Config(CONF_FILE)
    config.set_param('projects', 'load_eclipse', True)
    config.set_param('projects', 'projects_file', projects_file)

    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)

    projects = task.get_projects()
    self.assertTrue(TaskProjects.GLOBAL_PROJECT in projects)
    self.assertEqual(projects['birt']['github'][0],
                     'https://github.com/eclipse/birt')

    remove(projects_file)
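# Illustrative sketch (partly hypothetical): the grimoire projects structure
# produced by convert_from_eclipse(), as exercised by the assertions above.
# The 'birt' entry and its GitHub URL come from the test itself; the global
# project entry and any other backends or projects are indicative only.
#
#   {
#       TaskProjects.GLOBAL_PROJECT: {...},
#       "birt": {
#           "github": ["https://github.com/eclipse/birt"],
#           ...
#       },
#       ...
#   }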
def test_run(self):
    """Test whether the Task could be run"""
    config = Config(CONF_FILE)
    backend_section = GIT_BACKEND_SECTION
    task = TaskRawDataCollection(config, backend_section=backend_section)

    # We need to load the projects
    TaskProjects(config).execute()

    self.assertEqual(task.execute(), None)
def test_execute_from_archive(self):
    """Test fetching data from archives"""
    # proj_file -> 'test-projects-archive.json' stored within the conf file
    conf_file = 'archive-test.cfg'
    config = Config(conf_file)

    backend_sections = ['askbot', 'bugzilla', 'bugzillarest', 'confluence',
                        'discourse', 'dockerhub', 'gerrit', 'github',
                        'jenkins', 'jira', 'mediawiki', 'meetup', 'nntp',
                        'phabricator', 'redmine', 'rss', 'stackexchange',
                        'slack', 'telegram']

    for backend_section in backend_sections:
        task = TaskRawDataCollection(config, backend_section=backend_section)
        # We need to load the projects
        TaskProjects(config).execute()
        self.assertEqual(task.execute(), None)
def execute(self):
    cfg = self.config.get_conf()

    if 'gerrit' not in cfg or 'git' not in cfg:
        logger.error("gerrit and git are needed for track items.")
        return

    # We need to track the items in all git repositories from OPNFV
    git_repos = []
    repos_raw = TaskProjects.get_repos_by_backend_section("git")
    # git://git.opnfv.org/apex -> https://git.opnfv.org/apex/plain/UPSTREAM
    for repo in repos_raw:
        repo = repo.replace("git://", "https://")
        repo += "/plain/UPSTREAM"
        git_repos.append(repo)

    project = cfg['track_items']['project']
    elastic_url_enrich = cfg['es_enrichment']['url']

    # The raw data comes from upstream project
    elastic_url_raw = cfg['track_items']['upstream_raw_es_url']
    index_gerrit_raw = cfg['track_items']['raw_index_gerrit']
    index_git_raw = cfg['track_items']['raw_index_git']

    index_gerrit_enrich = cfg['gerrit']['enriched_index']
    index_git_enrich = cfg['git']['enriched_index']

    db_config = {
        "database": cfg['sortinghat']['database'],
        "user": cfg['sortinghat']['user'],
        "password": cfg['sortinghat']['password'],
        "host": cfg['sortinghat']['host']
    }

    logger.debug("Importing track items from %s ", git_repos)

    #
    # Gerrit Reviews
    #
    gerrit_uris = []
    for git_repo in git_repos:
        gerrit_uris += fetch_track_items(git_repo, self.ITEMS_DATA_SOURCE)
    gerrit_numbers = get_gerrit_numbers(gerrit_uris)
    logger.info("Total gerrit track items to be imported: %i", len(gerrit_numbers))
    enriched_items = enrich_gerrit_items(elastic_url_raw, index_gerrit_raw,
                                         gerrit_numbers, project, db_config)
    logger.info("Total gerrit track items enriched: %i", len(enriched_items))
    elastic = ElasticSearch(elastic_url_enrich, index_gerrit_enrich)
    total = elastic.bulk_upload(enriched_items, "uuid")

    #
    # Git Commits
    #
    commits_sha = get_commits_from_gerrit(elastic_url_raw, index_gerrit_raw,
                                          gerrit_numbers)
    logger.info("Total git track items to be checked: %i", len(commits_sha))
    enriched_items = enrich_git_items(elastic_url_raw, index_git_raw,
                                      commits_sha, project, db_config)
    logger.info("Total git track items enriched: %i", len(enriched_items))
    elastic = ElasticSearch(elastic_url_enrich, index_git_enrich)
    total = elastic.bulk_upload(enriched_items, "uuid")
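# Minimal sketch of the URL rewrite applied in execute() above: a git:// repo
# URL from the projects file becomes the HTTPS "plain/UPSTREAM" URL that
# fetch_track_items() reads. The repository URL is the one already used as an
# example in the comment above.
_repo = "git://git.opnfv.org/apex"
_tracked = _repo.replace("git://", "https://") + "/plain/UPSTREAM"
assert _tracked == "https://git.opnfv.org/apex/plain/UPSTREAM"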
def __enrich_items(self):
    time_start = time.time()

    # logger.info('%s starts for %s ', 'enrichment', self.backend_section)
    logger.info('[%s] enrichment starts', self.backend_section)
    print("Enrichment for {}: starting...".format(self.backend_section))

    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    no_incremental = False
    github_token = None
    pair_programming = False
    if 'github' in cfg and 'backend_token' in cfg['github']:
        github_token = cfg['github']['backend_token']
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']
    only_studies = False
    only_identities = False

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    for repo in repos:
        # First process p2o params from repo
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        filters_raw_prefix = p2o_args['filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None
        jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
        url = p2o_args['url']
        # Second process perceval params from repo
        backend_args = self._compose_perceval_params(self.backend_section, url)
        studies_args = None

        if 'studies' in self.conf[self.backend_section] and \
                self.conf[self.backend_section]['studies']:
            studies_args = self.__load_studies()

        try:
            es_col_url = self._get_collection_url()
            logger.debug('[%s] enrichment starts for %s', self.backend_section, repo)
            backend = self.get_backend(self.backend_section)
            enrich_backend(es_col_url, self.clean, backend, backend_args,
                           cfg[self.backend_section]['raw_index'],
                           cfg[self.backend_section]['enriched_index'],
                           None,  # projects_db is deprecated
                           cfg['projects']['projects_file'],
                           cfg['sortinghat']['database'],
                           no_incremental, only_identities,
                           github_token,
                           False,  # studies are executed in its own Task
                           only_studies,
                           cfg['es_enrichment']['url'],
                           None,  # args.events_enrich
                           cfg['sortinghat']['user'],
                           cfg['sortinghat']['password'],
                           cfg['sortinghat']['host'],
                           None,  # args.refresh_projects,
                           None,  # args.refresh_identities,
                           author_id=None, author_uuid=None,
                           filter_raw=filter_raw,
                           filters_raw_prefix=filters_raw_prefix,
                           jenkins_rename_file=jenkins_rename_file,
                           unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                           pair_programming=pair_programming,
                           studies_args=studies_args)
        except Exception as ex:
            logger.error("Something went wrong producing enriched data for %s . "
                         "Using the backend_args: %s ", self.backend_section, str(backend_args))
            logger.error("Exception: %s", ex)
            raise DataEnrichmentError('Failed to produce enriched data for ' +
                                      self.backend_section)

        # Let's try to create the aliases for the enriched index
        if not self.enrich_aliases:
            logger.debug("Creating aliases after enrich")
            task_aliases = TaskPanelsAliases(self.config)
            task_aliases.set_backend_section(self.backend_section)
            task_aliases.execute()
            logger.debug("Done creating aliases after enrich")
            self.enrich_aliases = True

    spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
    logger.info('[%s] enrichment finished in %s', self.backend_section, spent_time)
    print("Enrichment for {}: finished after {} hours".format(self.backend_section,
                                                              spent_time))
def execute(self):

    def check_arthur_task(repo, backend_args):
        """ Check if a task exists in arthur and if not, create it """
        arthur_repo_json = self.__create_arthur_json(repo, backend_args)
        logger.debug('JSON config for arthur %s',
                     json.dumps(arthur_repo_json, indent=True))

        # First check if the task already exists
        try:
            r = requests.post(self.arthur_url + "/tasks")
        except requests.exceptions.ConnectionError as ex:
            logging.error("Can not connect to %s", self.arthur_url)
            raise RuntimeError("Can not connect to " + self.arthur_url)

        task_ids = [task['task_id'] for task in r.json()['tasks']]
        new_task_ids = [task['task_id'] for task in arthur_repo_json['tasks']]
        # TODO: if a task already exists maybe we should delete and re-add it
        already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
        if len(already_tasks) > 0:
            logger.warning("Tasks not added to arthur because there are "
                           "already existing tasks %s", already_tasks)
        else:
            r = requests.post(self.arthur_url + "/add", json=arthur_repo_json)
            r.raise_for_status()
            logger.info('[%s] collection configured in arthur for %s',
                        self.backend_section, repo)

    def collect_arthur_items(repo):
        aitems = self.__feed_backend_arthur(repo)
        if not aitems:
            return
        connector = get_connector_from_name(self.backend_section)
        klass = connector[1]  # Ocean backend for the connector
        ocean_backend = klass(None)
        es_col_url = self._get_collection_url()
        es_index = self.conf[self.backend_section]['raw_index']
        clean = False
        elastic_ocean = get_elastic(es_col_url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.feed(arthur_items=aitems)

    cfg = self.config.get_conf()

    if ('collect' in cfg[self.backend_section] and
            not cfg[self.backend_section]['collect']):
        logging.info('%s collect disabled', self.backend_section)
        return

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    logger.info('Programming arthur for [%s] raw data collection', self.backend_section)
    clean = False

    fetch_archive = False
    if ('fetch-archive' in self.conf[self.backend_section] and
            self.conf[self.backend_section]['fetch-archive']):
        fetch_archive = True

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        # If the repo already exists don't try to add it to arthur
        tag = self.backend_tag(repo)
        if tag not in self.arthur_items:
            self.arthur_items[tag] = []

            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s", repo)
                continue

            backend_args = self._compose_perceval_params(self.backend_section, repo)
            logger.debug(backend_args)

            check_arthur_task(repo, backend_args)

        collect_arthur_items(repo)
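# Minimal sketch (schema partly assumed) of the payload that check_arthur_task()
# builds and posts to arthur's /add endpoint. The code above only relies on a
# top-level 'tasks' list whose entries carry a 'task_id'; any other per-task
# fields (backend name, backend arguments, scheduler settings) are produced by
# __create_arthur_json() and are not shown here.
#
#   {
#       "tasks": [
#           {
#               "task_id": "https://github.com/example/repo",  # hypothetical
#               ...
#           }
#       ]
#   }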
def test_run(self):
    """Test whether the Task could be run"""
    config = Config(CONF_FILE)
    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)
    self.assertEqual(len(task.get_projects().keys()), 1)
def execute(self):
    cfg = self.config.get_conf()

    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    if ('collect' in cfg[self.backend_section] and
            not cfg[self.backend_section]['collect']):
        logging.info('%s collect disabled', self.backend_section)
        return

    t2 = time.time()
    logger.info('[%s] raw data collection starts', self.backend_section)
    print("Collection for {}: starting...".format(self.backend_section))
    clean = False
    fetch_archive = False
    if ('fetch-archive' in cfg[self.backend_section] and
            cfg[self.backend_section]['fetch-archive']):
        fetch_archive = True

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        if filter_raw:
            # If filter-raw exists the goal is to enrich already collected
            # data, so don't collect anything
            logging.warning("Not collecting filter raw repository: %s", repo)
            continue

        url = p2o_args['url']
        backend_args = self._compose_perceval_params(self.backend_section, repo)
        logger.debug(backend_args)
        logger.debug('[%s] collection starts for %s', self.backend_section, repo)
        es_col_url = self._get_collection_url()
        ds = self.backend_section
        backend = self.get_backend(self.backend_section)
        project = None  # just used for github in cauldron

        try:
            feed_backend(es_col_url, clean, fetch_archive, backend, backend_args,
                         cfg[ds]['raw_index'], cfg[ds]['enriched_index'], project)
        except Exception:
            logger.error("Something went wrong collecting data from this %s repo: %s . "
                         "Using the backend_args: %s " % (ds, url, str(backend_args)))
            traceback.print_exc()
            raise DataCollectionError('Failed to collect data from %s' % url)

    t3 = time.time()
    spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
    logger.info('[%s] Data collection finished in %s', self.backend_section, spent_time)
    print("Collection for {}: finished after {} hours".format(self.backend_section,
                                                               spent_time))
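# Illustrative sketch (hypothetical values): the p2o params dict that drives the
# filter-raw guard in execute() above. When an entry carries 'filter-raw', the
# repository is assumed to be already collected elsewhere and only enriched, so
# the collection loop logs a warning and skips it.
_p2o_args_collected = {"url": "https://github.com/example/repo.git"}
_p2o_args_skipped = {"url": "https://github.com/example/repo.git",
                     "filter-raw": "data.files.file:README.md"}
# execute() collects the first repo and skips the second because 'filter-raw'
# is present in its params.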