def setUp(self):
    """Build a SirMordred instance, load the projects and keep the
    backends and task classes around for the tests."""
    self.config = Config(CONF_FILE)
    sirmordred = SirMordred(self.config)

    projects_task = TaskProjects(self.config)
    self.assertEqual(projects_task.execute(), None)

    self.backends = sirmordred._get_repos_by_backend()
    self.backend_tasks = [TaskRawDataCollection, TaskEnrich]
    self.stopper = threading.Event()
def test__get_projects_from_url(self):
    """Test downloading projects from an URL"""
    setup_http_server()

    cfg = Config(CONF_FILE)
    cfg.set_param('projects', 'projects_url', 'http://localhost/projects.json')

    projects_task = TaskProjects(cfg)
    self.assertIsNone(projects_task.execute())

    loaded = projects_task.get_projects()
    self.assertIn(URL_PROJECTS_MAIN, loaded)
def test_run_eclipse(self):
    """Test whether the Task could be run getting projects from Eclipse"""
    setup_http_server()

    # Create an empty projects file for testing
    projects_file = 'test-projects-eclipse.json'
    config = Config(CONF_FILE)
    config.set_param('projects', 'load_eclipse', True)
    config.set_param('projects', 'projects_file', projects_file)

    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)
    self.assertEqual(len(task.get_projects().keys()), 302)

    # Let's remove some projects to track changes
    with open(ECLIPSE_PROJECTS_FILE) as eproj:
        remove_project = 'birt'
        add_project = 'new_project'
        new_projects = task.convert_from_eclipse(json.load(eproj)['projects'])
        new_projects.pop(remove_project)
        new_projects.update({add_project: {}})
        task.set_projects(new_projects)
        # BUG FIX: list.sort() returns None, so the original
        # assertEqual(a.sort(), b.sort()) compared None == None and could
        # never fail. Compare the sorted lists instead.
        self.assertEqual(sorted(task.get_projects_last_diff()),
                         sorted([add_project, remove_project]))

    remove(projects_file)
def test_run(self):
    """Test whether the Task could be run"""
    config = Config(CONF_FILE)
    conf = config.get_conf()

    # The projects must be loaded before enriching
    TaskProjects(config).execute()

    enrich_task = TaskEnrich(config, backend_section=GIT_BACKEND_SECTION)
    self.assertIsNone(enrich_task.execute())

    # Check that the enrichment went well by comparing item counts
    raw_index = "{}/{}".format(conf['es_collection']['url'],
                               conf[GIT_BACKEND_SECTION]['raw_index'])
    enrich_index = "{}/{}".format(conf['es_enrichment']['url'],
                                  conf[GIT_BACKEND_SECTION]['enriched_index'])

    raw_items = requests.get(raw_index + "/_search?size=0").json()['hits']['total']
    enriched_items = requests.get(enrich_index + "/_search?size=0").json()['hits']['total']

    # the number of raw items is bigger since the enriched items are generated based on:
    # https://github.com/VizGrimoire/GrimoireLib
    # --filters-raw-prefix data.files.file:grimoirelib_alch data.files.file:README.md
    # see [git] section in tests/test-projects.json
    self.assertGreater(raw_items, enriched_items)
def _get_repos_by_backend(self):
    """Return a dict mapping each enabled backend section to the list of
    repositories configured for it across all projects.

    Only sections that also appear in ``self.conf`` are returned: a
    backend may be listed in the projects file yet disabled in the
    sirmordred configuration.
    """
    output = {}
    projects = TaskProjects.get_projects()

    for pro in projects:
        # remove duplicates in backends_section with a set comprehension
        backend_sections = sorted({
            sect
            for sect in projects[pro].keys()
            for backend_section in Config.get_backend_sections()
            if sect and sect.startswith(backend_section)
        })

        for backend_section in backend_sections:
            # BUG FIX: the original assigned projects[pro][backend_section]
            # (the list object itself) on first sight and then extended it
            # in place with +=, mutating the shared projects data. Always
            # accumulate into a list owned by `output`.
            output.setdefault(backend_section, [])
            output[backend_section] += projects[pro][backend_section]

    # backend could be in project/repo file but not enabled in
    # sirmordred conf file
    enabled = {k: v for k, v in output.items() if k in self.conf}

    # logger.debug('repos to be retrieved: %s ', enabled)
    return enabled
def test_execute_from_archive(self):
    """Test fetching data from archives"""
    # proj_file -> 'test-projects-archive.json' stored within the conf file
    config = Config('archives-test.cfg')
    sections = [
        'askbot', 'bugzilla', 'bugzillarest', 'confluence', 'discourse',
        'dockerhub', 'gerrit', 'github:issue', 'github:pull',
        'gitlab:issue', 'gitlab:merge', 'google_hits', 'jenkins', 'jira',
        'mediawiki', 'meetup', 'mozillaclub', 'nntp', 'phabricator',
        'redmine', 'remo', 'rss', 'stackexchange', 'slack', 'telegram',
        'twitter'
    ]

    # The projects must be loaded first
    TaskProjects(config).execute()

    # Collect the raw data from the archives for every section
    for section in sections:
        TaskRawDataCollection(config, backend_section=section).execute()

    # Then enrich every section and check it succeeds
    for section in sections:
        enrich_task = TaskEnrich(config, backend_section=section)
        self.assertIsNone(enrich_task.execute())
def _get_repos_by_backend(self):
    """Return a dict mapping each enabled backend section to the list of
    repositories configured for it across all projects.

    Only sections present in ``self.conf`` are kept: a backend may be in
    the project/repo file yet disabled in the sirmordred configuration.
    """
    output = {}
    projects = TaskProjects.get_projects()

    for backend_section in Config.get_backend_sections():
        # Hoisted out of the inner loop: the backend name only depends on
        # the section, not on the project.
        backend = Task.get_backend(backend_section)
        for pro in projects:
            if backend in projects[pro]:
                # BUG FIX: the original assigned projects[pro][backend]
                # (the list object itself) on first sight and then extended
                # it in place with +=, mutating the shared projects data.
                # Always accumulate into a list owned by `output`.
                output.setdefault(backend_section, [])
                output[backend_section] += projects[pro][backend]

    # backend could be in project/repo file but not enabled in
    # sirmordred conf file
    enabled = {k: v for k, v in output.items() if k in self.conf}

    # logger.debug('repos to be retrieved: %s ', enabled)
    return enabled
def test_initialization(self):
    """Test whether attributes are initialized"""
    cfg = Config(CONF_FILE)
    projects_task = TaskProjects(cfg)
    self.assertEqual(projects_task.config, cfg)
def test_run_eclipse(self):
    """Test whether the Task could be run getting projects from Eclipse"""
    setup_http_server()

    # Use a temporary projects file for the Eclipse download
    eclipse_file = 'test-projects-eclipse.json'
    cfg = Config(CONF_FILE)
    cfg.set_param('projects', 'load_eclipse', True)
    cfg.set_param('projects', 'projects_file', eclipse_file)

    eclipse_task = TaskProjects(cfg)
    self.assertIsNone(eclipse_task.execute())
    self.assertEqual(302, len(eclipse_task.get_projects().keys()))

    remove(eclipse_file)
def test_execute(self):
    """Test whether the Task could be run"""
    cfg = Config(CONF_FILE)
    collection_task = TaskRawDataCollection(cfg, backend_section=GIT_BACKEND_SECTION)

    # The projects must be loaded before collecting
    TaskProjects(cfg).execute()

    self.assertIsNone(collection_task.execute())
def get_identities_load(config):
    """Execute the load identities phase

    :param config: a Mordred config object
    """
    TaskProjects(config).execute()
    TaskIdentitiesLoad(config).execute()
    logging.info("Loading identities finished!")
def test_convert_from_eclipse(self):
    """Test the conversion from eclipse projects to grimoire projects"""
    setup_http_server()

    eclipse_file = 'test-projects-eclipse.json'
    cfg = Config(CONF_FILE)
    cfg.set_param('projects', 'load_eclipse', True)
    cfg.set_param('projects', 'projects_file', eclipse_file)

    task = TaskProjects(cfg)
    self.assertIsNone(task.execute())

    converted = task.get_projects()
    self.assertIn(TaskProjects.GLOBAL_PROJECT, converted)
    self.assertEqual('https://github.com/eclipse/birt',
                     converted['birt']['github'][0])

    remove(eclipse_file)
def get_identities_merge(config):
    """Execute the merge identities phase

    :param config: a Mordred config object
    """
    TaskProjects(config).execute()
    TaskIdentitiesMerge(config).execute()
    logging.info("Merging identities finished!")
def get_enrich(config, backend_section):
    """Execute the enrich phase for a given backend section

    :param config: a Mordred config object
    :param backend_section: the backend section where the enrich phase is executed
    """
    TaskProjects(config).execute()
    TaskEnrich(config, backend_section=backend_section).execute()
    logging.info("Loading enriched data finished!")
def test_get_repos_by_backend_sections_unknown(self):
    """Test whether the repos of each section are properly loaded
    when the unknown section is present"""
    config = Config(CONF_FILE_UNKNOWN)
    task = TaskProjects(config)
    self.assertIsNone(task.execute())

    # Repos present only outside the `unknown` section
    perceval = ["https://github.com/chaoss/grimoirelab-perceval"]
    self.assertListEqual(task.get_repos_by_backend_section("git"), perceval)
    self.assertListEqual(task.get_repos_by_backend_section("git", raw=False),
                         perceval)

    # Repos present only in the `unknown` section
    bugzilla = ["https://bugzilla.mozilla.org"]
    self.assertListEqual(task.get_repos_by_backend_section("bugzillarest"),
                         bugzilla)
    self.assertListEqual(
        task.get_repos_by_backend_section("bugzillarest", raw=False),
        bugzilla)

    # Repos present both in `unknown` and in another section
    self.assertListEqual(task.get_repos_by_backend_section("gerrit:onos"),
                         ["gerrit.onosproject.org"])

    filtered = [
        "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTest",
        "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTestJenkins",
        "gerrit.onosproject.org --filter-raw=data.project:cord-openwrt",
        "gerrit.onosproject.org --filter-raw=data.project:fabric-control",
        "gerrit.onosproject.org --filter-raw=data.project:manifest"
    ]
    onos = task.get_repos_by_backend_section("gerrit:onos", raw=False)
    self.assertListEqual(sorted(onos), sorted(filtered))
def get_raw(config, backend_section):
    """Execute the raw phase for a given backend section

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    """
    collection = TaskRawDataCollection(config, backend_section=backend_section)
    TaskProjects(config).execute()
    try:
        collection.execute()
        logging.info("Loading raw data finished!")
    except Exception as exc:
        # Any failure during collection aborts the whole run
        logging.error(str(exc))
        sys.exit(-1)
def get_raw(config, backend_section, arthur):
    """Execute the raw phase for a given backend section, optionally using Arthur

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    :param arthur: if true, it enables Arthur to collect the raw data
    """
    # Pick the collection task class depending on whether Arthur is used
    task_class = TaskRawDataArthurCollection if arthur else TaskRawDataCollection
    collection = task_class(config, backend_section=backend_section)

    TaskProjects(config).execute()
    collection.execute()
    logging.info("Loading raw data finished!")
def test_execute_no_collection(self):
    """Test whether the raw data is not downloaded when --filter-no-collection is true"""
    config = Config(CONF_FILE_NO_COLL)
    conf = config.get_conf()
    collection_task = TaskRawDataCollection(config,
                                            backend_section=GIT_BACKEND_SECTION)

    # The projects must be loaded before collecting
    TaskProjects(config).execute()
    self.assertIsNotNone(collection_task.execute())

    # Check that the filter --filter-no-collection works
    raw_index = "{}/{}".format(conf['es_collection']['url'],
                               conf[GIT_BACKEND_SECTION]['raw_index'])
    response = requests.get(raw_index + "/_search?size=0", verify=False)
    self.assertEqual(response.json()['hits']['total'], 40)
def test_execute(self):
    """Test whether the Task could be run"""
    config = Config(CONF_FILE)
    conf = config.get_conf()
    collection_task = TaskRawDataCollection(config,
                                            backend_section=GIT_BACKEND_SECTION)

    # The projects must be loaded before collecting
    TaskProjects(config).execute()
    self.assertIsNotNone(collection_task.execute())

    # Check that the collection went well
    raw_index = "{}/{}".format(conf['es_collection']['url'],
                               conf[GIT_BACKEND_SECTION]['raw_index'])
    response = requests.get(raw_index + "/_search?size=0", verify=False)
    self.assertEqual(response.json()['hits']['total'], 3603)
def get_raw(config, backend_section, repos_to_check=None):
    """Execute the raw phase for a given backend section

    Repos are only checked if they are in BOTH `repos_to_check` and the
    `projects.json`

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    :param repos_to_check: A list of repo URLs to check, or None to check all repos
    """
    collection = TaskRawDataCollection(config,
                                       backend_section=backend_section,
                                       allowed_repos=repos_to_check)
    TaskProjects(config).execute()
    try:
        collection.execute()
        logging.info("Loading raw data finished!")
    except Exception as exc:
        # Any failure during collection aborts the whole run
        logging.error(str(exc))
        sys.exit(-1)
def test_execute(self):
    """Test whether the Task could be run"""
    # NOTE(review): this nested ``setUp`` is defined but never called, so the
    # SortingHat database bootstrap below is dead code. It looks like a
    # class-level setUp that was accidentally merged into this test method —
    # confirm and move it to the TestCase level.
    def setUp(self):
        config = Config(CONF_FILE)
        sh = config.get_conf()['sortinghat']
        self.sh_kwargs = {
            'user': sh['user'],
            'password': sh['password'],
            'database': sh['database'],
            'host': sh['host'],
            'port': None
        }
        # Clean the database to start an empty state
        Database.drop(**self.sh_kwargs)
        # Create command
        Database.create(**self.sh_kwargs)
        self.sh_db = Database(**self.sh_kwargs)

    config = Config(CONF_FILE)
    cfg = config.get_conf()

    # We need to load the projects
    TaskProjects(config).execute()

    backend_section = GIT_BACKEND_SECTION
    task = TaskEnrich(config, backend_section=backend_section)
    self.assertEqual(task.execute(), None)

    # Check that the enrichment went well
    es_collection = cfg['es_collection']['url']
    es_enrichment = cfg['es_enrichment']['url']
    raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
    enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION][
        'enriched_index']

    r = requests.get(raw_index + "/_search?size=0", verify=False)
    raw_items = r.json()['hits']['total']
    r = requests.get(enrich_index + "/_search?size=0", verify=False)
    enriched_items = r.json()['hits']['total']
    # With this configuration every raw item maps to one enriched item
    self.assertEqual(raw_items, enriched_items)
def test_studies(self):
    """Test whether the studies configuration works """
    config = Config(CONF_FILE)
    cfg = config.get_conf()

    # The projects must be loaded before enriching
    TaskProjects(config).execute()
    task = TaskEnrich(config, backend_section=GIT_BACKEND_SECTION)

    # No studies configured (None)
    cfg.set_param('git', 'studies', None)
    self.assertIsNone(task.execute())

    # No studies configured (empty list)
    cfg.set_param('git', 'studies', [])
    self.assertIsNone(task.execute())

    # An unknown study must raise
    cfg.set_param('git', 'studies', ['bad_study'])
    with self.assertRaises(DataEnrichmentError):
        self.assertIsNone(task.execute())

    # One valid study
    cfg.set_param('git', 'studies', ['enrich_onion'])
    self.assertIsNone(task.execute())

    # Several valid studies
    cfg.set_param('git', 'studies', ['enrich_demography:1', 'enrich_areas_of_code'])
    self.assertIsNone(task.execute())

    # The kafka kip study on mbox
    cfg.set_param('mbox', 'studies', ['kafka_kip'])
    self.assertIsNone(task.execute())

    # Several studies, one of them unknown: must raise
    cfg.set_param('git', 'studies', ['enrich_demography:1', "enrich_areas_of_code1"])
    with self.assertRaises(DataEnrichmentError):
        self.assertIsNone(task.execute())
def test_execute_no_sh(self):
    """Test whether the Task could be run without SortingHat"""
    config = Config(CONF_FILE_NO_SH)
    conf = config.get_conf()

    # The projects must be loaded before enriching
    TaskProjects(config).execute()

    enrich_task = TaskEnrich(config, backend_section=GIT_BACKEND_SECTION)
    self.assertIsNone(enrich_task.execute())

    # Without SortingHat every raw item must map to one enriched item
    raw_index = "{}/{}".format(conf['es_collection']['url'],
                               conf[GIT_BACKEND_SECTION]['raw_index'])
    enrich_index = "{}/{}".format(conf['es_enrichment']['url'],
                                  conf[GIT_BACKEND_SECTION]['enriched_index'])

    raw_total = requests.get(raw_index + "/_search?size=0",
                             verify=False).json()['hits']['total']
    enriched_total = requests.get(enrich_index + "/_search?size=0",
                                  verify=False).json()['hits']['total']
    self.assertEqual(raw_total, enriched_total)
def execute(self):
    """Run the raw data collection phase for ``self.backend_section``.

    For every repository configured for the section, items are fetched
    through p2o's ``feed_backend`` and stored in the section's raw index.
    Raises ``DataCollectionError`` if any repository fails.
    """
    cfg = self.config.get_conf()

    # Global Elasticsearch tuning knobs from the [general] section
    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    # The section may explicitly disable collection
    if ('collect' in cfg[self.backend_section] and
            not cfg[self.backend_section]['collect']):
        logging.info('%s collect disabled', self.backend_section)
        return

    t2 = time.time()
    logger.info('[%s] raw data collection starts', self.backend_section)
    print("Collection for {}: starting...".format(self.backend_section))
    clean = False
    fetch_archive = False
    if ('fetch-archive' in cfg[self.backend_section] and
            cfg[self.backend_section]['fetch-archive']):
        fetch_archive = True

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args[
            'filter-raw'] if 'filter-raw' in p2o_args else None

        if filter_raw:
            # If filter-raw exists the goal is to enrich already collected
            # data, so don't collect anything
            logging.warning("Not collecting filter raw repository: %s", repo)
            continue

        url = p2o_args['url']
        backend_args = self._compose_perceval_params(
            self.backend_section, repo)

        logger.debug(backend_args)
        logger.debug('[%s] collection starts for %s',
                     self.backend_section, repo)
        es_col_url = self._get_collection_url()
        ds = self.backend_section
        backend = self.get_backend(self.backend_section)
        project = None  # just used for github in cauldron
        try:
            feed_backend(es_col_url, clean, fetch_archive, backend,
                         backend_args, cfg[ds]['raw_index'],
                         cfg[ds]['enriched_index'], project)
        except Exception:
            logger.error(
                "Something went wrong collecting data from this %s repo: %s . "
                "Using the backend_args: %s " % (ds, url, str(backend_args)))
            traceback.print_exc()
            raise DataCollectionError('Failed to collect data from %s' % url)

    t3 = time.time()
    spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
    logger.info('[%s] Data collection finished in %s',
                self.backend_section, spent_time)
    print("Collection for {}: finished after {} hours".format(
        self.backend_section, spent_time))
def test_run(self):
    """Test whether the Task could be run"""
    projects_task = TaskProjects(Config(CONF_FILE))
    self.assertIsNone(projects_task.execute())
    self.assertEqual(1, len(projects_task.get_projects().keys()))
def test_get_repos_by_backend_section(self):
    """Test whether the repos of each section are properly loaded"""
    config = Config(CONF_FILE)
    task = TaskProjects(config)
    self.assertEqual(task.execute(), None)

    # Deduplicated, sorted list of the config sections that match a
    # known backend section prefix
    backend_sections = sorted({
        sect for sect in config.conf.keys()
        for known in Config.get_backend_sections()
        if sect and sect.startswith(known)
    })

    # Expected (section name, repositories, order_insensitive) per position
    expected = [
        ('askbot', ['https://ask.puppet.com'], False),
        ('bugzilla', ['https://bugs.eclipse.org/bugs/'], False),
        ('bugzillarest', ['https://bugzilla.mozilla.org'], False),
        ('confluence', ['https://wiki.open-o.org/'], False),
        ('discourse', ['https://foro.mozilla-hispano.org/'], False),
        ('dockerhub', ['bitergia kibiter'], False),
        ('functest', ['http://testresults.opnfv.org/test/'], False),
        ('gerrit', ['review.openstack.org'], False),
        ('git',
         ["https://github.com/VizGrimoire/GrimoireLib "
          "--filter-raw-prefix=data.files.file:grimoirelib_alch,data.files.file:README.md",
          "https://github.com/MetricsGrimoire/CMetrics"],
         True),
        ('github', ['https://github.com/grimoirelab/perceval'], False),
        ('github:pull', ['https://github.com/grimoirelab/perceval'], False),
        ('gitlab', ['https://gitlab.com/inkscape/inkscape-web'], False),
        ('google_hits', ['bitergia grimoirelab'], False),
        ('hyperkitty',
         ['https://lists.mailman3.org/archives/list/[email protected]'],
         False),
        ('jenkins', ['https://build.opnfv.org/ci'], False),
        ('jira', ['https://jira.opnfv.org'], False),
        ('mattermost',
         ['https://chat.openshift.io 8j366ft5affy3p36987pcugaoa'], False),
        ('mattermost:group1',
         ['https://chat.openshift.io 8j366ft5affy3p36987cip'], False),
        ('mattermost:group2',
         ['https://chat.openshift.io 8j366ft5affy3p36987ciop'], False),
        ('mbox', ['metrics-grimoire ~/.perceval/mbox'], False),
        ('mediawiki', ['https://wiki.mozilla.org'], False),
        ('meetup', ['South-East-Puppet-User-Group'], False),
        ('mozillaclub',
         ['https://spreadsheets.google.com/feeds/cells/'
          '1QHl2bjBhMslyFzR5XXPzMLdzzx7oeSKTbgR5PM8qp64/ohaibtm/public/values?alt=json'],
         False),
        ('nntp', ['news.mozilla.org mozilla.dev.project-link'], False),
        ('phabricator', ['https://phabricator.wikimedia.org'], False),
        ('pipermail',
         ['https://mail.gnome.org/archives/libart-hackers/'], False),
        ('puppetforge', [''], False),
        ('redmine', ['http://tracker.ceph.com/'], False),
        ('remo', ['https://reps.mozilla.org'], False),
        ('remo:activities', ['https://reps.mozilla.org'], False),
        ('rss', ['https://blog.bitergia.com/feed/'], False),
        ('slack', ['C7LSGB0AU'], False),
        ('stackexchange',
         ["https://stackoverflow.com/questions/tagged/ovirt",
          "https://stackoverflow.com/questions/tagged/rdo",
          "https://stackoverflow.com/questions/tagged/kibana"],
         True),
        ('supybot',
         ['openshift ~/.perceval/irc/percevalbot/logs/ChannelLogger/freenode/#openshift/'],
         False),
        ('telegram', ['Mozilla_analytics'], False),
        ('twitter', ['bitergia'], False),
    ]

    for position, (section, expected_repos, unordered) in enumerate(expected):
        backend = backend_sections[position]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, section)
        if unordered:
            # Repo order is not guaranteed for these sections
            self.assertEqual(sorted(repos), sorted(expected_repos))
        else:
            self.assertEqual(repos, expected_repos)
def execute(self):
    """Run the raw data collection phase for ``self.backend_section``.

    Collects items for every repository of the section through p2o's
    ``feed_backend``, applies the configured retention policy and
    returns the per-repository errors reported by ``feed_backend``.

    :return: a list of dicts {'backend': ..., 'repo': ..., 'error': ...}
    :raises DataCollectionError: if collecting a repository fails
    """
    errors = []

    cfg = self.config.get_conf()

    # Global Elasticsearch tuning knobs from the [general] section
    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    # The section may explicitly disable collection
    if 'collect' in cfg[self.backend_section] and not cfg[
            self.backend_section]['collect']:
        logging.info('%s collect disabled', self.backend_section)
        return errors

    t2 = time.time()
    logger.info('[%s] collection phase starts', self.backend_section)
    print("Collection for {}: starting...".format(self.backend_section))
    clean = False
    fetch_archive = False
    if 'fetch-archive' in cfg[self.backend_section] and cfg[
            self.backend_section]['fetch-archive']:
        fetch_archive = True

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        # Labels attached to the repo entry are split off the URL
        repo, repo_labels = self._extract_repo_labels(
            self.backend_section, repo)
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args[
            'filter-raw'] if 'filter-raw' in p2o_args else None

        if filter_raw:
            # If filter-raw exists it means that there is an equivalent URL
            # in the `unknown` section of the projects.json. Thus the URL with
            # filter-raw is ignored in the collection phase, while the URL
            # in `unknown` is considered in this phase.
            logging.warning("Not collecting filter raw repository: %s", repo)
            continue

        url = p2o_args['url']
        backend_args = self._compose_perceval_params(
            self.backend_section, repo)

        logger.debug(backend_args)
        logger.info('[%s] collection starts for %s',
                    self.backend_section, repo)
        es_col_url = self._get_collection_url()
        ds = self.backend_section
        backend = self.get_backend(self.backend_section)
        project = None  # just used for github in cauldron
        es_aliases = self.select_aliases(cfg, self.backend_section)
        try:
            error_msg = feed_backend(es_col_url, clean, fetch_archive,
                                     backend, backend_args,
                                     cfg[ds]['raw_index'],
                                     cfg[ds]['enriched_index'],
                                     project, es_aliases=es_aliases,
                                     projects_json_repo=repo,
                                     repo_labels=repo_labels)
            # NOTE(review): an entry is appended even when error_msg is
            # empty/None — presumably the caller filters these out; confirm.
            error = {'backend': backend, 'repo': repo, 'error': error_msg}
            errors.append(error)
        except Exception:
            logger.error(
                "Something went wrong collecting data from this %s repo: %s . "
                "Using the backend_args: %s " % (ds, url, str(backend_args)))
            traceback.print_exc()
            raise DataCollectionError('Failed to collect data from %s' % url)

        logger.info('[%s] collection finished for %s',
                    self.backend_section, repo)

    t3 = time.time()
    spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
    logger.info('[%s] collection phase finished in %s',
                self.backend_section, spent_time)
    print("Collection for {}: finished after {} hours".format(
        self.backend_section, spent_time))

    # Drop raw documents older than the configured retention time
    self.retain_data(cfg['general']['retention_time'],
                     self.conf['es_collection']['url'],
                     self.conf[self.backend_section]['raw_index'])

    return errors
def execute(self):
    """Program Arthur to collect raw data for ``self.backend_section``.

    For every repository of the section a collection task is registered
    in the Arthur scheduler (unless an equivalent task already exists),
    and any items Arthur has already produced are fed into the raw index.
    """
    def check_arthur_task(repo, backend_args):
        """ Check if a task exists in arthur and if not, create it """
        arthur_repo_json = self.__create_arthur_json(repo, backend_args)
        logger.debug('JSON config for arthur %s',
                     json.dumps(arthur_repo_json, indent=True))

        # First check is the task already exists
        # NOTE(review): a POST to /tasks is used here to *list* existing
        # tasks — looks like it should be a GET; confirm against the
        # Arthur REST API.
        try:
            r = requests.post(self.arthur_url + "/tasks")
        except requests.exceptions.ConnectionError as ex:
            logging.error("Can not connect to %s", self.arthur_url)
            raise RuntimeError("Can not connect to " + self.arthur_url)

        task_ids = [task['task_id'] for task in r.json()['tasks']]
        new_task_ids = [
            task['task_id'] for task in arthur_repo_json['tasks']
        ]
        # TODO: if a tasks already exists maybe we should delete and readd it
        already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
        if len(already_tasks) > 0:
            logger.warning(
                "Tasks not added to arthur because there are already existing tasks %s",
                already_tasks)
        else:
            r = requests.post(self.arthur_url + "/add",
                              json=arthur_repo_json)
            r.raise_for_status()
            logger.info('[%s] collection configured in arthur for %s',
                        self.backend_section, repo)

    def collect_arthur_items(repo):
        # Feed into the raw index whatever items Arthur already collected
        aitems = self.__feed_backend_arthur(repo)
        if not aitems:
            return
        connector = get_connector_from_name(self.backend_section)
        klass = connector[1]  # Ocean backend for the connector
        ocean_backend = klass(None)
        es_col_url = self._get_collection_url()
        es_index = self.conf[self.backend_section]['raw_index']
        clean = False
        elastic_ocean = get_elastic(es_col_url, es_index, clean,
                                    ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.feed(arthur_items=aitems)

    cfg = self.config.get_conf()

    # The section may explicitly disable collection
    if 'collect' in cfg[self.backend_section] and not cfg[
            self.backend_section]['collect']:
        logging.info('%s collect disabled', self.backend_section)
        return

    # Global Elasticsearch tuning knobs from the [general] section
    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    logger.info('Programming arthur for [%s] raw data collection',
                self.backend_section)
    clean = False

    # NOTE(review): fetch_archive is computed here but never used below —
    # presumably consumed when composing the Arthur task; confirm.
    fetch_archive = False
    if 'fetch-archive' in self.conf[self.backend_section] and self.conf[
            self.backend_section]['fetch-archive']:
        fetch_archive = True

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section)
    if not repos:
        logger.warning("No collect repositories for %s", self.backend_section)

    for repo in repos:
        # If the repo already exists don't try to add it to arthur
        tag = self.backend_tag(repo)
        if tag not in self.arthur_items:
            self.arthur_items[tag] = []

        repo, repo_labels = self._extract_repo_labels(
            self.backend_section, repo)
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args[
            'filter-raw'] if 'filter-raw' in p2o_args else None

        if filter_raw:
            # If filter-raw exists it means that there is an equivalent URL
            # in the `unknown` section of the projects.json. Thus the URL with
            # filter-raw is ignored in the collection phase, while the URL
            # in `unknown` is considered in this phase.
            logging.warning("Not collecting filter raw repository: %s", repo)
            continue

        backend_args = self._compose_perceval_params(
            self.backend_section, repo)
        logger.debug(backend_args)

        check_arthur_task(repo, backend_args)
        collect_arthur_items(repo)
def __enrich_items(self): time_start = time.time() # logger.info('%s starts for %s ', 'enrichment', self.backend_section) logger.info('[%s] enrichment starts', self.backend_section) print("Enrichment for {}: starting...".format(self.backend_section)) cfg = self.config.get_conf() if 'scroll_size' in cfg['general']: ElasticItems.scroll_size = cfg['general']['scroll_size'] if 'bulk_size' in cfg['general']: ElasticSearch.max_items_bulk = cfg['general']['bulk_size'] no_incremental = False github_token = None pair_programming = False if 'github' in cfg and 'backend_token' in cfg['github']: github_token = cfg['github']['backend_token'] if 'git' in cfg and 'pair-programming' in cfg['git']: pair_programming = cfg['git']['pair-programming'] only_studies = False only_identities = False # repos could change between executions because changes in projects repos = TaskProjects.get_repos_by_backend_section(self.backend_section) if not repos: logger.warning("No enrich repositories for %s", self.backend_section) for repo in repos: # First process p2o params from repo p2o_args = self._compose_p2o_params(self.backend_section, repo) filter_raw = p2o_args[ 'filter-raw'] if 'filter-raw' in p2o_args else None filters_raw_prefix = p2o_args[ 'filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None jenkins_rename_file = p2o_args[ 'jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None url = p2o_args['url'] # Second process perceval params from repo backend_args = self._compose_perceval_params( self.backend_section, url) studies_args = None if 'studies' in self.conf[self.backend_section] and \ self.conf[self.backend_section]['studies']: studies_args = self.__load_studies() try: es_col_url = self._get_collection_url() logger.debug('[%s] enrichment starts for %s', self.backend_section, repo) backend = self.get_backend(self.backend_section) enrich_backend( es_col_url, self.clean, backend, backend_args, cfg[self.backend_section]['raw_index'], 
cfg[self.backend_section]['enriched_index'], None, # projects_db is deprecated cfg['projects']['projects_file'], cfg['sortinghat']['database'], no_incremental, only_identities, github_token, False, # studies are executed in its own Task only_studies, cfg['es_enrichment']['url'], None, # args.events_enrich cfg['sortinghat']['user'], cfg['sortinghat']['password'], cfg['sortinghat']['host'], None, # args.refresh_projects, None, # args.refresh_identities, author_id=None, author_uuid=None, filter_raw=filter_raw, filters_raw_prefix=filters_raw_prefix, jenkins_rename_file=jenkins_rename_file, unaffiliated_group=cfg['sortinghat']['unaffiliated_group'], pair_programming=pair_programming, studies_args=studies_args) except Exception as ex: logger.error( "Something went wrong producing enriched data for %s . " "Using the backend_args: %s ", self.backend_section, str(backend_args)) logger.error("Exception: %s", ex) raise DataEnrichmentError( 'Failed to produce enriched data for ' + self.backend_section) # Let's try to create the aliases for the enriched index if not self.enrich_aliases: logger.debug("Creating aliases after enrich") task_aliases = TaskPanelsAliases(self.config) task_aliases.set_backend_section(self.backend_section) task_aliases.execute() logger.debug("Done creating aliases after enrich") self.enrich_aliases = True spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start)) logger.info('[%s] enrichment finished in %s', self.backend_section, spent_time) print("Enrichment for {}: finished after {} hours".format( self.backend_section, spent_time))
def __enrich_items(self):
    """Enrich the raw items of this backend section into the enriched index.

    Reads the last enrichment timestamp from the enriched index so the
    process stays incremental even for data sources that are collected
    globally but enriched per repository, then enriches each repository
    through `enrich_backend`.

    :raises DataEnrichmentError: if enrichment fails for any repository.
    """

    time_start = datetime.now()

    logger.info('[%s] enrichment phase starts', self.backend_section)

    cfg = self.config.get_conf()

    # Global tuning knobs for the ES access layer.
    if 'scroll_size' in cfg['general']:
        ElasticItems.scroll_size = cfg['general']['scroll_size']

    if 'bulk_size' in cfg['general']:
        ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

    no_incremental = False
    # not used due to https://github.com/chaoss/grimoirelab-elk/pull/773
    github_token = None
    pair_programming = False
    node_regex = None
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']
    if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
        node_regex = cfg['jenkins']['node_regex']
    only_studies = False
    only_identities = False

    # repos could change between executions because changes in projects
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section, raw=False)

    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    # Get the metadata__timestamp value of the last item inserted in the enriched index before
    # looping over the repos which data is stored in the same index. This is needed to make sure
    # that the incremental enrichment works for data sources that are collected globally but only
    # partially enriched.
    elastic_enrich = get_elastic(cfg['es_enrichment']['url'], cfg[self.backend_section]['enriched_index'])
    last_enrich_date = elastic_enrich.get_last_item_field("metadata__timestamp")
    if last_enrich_date:
        # Drop the timezone: downstream comparisons expect naive datetimes.
        last_enrich_date = last_enrich_date.replace(tzinfo=None)

    for repo in repos:
        # Split the repo URL from its labels in projects.json; the labels
        # are forwarded to enrich_backend as `repo_labels`.
        repo, repo_labels = self._extract_repo_labels(self.backend_section, repo)
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
        jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
        url = p2o_args['url']
        # Second process perceval params from repo
        backend_args = self._compose_perceval_params(self.backend_section, url)
        studies_args = None

        backend = self.get_backend(self.backend_section)
        if 'studies' in self.conf[self.backend_section] and \
                self.conf[self.backend_section]['studies']:
            studies_args = self.__load_studies()

        logger.info('[%s] enrichment starts for %s', self.backend_section, self.anonymize_url(repo))
        es_enrich_aliases = self.select_aliases(cfg, self.backend_section)
        try:
            es_col_url = self._get_collection_url()
            # NOTE: the leading arguments are positional — their order must
            # match the `enrich_backend` signature exactly; keep the inline
            # comments next to the `None`/`False` placeholders up to date.
            enrich_backend(es_col_url, self.clean, backend, backend_args,
                           self.backend_section,
                           cfg[self.backend_section]['raw_index'],
                           cfg[self.backend_section]['enriched_index'],
                           None,  # projects_db is deprecated
                           cfg['projects']['projects_file'],
                           self.db_sh,
                           no_incremental, only_identities,
                           github_token,
                           False,  # studies are executed in its own Task
                           only_studies,
                           cfg['es_enrichment']['url'],
                           None,  # args.events_enrich
                           self.db_user,
                           self.db_password,
                           self.db_host,
                           None,  # args.refresh_projects,
                           None,  # args.refresh_identities,
                           author_id=None,
                           author_uuid=None,
                           filter_raw=filter_raw,
                           jenkins_rename_file=jenkins_rename_file,
                           unaffiliated_group=self.db_unaffiliate_group,
                           pair_programming=pair_programming,
                           node_regex=node_regex,
                           studies_args=studies_args,
                           es_enrich_aliases=es_enrich_aliases,
                           last_enrich_date=last_enrich_date,
                           projects_json_repo=repo,
                           repo_labels=repo_labels)
        except Exception as ex:
            logger.error("Something went wrong producing enriched data for %s . "
                         "Using the backend_args: %s ", self.backend_section, str(backend_args))
            logger.error("Exception: %s", ex)
            raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

        logger.info('[%s] enrichment finished for %s', self.backend_section, self.anonymize_url(repo))

    # str(timedelta) trimmed at the '.' drops the microseconds part.
    spent_time = str(datetime.now() - time_start).split('.')[0]
    logger.info('[%s] enrichment phase finished in %s', self.backend_section, spent_time)