def test_add_aliases_duplicated(self):
    """Test whether an alias isn't added when already present in a given index"""
    es = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping,
                       aliases=['A', 'B', 'C'])
    expected = {name: {} for name in ('A', 'B', 'C')}
    # All three aliases requested at creation time must be listed.
    self.assertDictEqual(es.list_aliases(), expected)
    # Re-adding an alias that already exists must be a no-op.
    es.add_alias('C')
    self.assertDictEqual(es.list_aliases(), expected)
def test_add_aliases(self):
    """Test whether an alias is added to a given index"""
    es = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping,
                       aliases=['A', 'B', 'C'])
    before = {name: {} for name in ('A', 'B', 'C')}
    # The aliases passed at creation time must be listed.
    self.assertDictEqual(es.list_aliases(), before)
    # Adding a brand new alias must extend the alias set.
    es.add_alias('D')
    after = dict(before)
    after['D'] = {}
    self.assertDictEqual(es.list_aliases(), after)
def enrich_cocom_analysis(self, ocean_backend, enrich_backend,
                          no_incremental=False,
                          out_index="cocom_enrich_graal_repo",
                          interval_months=[3],
                          date_field="grimoire_creation_date"):
    """Build a study index with the evolution of code-complexity metrics.

    For every repository found in the enriched index, and for every interval
    (in months), one "stats" item per interval step is written to `out_index`
    with the totals of each metric in `self.metrics` plus derived ratios
    (comments/LOC, blanks/LOC, LOC/function).

    :param ocean_backend: ocean backend (unused here, part of the study API)
    :param enrich_backend: enrich backend; provides the ES URL and input index
    :param no_incremental: unused here, part of the study API
    :param out_index: name of the index where study items are written
    :param interval_months: list of month intervals to compute evolution for
    :param date_field: unused here, part of the study API
    """
    logger.info("[cocom] study enrich-cocom-analysis start")

    # Client used to *read* from the enriched index.
    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    # Values may come from setup.cfg as strings; normalize to ints.
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]
    # Study stops at the first day of the current month.
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    logger.info("[cocom] study enrich-cocom-analysis {} repositories to process".format(
        len(repositories)))

    # Output index for the study items, aliased for dashboards.
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("cocom_study")

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        logger.info("[cocom] study enrich-cocom-analysis start analysis for {}".format(
            repository_url))
        evolution_items = []

        for interval in interval_months:
            # Resume point: last study date already present in out_index.
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            # NOTE(review): the month is set to the interval value itself —
            # looks intentional upstream but worth confirming.
            to_month = to_month.replace(month=int(interval), day=1, hour=0,
                                        minute=0, second=0)

            while to_month < current_month:
                # Per-file stats snapshot at this point in time.
                files_at_time = es_in.search(
                    index=in_index,
                    body=get_files_at_time(repository_url, to_month.isoformat())
                )['aggregations']['file_stats'].get("buckets", [])

                if not len(files_at_time):
                    # No data for this month: skip to the next interval step.
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                repository_name = repository_url.split("/")[-1]
                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "total_files": len(files_at_time)
                }

                # Accumulate each metric over every file bucket; missing
                # (None) metric values count as 0.
                for file_ in files_at_time:
                    file_details = file_["1"]["hits"]["hits"][0]["_source"]

                    for metric in self.metrics:
                        total_metric = "total_" + metric
                        evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                        evolution_item[total_metric] += file_details[metric] \
                            if file_details[metric] is not None else 0

                # TODO: Fix Logic: None rather than 1
                # Derived ratios; max(..., 1) avoids division by zero.
                evolution_item["total_comments_per_loc"] = round(
                    evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_blanks_per_loc"] = round(
                    evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_loc_per_function"] = round(
                    evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

                evolution_item.update(self.get_grimoire_fields(
                    evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                # Flush to ES in chunks to bound memory usage.
                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            # Flush any leftover items for this interval.
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error(
                    "[cocom] study enrich-cocom-analysis {}/{} missing items for Graal CoCom Analysis "
                    "Study".format(missing, num_items))
            else:
                logger.info(
                    "[cocom] study enrich-cocom-analysis {} items inserted for Graal CoCom Analysis "
                    "Study".format(num_items))

        logger.info(
            "[cocom] study enrich-cocom-analysis End analysis for {} with month interval"
            .format(repository_url))

    logger.info("[cocom] study enrich-cocom-analysis End")
def enrich_backlog_analysis(self, ocean_backend, enrich_backend,
                            no_incremental=False,
                            out_index="github_enrich_backlog",
                            date_field="grimoire_creation_date",
                            interval_days=1, reduced_labels=["bug"],
                            map_label=["others", "bugs"]):
    """
    The purpose of this study is to add additional index to compute the
    chronological evolution of opened issues and average opened time issues.

    For each repository and label, we start the study on repository
    creation date until today with a day interval (default). For each date
    we retrieve the number of open issues at this date by difference between
    number of opened issues and number of closed issues. In addition, we
    compute the average opened time for all issues open at this date.

    To differentiate by label, we compute evolution for bugs and all others
    labels (like "enhancement","good first issue" ... ), we call this
    "reduced labels". We need to use theses reduced labels because the
    complexity to compute evolution for each combination of labels would be
    too big. In addition, we can rename "bug" label to "bugs" with map_label.

    Entry example in setup.cfg :

    [github]
    raw_index = github_issues_raw
    enriched_index = github_issues_enriched
    ...
    studies = [enrich_backlog_analysis]

    [enrich_backlog_analysis]
    out_index = github_enrich_backlog
    interval_days = 7
    reduced_labels = [bug,enhancement]
    map_label = [others, bugs, enhancements]
    """
    logger.info("[github] Start enrich_backlog_analysis study")

    # combine two lists to create the dict to map labels
    # ("" -> first entry, each reduced label -> following entries)
    map_label = dict(zip([""] + reduced_labels, map_label))

    # connect to ES
    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index

    # get all repositories
    unique_repos = es_in.search(
        index=in_index, body=get_unique_repository_with_project_name())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]

    logger.debug("[enrich-backlog-analysis] {} repositories to process".format(
        len(repositories)))

    # create the output index, aliased for dashboards
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("backlog_study")

    # analysis for each repository
    num_items = 0
    ins_items = 0
    for repository in repositories:
        repository_url = repository["origin"]
        project = repository["project"]
        org_name = repository["organization"]
        repository_name = repository_url.split("/")[-1]
        logger.debug("[enrich-backlog-analysis] Start analysis for {}".format(
            repository_url))

        # get each day since repository creation
        dates = es_in.search(
            index=in_index,
            body=get_issues_dates(interval_days, repository_url)
        )['aggregations']['created_per_interval'].get("buckets", [])

        # for each selected label + others labels:
        # ("", True) computes the bucket of all non-reduced labels
        for label, other in [("", True)] + \
                [(reduced_label, False) for reduced_label in reduced_labels]:
            # compute metrics for each day (ES request for each day)
            evolution_items = []
            for date in map(lambda bucket: bucket['key_as_string'], dates):
                evolution_item = self.__create_backlog_item(
                    repository_url, repository_name, project, date, org_name,
                    interval_days, label, map_label,
                    self.__get_opened_issues(es_in, in_index, repository_url,
                                             date, interval_days, other,
                                             label, reduced_labels))
                evolution_items.append(evolution_item)

            # BUG FIX: with no creation dates the extrapolation below used an
            # unbound `evolution_item` and raised NameError; skip instead.
            if not evolution_items:
                continue

            # complete until today (no ES request needed, just extrapolate
            # from the last computed item)
            today = datetime.now().replace(hour=0, minute=0, second=0, tzinfo=None)
            last_item = evolution_item
            last_date = str_to_datetime(
                evolution_item['study_creation_date']).replace(tzinfo=None) \
                + relativedelta(days=interval_days)
            average_opened_time = evolution_item['average_opened_time'] \
                + float(interval_days)
            while last_date < today:
                date = last_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                evolution_item = {}
                evolution_item.update(last_item)
                evolution_item.update({
                    "average_opened_time": average_opened_time,
                    "study_creation_date": date,
                    "uuid": "{}_{}_{}".format(date, repository_name, label),
                })
                evolution_item.update(self.get_grimoire_fields(date, "stats"))
                evolution_items.append(evolution_item)
                last_date = last_date + relativedelta(days=interval_days)
                average_opened_time = average_opened_time + float(interval_days)

            # upload items to ES
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

    if num_items != ins_items:
        missing = num_items - ins_items
        # BUG FIX: the message and its % arguments were wrapped in a tuple,
        # so logging printed the tuple repr instead of the formatted message.
        logger.error(
            "[enrich-backlog-analysis] %s/%s missing items "
            "for Graal Backlog Analysis Study", missing, num_items)
    else:
        logger.debug(
            "[enrich-backlog-analysis] %s items inserted "
            "for Graal Backlog Analysis Study", num_items)

    logger.info("[github] End enrich_backlog_analysis study")
def enrich_colic_analysis(self, ocean_backend, enrich_backend,
                          no_incremental=False,
                          out_index="colic_enrich_graal_repo",
                          interval_months=[3],
                          date_field="grimoire_creation_date"):
    """Build a study index with the evolution of license/copyright coverage.

    For every repository found in the enriched index, and for every interval
    (in months), one "stats" item per interval step is written to `out_index`
    with the counts of licensed files, copyrighted files and total files at
    that point in time.

    :param ocean_backend: ocean backend (unused here, part of the study API)
    :param enrich_backend: enrich backend; provides the ES URL and input index
    :param no_incremental: unused here, part of the study API
    :param out_index: name of the index where study items are written
    :param interval_months: list of month intervals to compute evolution for
    :param date_field: unused here, part of the study API
    """
    logger.info("[colic] study enrich-colic-analysis start")

    # Client used to *read* from the enriched index.
    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    # Values may come from setup.cfg as strings; normalize to ints.
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]

    logger.info("[colic] study enrich-colic-analysis {} repositories to process".format(
        len(repositories)))

    # Output index for the study items, aliased for dashboards.
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("colic_study")
    # Study stops at the first day of the current month.
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        # URLs may embed credentials; anonymize before logging/storing.
        repository_url_anonymized = repository_url
        if repository_url_anonymized.startswith('http'):
            repository_url_anonymized = anonymize_url(repository_url_anonymized)

        logger.info("[colic] study enrich-colic-analysis start analysis for {}".format(
            repository_url_anonymized))
        evolution_items = []

        for interval in interval_months:
            # Resume point: last study date already present in out_index.
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            # NOTE(review): the month is set to the interval value itself —
            # looks intentional upstream but worth confirming.
            to_month = to_month.replace(month=int(interval), day=1, hour=0,
                                        minute=0, second=0)

            while to_month < current_month:
                # Three aggregations: copyrighted files, licensed files and
                # total files at this point in time.
                copyrighted_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_copyrighted_files(repository_url,
                                                      to_month.isoformat()))
                licensed_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_licensed_files(repository_url,
                                                   to_month.isoformat()))
                files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_total_files(repository_url,
                                                to_month.isoformat()))

                licensed_files = int(
                    licensed_files_at_time["aggregations"]["1"]["value"])
                copyrighted_files = int(
                    copyrighted_files_at_time["aggregations"]["1"]["value"])
                total_files = int(
                    files_at_time["aggregations"]["1"]["value"])

                if not total_files:
                    # No data for this month: skip to the next interval step.
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(),
                                            hash(repository_url_anonymized),
                                            interval),
                    "repo_url": repository_url_anonymized,
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "licensed_files": licensed_files,
                    "copyrighted_files": copyrighted_files,
                    "total_files": total_files
                }

                evolution_item.update(self.get_grimoire_fields(
                    evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                # Flush to ES in chunks to bound memory usage.
                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            # Flush any leftover items for this interval.
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error(
                    "[colic] study enrich-colic-analysis {}/{} missing items for Graal CoLic Analysis "
                    "Study".format(missing, num_items))
            else:
                logger.info(
                    "[colic] study enrich-colic-analysis {} items inserted for Graal CoLic Analysis "
                    "Study".format(num_items))

        logger.info(
            "[colic] study enrich-colic-analysis end analysis for {} with month interval"
            .format(repository_url_anonymized))

    logger.info("[colic] study enrich-colic-analysis end")