def get_rich_items(self, item):
    """Build one enriched item per analyzed file in a CoLic raw item.

    :category: code_license_scancode_cli(default)

    :param item: raw item; its 'data' payload carries a commit plus a
        per-file license 'analysis' list
    :returns: list of enriched items, one per entry in data['analysis']
    """
    # Dispatch to the per-file enricher matching the backend that
    # produced this item (nomos vs. scancode).
    if item["category"] == "code_license_nomos":
        make_rich = self.__get_rich_nomossa
    else:
        make_rich = self.__get_rich_scancode

    commit = item['data']
    rich_items = []

    for file_analysis in commit["analysis"]:
        eitem = make_rich(file_analysis)

        # Copy raw metadata fields, defaulting to None when absent.
        for field in self.RAW_FIELDS_COPY:
            eitem[field] = item[field] if field in item else None

        # common attributes
        eitem['author'] = commit['Author']
        eitem['author_date'] = fix_field_date(commit['AuthorDate'])
        eitem["category"] = item["category"]
        eitem['commit'] = commit['commit']
        eitem['committer'] = commit['Commit']
        eitem['commit_date'] = fix_field_date(commit['CommitDate'])
        eitem['commit_sha'] = commit['commit']
        eitem['message'] = commit.get('message', None)

        # Other enrichment: store the repo URL with any credentials stripped.
        eitem["repo_url"] = item["origin"]
        if eitem["repo_url"].startswith('http'):
            eitem["repo_url"] = ElasticSearch.anonymize_url(eitem["repo_url"])

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        # uuid: one enriched doc per (commit, file) pair
        eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path'])
        eitem.update(self.get_grimoire_fields(commit["AuthorDate"], "file"))
        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)
        rich_items.append(eitem)

    return rich_items
def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                          out_index="colic_enrich_graal_repo",
                          interval_months=[3],
                          date_field="grimoire_creation_date"):
    """Run the CoLic evolution study and upload its items to `out_index`.

    For each repository found in the enriched index, count licensed,
    copyrighted and total files at monthly checkpoints (one time series
    per value in `interval_months`) and bulk-upload the results.

    :param ocean_backend: raw ES backend (not read here; part of the study API)
    :param enrich_backend: enriched ES backend the study reads from
    :param no_incremental: not read here; part of the study API
    :param out_index: index the study writes its evolution items to
    :param interval_months: month intervals to compute series for
    :param date_field: not read here; part of the study API
    """
    logger.info("[colic] study enrich-colic-analysis start")

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())

    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]

    logger.info("[colic] study enrich-colic-analysis {} repositories to process".format(len(repositories)))
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("colic_study")

    # Checkpoints only run up to (not including) the current month.
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        # Strip any credentials from the URL before logging/storing it.
        repository_url_anonymized = repository_url
        if repository_url_anonymized.startswith('http'):
            repository_url_anonymized = ElasticSearch.anonymize_url(repository_url_anonymized)

        logger.info("[colic] study enrich-colic-analysis start analysis for {}".format(repository_url_anonymized))
        evolution_items = []

        for interval in interval_months:
            # Resume from the last checkpoint already stored in out_index.
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            # NOTE(review): the interval value is reused as the starting month
            # number (e.g. interval 3 -> March) — mirrors the sibling CoCom
            # study, but looks intentional only by convention; confirm upstream.
            to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

            while to_month < current_month:
                # Three aggregation queries per checkpoint: copyrighted,
                # licensed and total file counts at that point in time.
                copyrighted_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_copyrighted_files(repository_url, to_month.isoformat()))
                licensed_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_licensed_files(repository_url, to_month.isoformat()))
                files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_total_files(repository_url, to_month.isoformat()))

                licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
                copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
                total_files = int(files_at_time["aggregations"]["1"]["value"])

                # No files at this checkpoint: skip to the next interval step.
                if not total_files:
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(), hash(repository_url_anonymized), interval),
                    "repo_url": repository_url_anonymized,
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "licensed_files": licensed_files,
                    "copyrighted_files": copyrighted_files,
                    "total_files": total_files
                }

                evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                # Flush to ES whenever the in-memory batch reaches the bulk limit.
                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            # Flush whatever remains for this interval.
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

            # Report any mismatch between items produced and items indexed.
            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error(
                    "[colic] study enrich-colic-analysis {}/{} missing items for Graal CoLic Analysis "
                    "Study".format(missing, num_items))
            else:
                logger.info(
                    "[colic] study enrich-colic-analysis {} items inserted for Graal CoLic Analysis "
                    "Study".format(num_items))

        logger.info(
            "[colic] study enrich-colic-analysis end analysis for {} with month interval"
            .format(repository_url_anonymized))

    logger.info("[colic] study enrich-colic-analysis end")
def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                          out_index="cocom_enrich_graal_repo",
                          interval_months=None,
                          date_field="grimoire_creation_date"):
    """Run the CoCom evolution study and upload its items to `out_index`.

    For each repository found in the enriched index, aggregate the
    per-file code-complexity metrics (`self.metrics`) at monthly
    checkpoints (one time series per value in `interval_months`) and
    bulk-upload the resulting evolution items.

    :param ocean_backend: raw ES backend (not read here; part of the study API)
    :param enrich_backend: enriched ES backend the study reads from
    :param no_incremental: not read here; part of the study API
    :param out_index: index the study writes its evolution items to
    :param interval_months: month intervals to compute series for;
        defaults to [3] (kept backward-compatible via a None sentinel to
        avoid a mutable default argument)
    :param date_field: not read here; part of the study API
    """
    logger.info("[cocom] study enrich-cocom-analysis start")

    # Avoid a mutable default argument; [3] is the historical default.
    interval_months = [3] if interval_months is None else interval_months

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())

    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]
    # Checkpoints only run up to (not including) the current month.
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    logger.info("[cocom] study enrich-cocom-analysis {} repositories to process".format(
        len(repositories)))
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("cocom_study")

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        # Strip any credentials from the URL before logging/storing it.
        repository_url_anonymized = repository_url
        if repository_url_anonymized.startswith('http'):
            repository_url_anonymized = ElasticSearch.anonymize_url(repository_url_anonymized)

        logger.info("[cocom] study enrich-cocom-analysis start analysis for {}".format(
            repository_url_anonymized))
        evolution_items = []

        for interval in interval_months:
            # Resume from the last checkpoint already stored in out_index.
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            # NOTE(review): the interval value is reused as the starting month
            # number (e.g. interval 3 -> March) — mirrors the sibling CoLic
            # study; confirm upstream that this is intentional.
            to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

            while to_month < current_month:
                files_at_time = es_in.search(
                    index=in_index,
                    body=get_files_at_time(repository_url, to_month.isoformat())
                )['aggregations']['file_stats'].get("buckets", [])

                # No files at this checkpoint: skip to the next interval step.
                if not len(files_at_time):
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(), hash(repository_url_anonymized), interval),
                    "repo_url": repository_url_anonymized,
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "total_files": len(files_at_time)
                }

                # Sum each metric over the most recent document of every file;
                # missing (None) metric values count as 0.
                for file_ in files_at_time:
                    file_details = file_["1"]["hits"]["hits"][0]["_source"]

                    for metric in self.metrics:
                        total_metric = "total_" + metric
                        evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                        evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0

                # Ratios guarded against zero denominators.
                # TODO: Fix Logic: None rather than 1
                evolution_item["total_comments_per_loc"] = round(
                    evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_blanks_per_loc"] = round(
                    evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_loc_per_function"] = round(
                    evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

                evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                # Flush to ES whenever the in-memory batch reaches the bulk limit.
                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            # Flush whatever remains for this interval.
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

            # Report any mismatch between items produced and items indexed.
            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error(
                    "[cocom] study enrich-cocom-analysis {}/{} missing items for Graal CoCom Analysis "
                    "Study".format(missing, num_items)
                )
            else:
                logger.info(
                    "[cocom] study enrich-cocom-analysis {} items inserted for Graal CoCom Analysis "
                    "Study".format(num_items)
                )

        logger.info(
            "[cocom] study enrich-cocom-analysis End analysis for {} with month interval".format(
                repository_url_anonymized)
        )

    logger.info("[cocom] study enrich-cocom-analysis End")