def process_contributors_data(self, contributors: List[str]):
    """Build per-reviewer review statistics from the stored pull requests.

    Pull requests are read from ``self.pull_requests`` (keys are converted
    with ``int`` for ordering and back with ``str`` for lookup) and visited in
    ascending numeric order. Each one is handed to
    ``_analyze_pr_for_contributor_data`` and
    ``_analyze_contributors_interaction``, which fill the result and the
    interaction matrix in place.

    Args:
        contributors: Names used as both rows and columns of the interaction
            matrix; every cell starts at ``0``.

    Returns:
        A dict holding ``"reviewers"`` and ``"created_dts"`` lists plus one
        entry per reviewer. For every reviewer listed under ``"reviewers"``
        the entry is completed with ``number_reviews``,
        ``median_review_length``, ``last_review_time``, ``median_pr_length``,
        ``median_pr_length_score`` and ``interactions``.
    """
    ordered_ids = sorted(int(key) for key in self.pull_requests.keys())

    summary: Dict[str, Any] = {"reviewers": [], "created_dts": []}

    # One zero-initialised row per contributor, with a column for each contributor.
    interaction_matrix = {name: dict.fromkeys(contributors, 0) for name in contributors}

    for number in ordered_ids:
        pull_request = self.pull_requests[str(number)]
        self._analyze_pr_for_contributor_data(
            pr_id=number,
            pr=pull_request,
            extracted_data=summary,
        )
        self._analyze_contributors_interaction(
            pr_interactions=pull_request["interactions"],
            pr_author=pull_request["created_by"],
            interactions_data=interaction_matrix,
        )

    for reviewer in summary["reviewers"]:
        # NOTE(review): the per-reviewer entry is presumably created by
        # _analyze_pr_for_contributor_data -- confirm against that helper.
        entry = summary[reviewer]

        review_total = 0
        words_per_group = []  # one word total per group of reviews
        submission_times = []  # every review's submission time, flattened

        for group in entry["reviews"].values():
            review_total += len(group)
            group_words = 0
            for item in group:
                group_words += item["words_count"]
                submission_times.append(item["submitted_at"])
            words_per_group.append(group_words)

        latest_submission = max(submission_times)

        entry["number_reviews"] = review_total
        entry["median_review_length"] = np.median(words_per_group)
        entry["last_review_time"] = latest_submission

        # Encode Pull Request sizes for the contributor
        encoded_sizes = [convert_score2num(label=size_label) for size_label in entry["PRs_size"]]
        median_size_label, median_size_score = convert_num2label(score=np.median(encoded_sizes))

        entry["median_pr_length"] = median_size_label
        entry["median_pr_length_score"] = median_size_score
        entry["interactions"] = interaction_matrix[reviewer]

    return summary
def process_prs_project_data(self) -> Dict[str, Any]:
    """Collect project-wide review statistics over the closed pull requests.

    Pull requests are read from ``self.pull_requests`` and visited in
    ascending numeric order of their keys. Entries whose ``closed_at`` is
    ``None`` are skipped; every other entry is passed to
    ``_analyze_pr_for_project_data``, which fills the result in place.

    Returns:
        ``{}`` when there are no pull requests at all or when none of them is
        closed. Otherwise a dict with the list-valued keys initialised below
        plus ``last_review_time`` (``None`` if no review timestamp was
        collected), ``median_pr_length`` and ``median_pr_length_score``.
    """
    if not self.pull_requests:
        return {}

    project_reviews_data: Dict[str, Any] = {
        "contributors": [],
        "ids": [],
        "created_dts": [],
        "reviews_dts": [],
        "TTFR": [],  # Time to First Review (TTFR) [hr]
        "MTTFR": [],  # Median TTFR [hr]
        "TTR": [],  # Time to Review (TTR) [hr]
        "MTTR": [],  # Median TTR [hr]
        "MTTCI": [],  # Median TTCI [hr]
        "PRs_size": [],  # Pull Request length
        "encoded_PRs_size": [],  # Pull Request length encoded
    }

    analysed_any = False
    for pr_number in sorted(int(key) for key in self.pull_requests):
        # The helper receives the string form of the id, as before.
        pr_id = str(pr_number)
        pr = self.pull_requests[pr_id]

        if pr["closed_at"] is None:
            continue
        analysed_any = True

        if pr["created_by"] not in project_reviews_data["contributors"]:
            project_reviews_data["contributors"].append(pr["created_by"])

        self._analyze_pr_for_project_data(pr_id=pr_id, pr=pr, extracted_data=project_reviews_data)

    if not analysed_any:
        # Only open pull requests: nothing was analysed, so report the same
        # empty result as for "no pull requests" instead of failing on max([]).
        return {}

    # default=None keeps closed-but-never-reviewed projects from raising ValueError.
    project_reviews_data["last_review_time"] = max(project_reviews_data["reviews_dts"], default=None)

    # Encode Pull Request sizes for the project
    # NOTE(review): assumes _analyze_pr_for_project_data appends to
    # "encoded_PRs_size" for each analysed PR -- confirm.
    project_pr_median_size, project_length_score = convert_num2label(
        score=np.median(project_reviews_data["encoded_PRs_size"])
    )
    project_reviews_data["median_pr_length"] = project_pr_median_size
    project_reviews_data["median_pr_length_score"] = project_length_score

    return project_reviews_data
def pre_process_project_data(data: Dict[str, Any]):
    """Collect project-wide review statistics from a mapping of pull requests.

    Args:
        data: Pull requests keyed by an id that ``int`` can parse. Each value
            must provide ``created_by`` and is passed on to
            ``analyze_pr_for_project_data``.

    Returns:
        ``{}`` when ``data`` is empty. Otherwise a dict with the list-valued
        keys initialised below plus ``last_review_time``,
        ``median_pr_length`` and ``median_pr_length_score``.
    """
    if not data:
        return {}

    ordered_ids = sorted(int(key) for key in data.keys())

    summary = {
        "contributors": [],
        "ids": [],
        "created_dts": [],
        "reviews_dts": [],
        "TTFR": [],  # Time to First Review (TTFR) [hr]
        "MTTFR": [],  # Median TTFR [hr]
        "TTR": [],  # Time to Review (TTR) [hr]
        "MTTR": [],  # Median TTR [hr]
        "PRs_size": [],  # Pull Request length
        "encoded_PRs_size": [],  # Pull Request length encoded
    }

    for number in ordered_ids:
        pull_request = data[str(number)]

        # Record each author once, in order of first appearance.
        known_authors = summary["contributors"]
        if pull_request["created_by"] not in known_authors:
            known_authors.append(pull_request["created_by"])

        analyze_pr_for_project_data(pr_id=number, pr=pull_request, extracted_data=summary)

    summary["last_review_time"] = max(summary["reviews_dts"])

    # Encode Pull Request sizes for the project
    median_size = np.median(summary["encoded_PRs_size"])
    size_label, size_score = convert_num2label(score=median_size)
    summary["median_pr_length"] = size_label
    summary["median_pr_length_score"] = size_score

    return summary