def process_contributors_data(self, contributors: List[str]):
    """Pre-process review data for the contributors of a project repository.

    Pull requests stored in ``self.pull_requests`` are visited in ascending
    numeric id order. For each one, per-PR extraction is delegated to
    ``_analyze_pr_for_contributor_data`` and interaction counting to
    ``_analyze_contributors_interaction``. Afterwards summary statistics are
    added for every name listed under ``"reviewers"``.

    :param contributors: names used to build the contributor-by-contributor
        interaction table; every counter starts at 0.
    :return: dictionary holding ``"reviewers"``, ``"created_dts"`` and one
        entry per reviewer. The per-reviewer entries are expected to be
        created by the per-PR helper; this method only adds keys to them.
    """
    summary: Dict[str, Any] = {"reviewers": [], "created_dts": []}

    # One independent row of zeroed counters per contributor.
    interaction_table = {name: dict.fromkeys(contributors, 0) for name in contributors}

    # Keys are stored as strings, so sort numerically ("2" before "10").
    for number in sorted(map(int, self.pull_requests)):
        pull_request = self.pull_requests[str(number)]

        self._analyze_pr_for_contributor_data(pr_id=number, pr=pull_request, extracted_data=summary)
        self._analyze_contributors_interaction(
            pr_interactions=pull_request["interactions"],
            pr_author=pull_request["created_by"],
            interactions_data=interaction_table,
        )

    for reviewer in summary["reviewers"]:
        stats = summary[reviewer]

        # Each value groups the reviews this reviewer left under one key.
        grouped_reviews = list(stats["reviews"].values())

        words_per_group = [sum(item["words_count"] for item in group) for group in grouped_reviews]
        submission_times = [item["submitted_at"] for group in grouped_reviews for item in group]
        latest_submission = max(submission_times)

        stats["number_reviews"] = sum(len(group) for group in grouped_reviews)
        stats["median_review_length"] = np.median(words_per_group)
        stats["last_review_time"] = latest_submission

        # Encode Pull Request size labels numerically so a median can be taken.
        encoded_sizes = [convert_score2num(label=size_label) for size_label in stats["PRs_size"]]
        median_size_label, relative_score = convert_num2label(score=np.median(encoded_sizes))

        stats["median_pr_length"] = median_size_label
        stats["median_pr_length_score"] = relative_score
        stats["interactions"] = interaction_table[reviewer]

    return summary
def _analyze_pr_for_project_data(pr_id: int, pr: Dict[str, Any], extracted_data: Dict[str, Any]): """Extract project data from Pull Request.""" if not pr["reviews"]: return extracted_data # Consider all approved reviews pr_approved_dt = [ datetime.fromtimestamp(review["submitted_at"]) for review in pr["reviews"].values() if review["state"] == "APPROVED" ] if not pr_approved_dt: return extracted_data extracted_data["ids"].append(pr_id) # PR created timestamp pr_created_dt = datetime.fromtimestamp(pr["created_at"]) extracted_data["created_dts"].append(pr_created_dt) # PR first review timestamp (no matter the contributor) pr_first_review_dt = datetime.fromtimestamp( [r for r in pr["reviews"].values()][0]["submitted_at"]) ttfr = (pr_first_review_dt - pr_created_dt).total_seconds() / 3600 extracted_data["TTFR"].append(ttfr) mttfr = np.median(extracted_data["TTFR"]) extracted_data["MTTFR"].append(mttfr) project_prs_size = pr["size"] extracted_data["PRs_size"].append(project_prs_size) extracted_data["encoded_PRs_size"].append( convert_score2num(label=project_prs_size)) # Take maximum to consider last approved if more than one contributor has to approve pr_approved_dt = max(pr_approved_dt) ttr = (pr_approved_dt - pr_created_dt).total_seconds() / 3600 extracted_data["TTR"].append(ttr) mttr = np.median(extracted_data["TTR"]) extracted_data["MTTR"].append(mttr) # PR reviews timestamps extracted_data["reviews_dts"] += [ r["submitted_at"] for r in pr["reviews"].values() ] return extracted_data
def _evaluate_reviewer_data(pr: Dict[str, Any], reviewer: str, review_submission_dt: datetime.timestamp, extracted_data: Dict[str, Any]): """Evaluate reviewer data from reviews.""" if not pr["reviews"]: return extracted_data dt_approved = [ datetime.fromtimestamp(review["submitted_at"]) for review in pr["reviews"].values() if review["state"] == "APPROVED" and review["author"] == reviewer ] if not dt_approved: return extracted_data # PR created timestamp pr_created_dt = datetime.fromtimestamp(pr["created_at"]) extracted_data["created_dts"].append(pr_created_dt) pr_first_review_dt = datetime.fromtimestamp(review_submission_dt[0]) ttfr = (pr_first_review_dt - pr_created_dt).total_seconds() / 3600 extracted_data[reviewer]["TTFR"] = ttfr mttfr = np.median(extracted_data[reviewer]["TTFR"]) extracted_data[reviewer]["MTTFR"].append(mttfr) project_prs_size = pr["size"] extracted_data[reviewer]["PRs_size"].append(project_prs_size) extracted_data[reviewer]["encoded_PRs_size"].append( convert_score2num(label=project_prs_size)) # Take maximum to consider last approved if more than one contributor has to approve pr_approved_dt = max(dt_approved) ttr = (pr_approved_dt - pr_created_dt).total_seconds() / 3600 extracted_data[reviewer]["TTR"].append(ttr) mttr = np.median(extracted_data[reviewer]["TTR"]) extracted_data[reviewer]["MTTR"].append(mttr)
def visualize_pr_data(self, project: str, result_path: Path, pr_data: Dict[str, Any]):
    """Visualize results from Pull Requests for a project.

    Fetches the pre-processed project data from
    ``self.processing.process_prs_project_data()``, filters each quantity
    through ``self.analyze_outliers`` and emits eight plots through
    ``self.create_per_pr_plot``: one for the median and three for the raw
    values of both TTFR and TTR.

    :param project: project name, used in plot titles and forwarded to the
        plotting helper.
    :param result_path: forwarded unchanged to the plotting helper.
    :param pr_data: not read by this method; kept so the signature stays
        compatible with existing callers.
    """
    projects_reviews_data = self.processing.process_prs_project_data()

    prs_ids = projects_reviews_data["ids"]
    prs_created_dts = projects_reviews_data["created_dts"]
    prs_lengths = projects_reviews_data["PRs_size"]

    def plot(rows, x_col, y_col, x_label, y_label, title, output_name):
        """Plot column ``x_col`` against column ``y_col`` of ``rows``."""
        self.create_per_pr_plot(
            result_path=result_path,
            project=project,
            x_array=[row[x_col] for row in rows],
            y_array=[row[y_col] for row in rows],
            x_label=x_label,
            y_label=y_label,
            title=title,
            output_name=output_name,
        )

    def plot_per_pr(quantity, values, y_label):
        """Emit the by-date, by-id and by-size plots for one quantity."""
        # Key order is kept as-is: rows are indexed positionally below
        # (0 = id, 1 = created date, 2 = value, 3 = size label).
        data = {"ids": prs_ids, "created_dts": prs_created_dts, quantity: values, "lengths": prs_lengths}
        rows = self.analyze_outliers(quantity=quantity, data=data)

        plot(
            rows, 1, 2,
            x_label="PR created date",
            y_label=y_label,
            title=f"{quantity} in Time per project: {project}",
            output_name=f"{quantity}-in-time",
        )
        plot(
            rows, 0, 2,
            x_label="PR id",
            y_label=y_label,
            title=f"{quantity} per PR id per project: {project}",
            output_name=f"{quantity}-per-PR",
        )

        # Order by encoded size so the size axis is ascending.
        rows_by_size = sorted(rows, key=lambda row: convert_score2num(row[3]))
        plot(
            rows_by_size, 3, 2,
            x_label="PR length",
            y_label=y_label,
            title=f"{quantity} in Time per PR length: {project}",
            output_name=f"{quantity}-per-PR-length",
        )

    ttfr = projects_reviews_data["TTFR"]
    mttfr = projects_reviews_data["MTTFR"]

    # MTTFR
    # NOTE(review): the x values come from "ids" although the axis label says
    # "PR created date" (the MTTR plot passes "created_dts") - confirm which
    # one is intended; left unchanged here.
    mttfr_rows = self.analyze_outliers(quantity="MTTFR", data={"ids": prs_ids, "MTTFR": mttfr})
    plot(
        mttfr_rows, 0, 1,
        x_label="PR created date",
        y_label="Median Time to First Review (h)",
        title=f"MTTFR in Time per project: {project}",
        output_name="MTTFR-in-time",
    )

    # TTFR
    plot_per_pr("TTFR", ttfr, "Time to First Review (h)")

    ttr = projects_reviews_data["TTR"]
    mttr = projects_reviews_data["MTTR"]

    # MTTR
    mttr_rows = self.analyze_outliers(quantity="MTTR", data={"created_dts": prs_created_dts, "MTTR": mttr})
    plot(
        mttr_rows, 0, 1,
        x_label="PR created date",
        # The plotted statistic is a median (like MTTFR), so the label says
        # "Median" instead of the former "Mean".
        y_label="Median Time to Review (h)",
        title=f"MTTR in Time per project: {project}",
        output_name="MTTR-in-time",
    )

    # TTR
    plot_per_pr("TTR", ttr, "Time to Review (h)")
def visualize_results(project: str):
    """Visualize results for a project.

    Loads the stored knowledge for ``project`` from
    ``./srcopsmetrics/bot_knowledge`` (relative to the current working
    directory), pre-processes it and emits eight plots into
    ``./srcopsmetrics/knowledge_statistics``: one for the median and three
    for the raw values of both TTFR and TTR. Nothing is plotted when no
    knowledge is retrieved.

    :param project: project name, used to look up the knowledge and in plot
        titles.
    """
    knowledge_path = Path.cwd().joinpath("./srcopsmetrics/bot_knowledge")
    result_path = Path.cwd().joinpath("./srcopsmetrics/knowledge_statistics")

    knowledge = retrieve_knowledge(knowledge_path=knowledge_path, project=project)
    if not knowledge:
        return

    projects_reviews_data = pre_process_project_data(data=knowledge)

    prs_ids = projects_reviews_data["ids"]
    prs_created_dts = projects_reviews_data["created_dts"]
    prs_lengths = projects_reviews_data["PRs_size"]

    def plot(rows, x_col, y_col, x_label, y_label, title, output_name):
        """Plot column ``x_col`` against column ``y_col`` of ``rows``."""
        create_per_pr_plot(
            result_path=result_path,
            project=project,
            x_array=[row[x_col] for row in rows],
            y_array=[row[y_col] for row in rows],
            x_label=x_label,
            y_label=y_label,
            title=title,
            output_name=output_name,
        )

    def plot_per_pr(quantity, values, y_label):
        """Emit the by-date, by-id and by-size plots for one quantity."""
        # Key order is kept as-is: rows are indexed positionally below
        # (0 = id, 1 = created date, 2 = value, 3 = size label).
        payload = {"ids": prs_ids, "created_dts": prs_created_dts, quantity: values, "lengths": prs_lengths}
        rows = analyze_outliers(quantity=quantity, data=payload)

        plot(
            rows, 1, 2,
            x_label="PR created date",
            y_label=y_label,
            title=f"{quantity} in Time per project: {project}",
            output_name=f"{quantity}-in-time",
        )
        plot(
            rows, 0, 2,
            x_label="PR id",
            y_label=y_label,
            title=f"{quantity} per PR id per project: {project}",
            output_name=f"{quantity}-per-PR",
        )

        # Order by encoded size so the size axis is ascending.
        rows_by_size = sorted(rows, key=lambda row: convert_score2num(row[3]))
        plot(
            rows_by_size, 3, 2,
            x_label="PR length",
            y_label=y_label,
            title=f"{quantity} in Time per PR length: {project}",
            output_name=f"{quantity}-per-PR-length",
        )

    ttfr = projects_reviews_data["TTFR"]
    mttfr = projects_reviews_data["MTTFR"]

    # MTTFR
    # NOTE(review): the x values come from "ids" although the axis label says
    # "PR created date" (the MTTR plot passes "created_dts") - confirm which
    # one is intended; left unchanged here.
    mttfr_rows = analyze_outliers(quantity="MTTFR", data={"ids": prs_ids, "MTTFR": mttfr})
    plot(
        mttfr_rows, 0, 1,
        x_label="PR created date",
        y_label="Median Time to First Review (h)",
        title=f"MTTFR in Time per project: {project}",
        output_name="MTTFR-in-time",
    )

    # TTFR
    plot_per_pr("TTFR", ttfr, "Time to First Review (h)")

    ttr = projects_reviews_data["TTR"]
    mttr = projects_reviews_data["MTTR"]

    # MTTR
    mttr_rows = analyze_outliers(quantity="MTTR", data={"created_dts": prs_created_dts, "MTTR": mttr})
    plot(
        mttr_rows, 0, 1,
        x_label="PR created date",
        # NOTE(review): labelled "Median" to match the MTTFR plot; this
        # presumes MTTR is computed as a median by the pre-processing step -
        # confirm against pre_process_project_data.
        y_label="Median Time to Review (h)",
        title=f"MTTR in Time per project: {project}",
        output_name="MTTR-in-time",
    )

    # TTR
    plot_per_pr("TTR", ttr, "Time to Review (h)")