예제 #1
0
    def process_contributors_data(self, contributors: List[str]):
        """Build per-reviewer review statistics for a project repository.

        Walks ``self.pull_requests`` in ascending numeric id order, letting
        ``_analyze_pr_for_contributor_data`` and
        ``_analyze_contributors_interaction`` fill the collected data, and then
        derives summary figures for every name listed under ``"reviewers"``.

        :param contributors: contributor names used to seed the interaction
            table (every contributor against every contributor, all zero).
        :return: the collected dictionary; besides ``"reviewers"`` and
            ``"created_dts"`` it holds one entry per reviewer, extended here
            with ``number_reviews``, ``median_review_length``,
            ``last_review_time``, ``median_pr_length``,
            ``median_pr_length_score`` and ``interactions``.
        :raises ValueError: if a listed reviewer has no review at all
            (``max`` of an empty sequence).
        """
        ordered_pr_ids = sorted(int(key) for key in self.pull_requests.keys())

        contributors_reviews_data: Dict[str, Any] = {"reviewers": [], "created_dts": []}

        # Each contributor gets its own zeroed counter towards every contributor.
        interactions = {name: dict.fromkeys(contributors, 0) for name in contributors}

        for pr_id in ordered_pr_ids:
            # Pull Requests are stored under stringified ids.
            pr = self.pull_requests[str(pr_id)]

            self._analyze_pr_for_contributor_data(pr_id=pr_id, pr=pr, extracted_data=contributors_reviews_data)
            self._analyze_contributors_interaction(
                pr_interactions=pr["interactions"], pr_author=pr["created_by"], interactions_data=interactions
            )

        for reviewer in contributors_reviews_data["reviewers"]:
            reviewer_data = contributors_reviews_data[reviewer]

            total_reviews = 0
            words_per_group = []
            submission_times = []

            # Every value of "reviews" is a collection of review records; the
            # words of one collection are summed into a single length figure.
            for review_group in reviewer_data["reviews"].values():
                total_reviews += len(review_group)
                words_per_group.append(sum(entry["words_count"] for entry in review_group))
                submission_times.extend(entry["submitted_at"] for entry in review_group)

            latest_submission = max(submission_times)

            reviewer_data["number_reviews"] = total_reviews
            reviewer_data["median_review_length"] = np.median(words_per_group)
            reviewer_data["last_review_time"] = latest_submission

            # Size labels are turned into numbers so that a median can be taken,
            # then the median is mapped back to a label and a score.
            encoded_sizes = [convert_score2num(label=size_label) for size_label in reviewer_data["PRs_size"]]
            median_size_label, median_size_score = convert_num2label(score=np.median(encoded_sizes))

            reviewer_data["median_pr_length"] = median_size_label
            reviewer_data["median_pr_length_score"] = median_size_score
            reviewer_data["interactions"] = interactions[reviewer]

        return contributors_reviews_data
예제 #2
0
파일: processing.py 프로젝트: harshad16/mi
    def _analyze_pr_for_project_data(pr_id: int, pr: Dict[str, Any],
                                     extracted_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract project-level review metrics from one Pull Request.

        Pull Requests without reviews, or without any ``"APPROVED"`` review,
        are skipped and leave ``extracted_data`` untouched.

        :param pr_id: identifier recorded under ``extracted_data["ids"]``.
        :param pr: Pull Request record; ``"reviews"``, ``"created_at"`` and
            ``"size"`` are read. Each review provides ``"submitted_at"`` and
            ``"state"``.
        :param extracted_data: accumulator whose lists ``ids``,
            ``created_dts``, ``TTFR``, ``MTTFR``, ``PRs_size``,
            ``encoded_PRs_size``, ``TTR``, ``MTTR`` and ``reviews_dts`` are
            appended to in place.
        :return: the same ``extracted_data`` object.
        """
        reviews = pr["reviews"]
        if not reviews:
            return extracted_data

        # Consider all approved reviews
        approval_dts = [
            datetime.fromtimestamp(review["submitted_at"])
            for review in reviews.values()
            if review["state"] == "APPROVED"
        ]
        if not approval_dts:
            return extracted_data

        extracted_data["ids"].append(pr_id)

        # PR created timestamp
        pr_created_dt = datetime.fromtimestamp(pr["created_at"])
        extracted_data["created_dts"].append(pr_created_dt)

        review_timestamps = [review["submitted_at"] for review in reviews.values()]

        # PR first review timestamp (no matter the contributor). The earliest
        # submission is used rather than the first stored entry, so the result
        # does not depend on the order in which reviews were inserted.
        pr_first_review_dt = datetime.fromtimestamp(min(review_timestamps))

        # Time to first review, in hours.
        ttfr = (pr_first_review_dt - pr_created_dt).total_seconds() / 3600
        extracted_data["TTFR"].append(ttfr)

        # Running median over every TTFR collected so far.
        extracted_data["MTTFR"].append(np.median(extracted_data["TTFR"]))

        project_prs_size = pr["size"]
        extracted_data["PRs_size"].append(project_prs_size)
        extracted_data["encoded_PRs_size"].append(
            convert_score2num(label=project_prs_size))

        # Take maximum to consider last approved if more than one contributor has to approve
        pr_approved_dt = max(approval_dts)

        # Time to review (creation -> last approval), in hours.
        ttr = (pr_approved_dt - pr_created_dt).total_seconds() / 3600
        extracted_data["TTR"].append(ttr)

        # Running median over every TTR collected so far.
        extracted_data["MTTR"].append(np.median(extracted_data["TTR"]))

        # PR reviews timestamps (raw values, not converted to datetime)
        extracted_data["reviews_dts"] += review_timestamps

        return extracted_data
예제 #3
0
파일: processing.py 프로젝트: harshad16/mi
    def _evaluate_reviewer_data(pr: Dict[str, Any], reviewer: str,
                                review_submission_dt: datetime.timestamp,
                                extracted_data: Dict[str, Any]):
        """Evaluate review metrics of one reviewer for one Pull Request.

        Pull Requests without reviews, or without an ``"APPROVED"`` review
        authored by ``reviewer``, are skipped and leave ``extracted_data``
        untouched.

        :param pr: Pull Request record; ``"reviews"``, ``"created_at"`` and
            ``"size"`` are read. Each review provides ``"submitted_at"``,
            ``"state"`` and ``"author"``.
        :param reviewer: key of the reviewer entry in ``extracted_data``.
        :param review_submission_dt: indexable collection of timestamps; only
            element ``0`` is used, as the first review time.
            NOTE(review): the annotation does not describe an indexable type;
            confirm the intended type against the caller.
        :param extracted_data: accumulator; ``"created_dts"`` and the
            reviewer's ``TTFR``, ``MTTFR``, ``PRs_size``, ``encoded_PRs_size``,
            ``TTR`` and ``MTTR`` lists are appended to in place.
        :return: the same ``extracted_data`` object.
        """
        if not pr["reviews"]:
            return extracted_data

        dt_approved = [
            datetime.fromtimestamp(review["submitted_at"])
            for review in pr["reviews"].values()
            if review["state"] == "APPROVED" and review["author"] == reviewer
        ]

        if not dt_approved:
            return extracted_data

        # PR created timestamp
        pr_created_dt = datetime.fromtimestamp(pr["created_at"])
        extracted_data["created_dts"].append(pr_created_dt)

        reviewer_data = extracted_data[reviewer]

        # Time to first review, in hours.
        pr_first_review_dt = datetime.fromtimestamp(review_submission_dt[0])
        ttfr = (pr_first_review_dt - pr_created_dt).total_seconds() / 3600

        # Accumulate TTFR values instead of overwriting them with the latest
        # one; otherwise the median below is just the current value. setdefault
        # keeps this working when the entry was created without a "TTFR" list.
        reviewer_data.setdefault("TTFR", []).append(ttfr)

        # Running median over every TTFR collected so far for this reviewer.
        reviewer_data["MTTFR"].append(np.median(reviewer_data["TTFR"]))

        project_prs_size = pr["size"]
        reviewer_data["PRs_size"].append(project_prs_size)
        reviewer_data["encoded_PRs_size"].append(
            convert_score2num(label=project_prs_size))

        # Take maximum to consider last approved if more than one contributor has to approve
        pr_approved_dt = max(dt_approved)

        # Time to review (creation -> last approval by this reviewer), in hours.
        ttr = (pr_approved_dt - pr_created_dt).total_seconds() / 3600
        reviewer_data["TTR"].append(ttr)

        # Running median over every TTR collected so far for this reviewer.
        reviewer_data["MTTR"].append(np.median(reviewer_data["TTR"]))

        # Return the accumulator on the success path too, like the early exits.
        return extracted_data
예제 #4
0
    def visualize_pr_data(self, project: str, result_path: Path,
                          pr_data: Dict[str, Any]):
        """Visualize review-time metrics of the Pull Requests of a project.

        The data comes from ``self.processing.process_prs_project_data()``.
        Every series is passed through ``self.analyze_outliers`` and drawn
        with ``self.create_per_pr_plot``. Eight plots are produced: MTTFR and
        MTTR over time, plus TTFR and TTR over time, per PR id and per PR
        length.

        :param project: project name, used in plot titles and forwarded to the
            plotting routine.
        :param result_path: forwarded to the plotting routine.
        :param pr_data: not read by this method; kept so existing callers
            keep working.
        """
        projects_reviews_data = self.processing.process_prs_project_data()
        prs_ids = projects_reviews_data["ids"]
        prs_created_dts = projects_reviews_data["created_dts"]
        prs_lengths = projects_reviews_data["PRs_size"]

        def plot(rows, x_col, y_col, x_label, y_label, title, output_name):
            """Draw column ``x_col`` of ``rows`` against column ``y_col``."""
            self.create_per_pr_plot(
                result_path=result_path,
                project=project,
                x_array=[row[x_col] for row in rows],
                y_array=[row[y_col] for row in rows],
                x_label=x_label,
                y_label=y_label,
                title=title,
                output_name=output_name,
            )

        def plot_median(quantity, x_key, x_values, values, y_label):
            """Plot a running-median series against its single x column."""
            rows = self.analyze_outliers(
                quantity=quantity,
                data={x_key: x_values, quantity: values},
            )
            plot(rows, 0, 1, "PR created date", y_label,
                 f"{quantity} in Time per project: {project}",
                 f"{quantity}-in-time")

        def plot_per_pr(quantity, values, y_label):
            """Plot a per-PR series over time, per PR id and per PR length."""
            # Key order fixes the column positions used below:
            # 0 = id, 1 = created date, 2 = quantity, 3 = size label.
            rows = self.analyze_outliers(
                quantity=quantity,
                data={
                    "ids": prs_ids,
                    "created_dts": prs_created_dts,
                    quantity: values,
                    "lengths": prs_lengths,
                },
            )
            plot(rows, 1, 2, "PR created date", y_label,
                 f"{quantity} in Time per project: {project}",
                 f"{quantity}-in-time")
            plot(rows, 0, 2, "PR id", y_label,
                 f"{quantity} per PR id per project: {project}",
                 f"{quantity}-per-PR")

            # Order rows by the numeric encoding of their size label.
            rows_by_size = sorted(rows, key=lambda row: convert_score2num(row[3]))
            plot(rows_by_size, 3, 2, "PR length", y_label,
                 f"{quantity} in Time per PR length: {project}",
                 f"{quantity}-per-PR-length")

        ttfr = projects_reviews_data["TTFR"]
        mttfr = projects_reviews_data["MTTFR"]

        # MTTFR
        # NOTE(review): the x values here are PR ids while the axis label says
        # "PR created date" (the MTTR plot uses created dates) - confirm which
        # of the two is intended.
        plot_median("MTTFR", "ids", prs_ids, mttfr,
                    "Median Time to First Review (h)")

        # TTFR
        plot_per_pr("TTFR", ttfr, "Time to First Review (h)")

        ttr = projects_reviews_data["TTR"]
        mttr = projects_reviews_data["MTTR"]

        # MTTR (labelled as a median, in line with the MTTFR plot)
        plot_median("MTTR", "created_dts", prs_created_dts, mttr,
                    "Median Time to Review (h)")

        # TTR
        plot_per_pr("TTR", ttr, "Time to Review (h)")
예제 #5
0
def visualize_results(project: str):
    """Visualize review-time metrics for a project.

    Knowledge is loaded with ``retrieve_knowledge`` from
    ``srcopsmetrics/bot_knowledge`` under the current working directory, and
    plots are written via ``create_per_pr_plot`` with
    ``srcopsmetrics/knowledge_statistics`` as result path. Nothing is plotted
    when no knowledge is retrieved.

    Eight plots are produced: MTTFR and MTTR over time, plus TTFR and TTR over
    time, per PR id and per PR length.

    :param project: project name, used to look up the knowledge and in plot
        titles.
    """
    knowledge_path = Path.cwd().joinpath("./srcopsmetrics/bot_knowledge")
    result_path = Path.cwd().joinpath("./srcopsmetrics/knowledge_statistics")

    knowledge = retrieve_knowledge(knowledge_path=knowledge_path, project=project)
    if not knowledge:
        return

    projects_reviews_data = pre_process_project_data(data=knowledge)
    prs_ids = projects_reviews_data["ids"]
    prs_created_dts = projects_reviews_data["created_dts"]
    prs_lengths = projects_reviews_data["PRs_size"]

    def plot(rows, x_col, y_col, x_label, y_label, title, output_name):
        """Draw column ``x_col`` of ``rows`` against column ``y_col``."""
        create_per_pr_plot(
            result_path=result_path,
            project=project,
            x_array=[row[x_col] for row in rows],
            y_array=[row[y_col] for row in rows],
            x_label=x_label,
            y_label=y_label,
            title=title,
            output_name=output_name,
        )

    def plot_median(quantity, x_key, x_values, values, y_label):
        """Plot a running-median series against its single x column."""
        rows = analyze_outliers(
            quantity=quantity,
            data={x_key: x_values, quantity: values},
        )
        plot(rows, 0, 1, "PR created date", y_label,
             f"{quantity} in Time per project: {project}",
             f"{quantity}-in-time")

    def plot_per_pr(quantity, values, y_label):
        """Plot a per-PR series over time, per PR id and per PR length."""
        # Key order fixes the column positions used below:
        # 0 = id, 1 = created date, 2 = quantity, 3 = size label.
        rows = analyze_outliers(
            quantity=quantity,
            data={
                "ids": prs_ids,
                "created_dts": prs_created_dts,
                quantity: values,
                "lengths": prs_lengths,
            },
        )
        plot(rows, 1, 2, "PR created date", y_label,
             f"{quantity} in Time per project: {project}",
             f"{quantity}-in-time")
        plot(rows, 0, 2, "PR id", y_label,
             f"{quantity} per PR id per project: {project}",
             f"{quantity}-per-PR")

        # Order rows by the numeric encoding of their size label.
        rows_by_size = sorted(rows, key=lambda row: convert_score2num(row[3]))
        plot(rows_by_size, 3, 2, "PR length", y_label,
             f"{quantity} in Time per PR length: {project}",
             f"{quantity}-per-PR-length")

    ttfr = projects_reviews_data["TTFR"]
    mttfr = projects_reviews_data["MTTFR"]

    # MTTFR
    # NOTE(review): the x values here are PR ids while the axis label says
    # "PR created date" (the MTTR plot uses created dates) - confirm which of
    # the two is intended.
    plot_median("MTTFR", "ids", prs_ids, mttfr,
                "Median Time to First Review (h)")

    # TTFR
    plot_per_pr("TTFR", ttfr, "Time to First Review (h)")

    ttr = projects_reviews_data["TTR"]
    mttr = projects_reviews_data["MTTR"]

    # MTTR (labelled as a median, in line with the MTTFR plot)
    plot_median("MTTR", "created_dts", prs_created_dts, mttr,
                "Median Time to Review (h)")

    # TTR
    plot_per_pr("TTR", ttr, "Time to Review (h)")