def test_assert_melt():
    """Check assert_melt accepts matching metrics and rejects mismatched ones."""
    # Note, not all alternative dummy metrics are provided, since many require
    # the same melted dataframe
    mismatched_metrics = {
        "precision_recall": ["replicate_reproducibility"],
        "replicate_reproducibility": ["precision_recall", "grit"],
        "grit": ["replicate_reproducibility"],
    }

    for metric, dummy_metrics in mismatched_metrics.items():
        melted_df = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric=metric,
        )

        melted_df = assign_replicates(
            similarity_melted_df=melted_df, replicate_groups=replicate_groups
        )

        # The correctly matched metric must pass without raising
        assert_melt(melted_df, eval_metric=metric)

        # Each mismatched metric must trip the assertion with the expected message
        for dummy_metric in dummy_metrics:
            with pytest.raises(AssertionError) as ve:
                assert_melt(melted_df, eval_metric=dummy_metric)
            assert (
                "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
                in str(ve.value)
            )
Exemplo n.º 2
0
def grit(
    similarity_melted_df: pd.DataFrame,
    control_perts: List[str],
    profile_col: str,
    replicate_group_col: str,
    replicate_summary_method: str = "mean",
) -> pd.DataFrame:
    r"""Calculate grit

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        a long pandas dataframe output from cytominer_eval.transform.metric_melt
    control_perts : list
        a list of control perturbations to calculate a null distribution
    profile_col : str
        the metadata column storing profile ids. The column can have unique or replicate
        identifiers.
    replicate_group_col : str
        the metadata column indicating a higher order structure (group) than the
        profile column. E.g. target gene vs. guide in a CRISPR experiment.
    replicate_summary_method : {'mean', 'median'}, optional
        how replicate z-scores to control perts are summarized. Defaults to "mean".

    Returns
    -------
    pandas.DataFrame
        A dataframe of grit measurements per perturbation
    """
    # Reject unsupported summary methods before doing any work
    check_replicate_summary_method(replicate_summary_method)

    # Flag pairwise replicates using both the profile and the group columns
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[profile_col, replicate_group_col],
    )

    # Grit requires the full (non-triangular) melted dataframe
    assert_melt(similarity_melted_df, eval_metric="grit")

    # Build the profile column name for the first member of each pair
    pair_ids = set_pair_ids()
    first_suffix = pair_ids[list(pair_ids)[0]]["suffix"]
    profile_col_name = "{x}{suf}".format(x=profile_col, suf=first_suffix)

    # Columns consumed inside the per-perturbation grit calculation
    column_id_info = set_grit_column_info(
        profile_col=profile_col, replicate_group_col=replicate_group_col
    )

    def _grit_for_group(group_df):
        # Compute grit for a single perturbation's pairwise similarities
        return calculate_grit(
            replicate_group_df=group_df,
            control_perts=control_perts,
            column_id_info=column_id_info,
            replicate_summary_method=replicate_summary_method,
        )

    # Apply the grit calculation independently to each perturbation
    grit_df = (
        similarity_melted_df.groupby(profile_col_name)
        .apply(_grit_for_group)
        .reset_index(drop=True)
    )

    return grit_df
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    k: Union[int, List[int]],
) -> pd.DataFrame:
    """Determine the precision and recall at k for all unique replicate groups
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as
        replicate columns.
    k : List of ints or int
        an integer indicating how many pairwise comparisons to threshold.

    Returns
    -------
    pandas.DataFrame
        precision and recall metrics for all replicate groups given k
    """
    # Determine pairwise replicates and make sure to sort based on the metric!
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    ).sort_values(by="similarity_metric", ascending=False)

    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="precision_recall")

    # Extract the suffixed replicate-group column names (first member of each pair)
    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    # Normalize a single int so int and list inputs are handled uniformly
    if isinstance(k, int):
        k = [k]

    # Calculate precision and recall for all groups at each k.
    # Bind k_ as a lambda default to make the closure explicit.
    per_k_results = [
        similarity_melted_df.groupby(replicate_group_cols).apply(
            lambda x, k_=k_: calculate_precision_recall(x, k=k_)
        )
        for k_ in k
    ]

    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; guard the empty case to mirror old behavior
    precision_recall_df = (
        pd.concat(per_k_results) if per_k_results else pd.DataFrame()
    )

    # Rename the columns back to the replicate groups provided
    rename_cols = dict(zip(replicate_group_cols, replicate_groups))

    return precision_recall_df.reset_index().rename(rename_cols, axis="columns")
Exemplo n.º 4
0
def test_assign_replicates():
    """Verify replicate columns are created and pairwise replicate counts match."""
    # Two replicate-group configurations with their known pairwise counts
    cases = [
        (["Metadata_broad_sample", "Metadata_mg_per_ml"], [1248, 408, 408]),
        (
            [
                "Metadata_broad_sample",
                "Metadata_mg_per_ml",
                "Metadata_plate_map_name",
            ],
            [1248, 408, 73536, 408],
        ),
    ]

    for replicate_groups, expected_replicates in cases:
        result = assign_replicates(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
        )

        expected_cols = [
            "{x}_replicate".format(x=x) for x in replicate_groups
        ] + ["group_replicate"]

        # Other functions expect columns to exist
        assert all([x in result.columns.tolist() for x in expected_cols])

        # Given the example data, we expect a certain number of pairwise replicates
        observed_replicates = list(result.loc[:, expected_cols].sum().values)
        assert observed_replicates == expected_replicates

    # This function will fail if a replicate column is given that doesn't belong
    with pytest.raises(AssertionError) as ve:
        result = assign_replicates(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=["MISSING_COLUMN"],
        )
    assert "replicate_group not found in melted dataframe columns" in str(
        ve.value)
Exemplo n.º 5
0
def test_calculate_precision_recall():
    """Check precision/recall values for one example group, plus sorting invariants."""
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    ).sort_values(by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        name=("BRD-A38592941-001-02-7"))

    # number of pairwise comparisons per dose
    assert example_group.shape[0] == 383 * 6

    # Assert that the similarity metrics are sorted (descending: no positive diffs)
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(calculate_precision_recall(example_group, k=10),
                          columns=["result"])

    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    # `check_less_precise` was deprecated in pandas 1.1 and removed in 2.0;
    # an absolute tolerance reproduces the same ~3-decimal comparison
    assert_frame_equal(result, expected_result, check_exact=False, atol=1e-3)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )

    assert result.loc["recall", "result"] == 1
Exemplo n.º 6
0
def enrichment(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    percentile: Union[float, List[float]],
) -> pd.DataFrame:
    """Calculate the enrichment score. This score is based on the fisher exact odds score.
    Similar to the other functions, the closest connections are determined and checked with the replicates.
    This score effectively calculates how much better the distribution of correct connections is compared to random.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as
        replicate columns.
    percentile : float or List of floats
        Determines what percentage of top connections used for the enrichment calculation.

    Returns
    -------
    pandas.DataFrame
        one row per percentile with columns: enrichment_percentile, threshold,
        ods_ratio and p-value
    """
    result = []
    replicate_truth_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )

    # Accept a bare scalar percentile; isinstance (unlike `type(...) == float`)
    # also covers ints and float subclasses such as numpy floats
    if isinstance(percentile, (float, int)):
        percentile = [percentile]

    # loop over all percentiles
    for p in percentile:
        # threshold based on percentile of top connections
        # (assign_replicates only adds columns, so the similarity values are
        # the same in similarity_melted_df and replicate_truth_df)
        threshold = similarity_melted_df.similarity_metric.quantile(p)

        # Build the 2x2 contingency table: replicate status vs. above/below
        # threshold. Boolean masks partition the rows the same way the
        # previous four `query` calls did (> vs <= is an exact partition).
        is_replicate = replicate_truth_df.group_replicate
        above_threshold = replicate_truth_df.similarity_metric > threshold

        v11 = int((is_replicate & above_threshold).sum())
        v12 = int((~is_replicate & above_threshold).sum())
        v21 = int((is_replicate & ~above_threshold).sum())
        v22 = int((~is_replicate & ~above_threshold).sum())

        v = np.asarray([[v11, v12], [v21, v22]])
        odds_ratio, p_value = scipy.stats.fisher_exact(v, alternative="greater")
        result.append(
            {
                "enrichment_percentile": p,
                "threshold": threshold,
                # NOTE: "ods_ratio" (sic) is kept as-is; it is part of the
                # output schema that downstream consumers may depend on
                "ods_ratio": odds_ratio,
                "p-value": p_value,
            }
        )
    result_df = pd.DataFrame(result)
    return result_df
Exemplo n.º 7
0
def replicate_reproducibility(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    quantile_over_null: float = 0.95,
    return_median_correlations: bool = False,
) -> float:
    r"""Summarize pairwise replicate correlations

    For a given pairwise similarity matrix, replicate information, and specific options,
    output a replicate correlation summary.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : list
        A list of metadata column names in the original profile dataframe to indicate
        replicate samples.
    quantile_over_null : float, optional
        A float between 0 and 1 indicating the threshold of nonreplicates to use when
        reporting percent matching or percent replicating. Defaults to 0.95.
    return_median_correlations : bool, optional
        If provided, also return median pairwise correlations per replicate.
        Defaults to False.

    Returns
    -------
    {float, (float, pd.DataFrame)}
        The replicate reproducibility of the profiles according to the replicate
        columns provided. If `return_median_correlations = True` then the function will
        return both the metric and a median pairwise correlation pandas.DataFrame.
    """
    # Chained comparison is equivalent to `0 < q and 1 >= q`
    assert (
        0 < quantile_over_null <= 1
    ), "quantile_over_null must be between 0 and 1"

    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    )

    # The melted dataframe must be in upper-triangle form for this metric
    assert_melt(similarity_melted_df, eval_metric="replicate_reproducibility")

    # Replicate rows are those flagged True in group_replicate
    replicate_df = similarity_melted_df.query("group_replicate")
    denom = replicate_df.shape[0]

    assert denom != 0, "no replicate groups identified in {rep} columns!".format(
        rep=replicate_groups)

    # Null threshold: the requested quantile of non-replicate similarities
    non_replicate_df = similarity_melted_df.query("not group_replicate")
    non_replicate_quantile = non_replicate_df.similarity_metric.quantile(
        quantile_over_null
    )

    # Fraction of replicate pairs whose similarity exceeds the null threshold
    num_above_null = (replicate_df.similarity_metric > non_replicate_quantile).sum()
    replicate_reproducibility = num_above_null / denom

    if not return_median_correlations:
        return replicate_reproducibility

    # Map suffixed pair-a column names back to the caller-provided names
    pair_ids = set_pair_ids()
    col_rename_map = {
        "{col}{suf}".format(col=x, suf=pair_ids["pair_a"]["suffix"]): x
        for x in replicate_groups
    }

    median_cor_df = (
        replicate_df.groupby(list(col_rename_map))["similarity_metric"]
        .median()
        .reset_index()
        .rename(col_rename_map, axis="columns")
    )

    return (replicate_reproducibility, median_cor_df)
Exemplo n.º 8
0
def test_calculate_grit():
    """Check grit values for example perturbations and the function's guard rails."""
    result = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[profile_col, replicate_group_col],
    )

    assert_melt(result, eval_metric="grit")

    def run_grit(pert_name):
        # Compute grit for a single perturbation and wrap it for comparison
        group = result.groupby(replicate_col_name).get_group(name=(pert_name))
        return pd.DataFrame(
            calculate_grit(
                group,
                control_perts=control_perts,
                column_id_info=column_id_info,
            ),
            columns=["result"],
        )

    # Perturbation with same-group partners yields a finite grit score
    expected_result = pd.DataFrame(
        {"perturbation": "MTOR-2", "group": "MTOR", "grit": 1.55075},
        index=["result"],
    ).transpose()
    assert_frame_equal(run_grit("MTOR-2"), expected_result)

    # Calculate grit will not work with singleton perturbations
    # (no other perts in same group) - it yields NaN instead
    expected_result = pd.DataFrame(
        {"perturbation": "AURKB-2", "group": "AURKB", "grit": np.nan},
        index=["result"],
    ).transpose()
    assert_frame_equal(run_grit("AURKB-2"), expected_result)

    # Calculate grit will not work with the full dataframe
    with pytest.raises(AssertionError) as ae:
        calculate_grit(
            similarity_melted_df,
            control_perts=control_perts,
            column_id_info=column_id_info,
        )
    assert "grit is calculated for each perturbation independently" in str(
        ae.value)

    # Calculate grit will not work when control barcodes are missing
    example_group = result.groupby(replicate_col_name).get_group(
        name=("AURKB-2"))
    with pytest.raises(AssertionError) as ae:
        calculate_grit(
            example_group,
            control_perts=["DOES NOT EXIST", "THIS ONE NEITHER"],
            column_id_info=column_id_info,
        )
    assert "Error! No control perturbations found." in str(ae.value)
Exemplo n.º 9
0
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    groupby_columns: List[str],
    k: Union[int, List[int], str],
) -> pd.DataFrame:
    """Determine the precision and recall at k for all unique groupby_columns samples
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as replicate columns.
    groupby_columns : List of str
        Column by which the similarity matrix is grouped and by which the precision/recall is calculated.
        For example, if groupby_column = Metadata_sample then the precision is calculated for each sample.
        Calculating the precision by sample is the default
        but it is mathematically not incorrect to calculate the precision at the MOA level.
        This is just less intuitive to understand.
    k : List of ints or int
        an integer indicating how many pairwise comparisons to threshold.
        if k = 'R' then precision at R will be calculated where R is the number of other replicates

    Returns
    -------
    pandas.DataFrame
        precision and recall metrics for all groupby_column groups given k
    """
    # Check for correct k input. The previous `assert Union[int, List[int], str]`
    # was a no-op: a typing construct is always truthy, so nothing was validated.
    assert isinstance(k, (int, list, str)), "k must be an int, a list of ints, or 'R'"

    # Determine pairwise replicates and make sure to sort based on the metric!
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    ).sort_values(by="similarity_metric", ascending=False)

    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="precision_recall")

    # Extract the suffixed groupby column names (first member of each pair)
    pair_ids = set_pair_ids()
    groupby_cols_suffix = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in groupby_columns
    ]

    # Normalize a single int to a list; the string 'R' is passed through
    # unchanged so each group is scored at its own replicate count downstream
    if isinstance(k, int):
        k = [k]

    # Calculate precision and recall for all groups at each k.
    # Bind k_ as a lambda default to make the closure explicit.
    per_k_results = [
        similarity_melted_df.groupby(groupby_cols_suffix).apply(
            lambda x, k_=k_: calculate_precision_recall(x, k=k_)
        )
        for k_ in k
    ]

    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; guard the empty case to mirror old behavior
    precision_recall_df = (
        pd.concat(per_k_results) if per_k_results else pd.DataFrame()
    )

    # Rename the columns back to the groupby columns provided
    rename_cols = dict(zip(groupby_cols_suffix, groupby_columns))

    return precision_recall_df.reset_index().rename(rename_cols, axis="columns")
Exemplo n.º 10
0
def hitk(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    groupby_columns: List[str],
    percent_list: Union[int, List[int]],
) -> tuple:
    """Calculate the hit@k hits list and percent scores.
    This function groups the similarity matrix by each sample (group_col) and by similarity score. It then determines the rank of each correct hit.
    A correct hit is a connection to another sample with the same replicate attributes (replicate_groups), for example the same MOA.

    Hit@k records all hits/indexes in a long list which can be used to create histogram plots or similar visualizations.

    The percent scores contain the number of hits above the expected random distribution at a given percentage.
    For example, at 5 percent we calculate how many hits are within the first 5 percent of classes (number of neighbors) and then subtract the expected number of hits.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.

    replicate_groups : list or int
        a list of metadata column names in the original profile dataframe to use as replicate columns.

    groupby_columns: str
        group columns determine the columns over which the similarity_melted_df is grouped.
        Usually groupby_columns will span the full space of the input data
        such that drop_duplicates by the groupby_cols would not change the data.
        If you group over Metadata_plate for examples, you will get meaningless results.
        This can easily be seen from the fact that the percent score at 100 will be nonzero.

    percent_list : list or "all"
        A list of percentages at which to calculate the percent scores, ie the amount of hits below this percentage.
        If percent_list == "all" a full dict with the length of classes will be created.
        Percentages are given as integers, ie 50 means 50 %.

    Returns
    -------
    hits_list : list
        full list of all hits. Can be used for histogram plotting.
    percent_scores: dict
        dictionary of the percentage list and their corresponding percent scores (see percent score function).
    """
    # make sure percent_list is a list; isinstance is the idiomatic type check
    if isinstance(percent_list, int):
        percent_list = [percent_list]
    # check for correct input
    assert isinstance(percent_list, list) or percent_list == "all", "input is incorrect"
    if isinstance(percent_list, list):
        assert max(percent_list) <= 100, "percentages must be smaller than 100"

    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )
    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="hitk")

    # Extract the name of the columns in the sim_df
    pair_ids = set_pair_ids()
    groupby_cols_suffix = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in groupby_columns
    ]

    # group the sim_df by the groupby_columns
    grouped = similarity_melted_df.groupby(groupby_cols_suffix)
    nr_of_groups = grouped.ngroups
    # Within each group, add the ranks of each connection to a new column
    similarity_melted_with_rank = grouped.apply(lambda x: add_hit_rank(x))

    # Ranks of correct connections (hits), i.e. rows where group_replicate is
    # True; indexing by the boolean mask directly avoids the `== True` idiom
    hits_list = similarity_melted_with_rank.loc[
        similarity_melted_with_rank["group_replicate"], "rank"
    ].tolist()

    # calculate the scores at each percentage
    percent_scores = percentage_scores(hits_list, percent_list, nr_of_groups)

    return hits_list, percent_scores