def test_grit():
    result = grit(
        similarity_melted_df=similarity_melted_df,
        control_perts=control_perts,
        profile_col=profile_col,
        replicate_group_col=replicate_group_col,
    ).sort_values(by="grit")

    assert all(x in result.columns for x in ["perturbation", "group", "grit"])

    top_result = pd.DataFrame(
        result.sort_values(by="grit", ascending=False)
        .reset_index(drop=True)
        .iloc[0, :]
    )

    expected_result = {
        "perturbation": "PTK2-2",
        "group": "PTK2",
        "grit": 4.61094
    }
    expected_result = pd.DataFrame(expected_result, index=[0]).transpose()

    assert_frame_equal(top_result, expected_result)

    # There are six singletons in this dataset
    assert result.grit.isna().sum() == 6

    # No perturbations should be duplicated
    assert result.perturbation.duplicated().sum() == 0

    # With this data, we do not expect the sum of grit to change
    assert np.round(result.grit.sum(), 0) == 152.0
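
# Hedged sketch of how the fixtures used above might be built. The input `df`
# and the column names are illustrative assumptions; the metric_melt call
# mirrors the signature used in the evaluate() examples below, and the import
# path is assumed from the library layout.
from cytominer_eval.transform import metric_melt

control_perts = ["Luc-1", "Luc-2"]  # assumed control perturbation identifiers
profile_col = "Metadata_pert_name"  # assumed per-profile identifier column
replicate_group_col = "Metadata_gene_name"  # assumed higher-order grouping column

similarity_melted_df = metric_melt(
    df=df,  # assumed: profiles as rows, metadata + feature columns
    features=features,  # assumed: list of feature column names
    metadata_features=meta_features,  # assumed: list of metadata column names
    similarity_metric="pearson",
    eval_metric="grit",
)
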
def test_grit_summary_metric():
    result = grit(
        similarity_melted_df=similarity_melted_df,
        control_perts=control_perts,
        profile_col=profile_col,
        replicate_group_col=replicate_group_col,
        replicate_summary_method="median",
    ).sort_values(by="grit")

    assert all(x in result.columns for x in ["perturbation", "group", "grit"])

    top_result = pd.DataFrame(
        result.sort_values(by="grit", ascending=False)
        .reset_index(drop=True)
        .iloc[0, :]
    )

    expected_result = {
        "perturbation": "PTK2-2",
        "group": "PTK2",
        "grit": 4.715917
    }
    expected_result = pd.DataFrame(expected_result, index=[0]).transpose()

    assert_frame_equal(top_result, expected_result)

    with pytest.raises(ValueError) as ve:
        grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=control_perts,
            profile_col=profile_col,
            replicate_group_col=replicate_group_col,
            replicate_summary_method="fail",
        )
    assert "fail method not supported. Select one of:" in str(ve.value)
Example #3
def test_compare_distributions():
    # Define two distributions using a specific compound as an example
    compound = "BRD-K07857022-002-01-1"
    profile_id = "Metadata_profile_378"

    target_group = similarity_melted_full_df.query(
        "Metadata_profile_id_pair_a == @profile_id")

    replicate_group_values = target_group.query(
        "Metadata_broad_sample_pair_b == @compound"
    ).similarity_metric.values.reshape(-1, 1)

    control_group_values = target_group.query(
        "Metadata_broad_sample_pair_b == 'DMSO'"
    ).similarity_metric.values.reshape(-1, 1)

    control_perts = df.query(
        "Metadata_broad_sample == 'DMSO'").Metadata_profile_id.tolist()

    hardcoded_values_should_not_change = {
        "zscore": {
            "mean": 5.639379456018854,
            "median": 5.648269672347573
        }
    }
    for summary_method in get_available_summary_methods():
        hardcoded = hardcoded_values_should_not_change["zscore"][summary_method]

        result = compare_distributions(
            target_distrib=replicate_group_values,
            control_distrib=control_group_values,
            method="zscore",
            replicate_summary_method=summary_method,
        )
        assert np.round(result, 5) == np.round(hardcoded, 5)

        grit_result = (
            grit(
                similarity_melted_full_df,
                control_perts=control_perts,
                profile_col="Metadata_profile_id",
                replicate_group_col="Metadata_broad_sample",
                replicate_summary_method=summary_method,
            )
            .query("perturbation == @profile_id")
            .grit.values[0]
        )

        assert result == grit_result
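
# Hedged sketch of the z-score comparison assumed to underlie both
# compare_distributions(method="zscore") and grit in the test above: each
# replicate similarity is standardized against the control similarity
# distribution, then the per-replicate scores are summarized. The function
# name and the ddof choice are assumptions, not the library implementation.
import numpy as np


def zscore_compare(target_distrib, control_distrib, replicate_summary_method="mean"):
    mu = np.mean(control_distrib)
    sigma = np.std(control_distrib)
    scores = (np.asarray(target_distrib) - mu) / sigma
    if replicate_summary_method == "median":
        return float(np.median(scores))
    return float(np.mean(scores))
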
Example #4
def evaluate(
    profiles: pd.DataFrame,
    features: List[str],
    meta_features: List[str],
    replicate_groups: Union[List[str], dict],
    operation: str = "percent_strong",
    similarity_metric: str = "pearson",
    percent_strong_quantile: float = 0.95,
    precision_recall_k: int = 10,
    grit_control_perts: List[str] = ["None"],
):
    # Check replicate groups input
    check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)

    # Melt the input profiles to long format
    similarity_melted_df = metric_melt(
        df=profiles,
        features=features,
        metadata_features=meta_features,
        similarity_metric=similarity_metric,
        eval_metric=operation,
    )

    # Perform the input operation
    if operation == "percent_strong":
        metric_result = percent_strong(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            quantile=percent_strong_quantile,
        )
    elif operation == "precision_recall":
        metric_result = precision_recall(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            k=precision_recall_k,
        )
    elif operation == "grit":
        metric_result = grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=grit_control_perts,
            replicate_id=replicate_groups["replicate_id"],
            group_id=replicate_groups["group_id"],
        )

    return metric_result
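
# Hedged usage sketch for this older evaluate() signature. `profiles`,
# `features`, and `meta_features` are hypothetical inputs; note that
# operation="grit" here expects a replicate_groups dict keyed by
# "replicate_id" and "group_id", matching the grit() call above.
grit_scores = evaluate(
    profiles=profiles,  # assumed: DataFrame with metadata + feature columns
    features=features,  # assumed: feature column names
    meta_features=meta_features,  # assumed: metadata column names
    replicate_groups={
        "replicate_id": "Metadata_pert_name",  # assumed column name
        "group_id": "Metadata_gene_name",  # assumed column name
    },
    operation="grit",
    grit_control_perts=["Luc-1", "Luc-2"],  # assumed control identifiers
)
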
Example #5
def evaluate(
    profiles: pd.DataFrame,
    features: List[str],
    meta_features: List[str],
    replicate_groups: Union[List[str], dict],
    operation: str = "replicate_reproducibility",
    similarity_metric: str = "pearson",
    replicate_reproducibility_quantile: float = 0.95,
    replicate_reproducibility_return_median_cor: bool = False,
    precision_recall_k: Union[int, List[int]] = 10,
    grit_control_perts: List[str] = ["None"],
    grit_replicate_summary_method: str = "mean",
    mp_value_params: dict = {},
    enrichment_percentile: Union[float, List[float]] = 0.99,
):
    r"""Evaluate profile quality and strength.

    For a given profile dataframe containing both metadata and feature measurement
    columns, use this function to calculate profile quality metrics. The function
    contains all the necessary arguments for specific evaluation operations.

    Parameters
    ----------
    profiles : pandas.DataFrame
        profiles must be a pandas DataFrame with profile samples as rows and profile
        features as columns. The columns should contain both metadata and feature
        measurements.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`.
    replicate_groups : {str, list, dict}
        An important variable indicating which metadata columns denote replicate
        information. All metric operations require replicate profiles.
        `replicate_groups` indicates a str or list of columns to use. For
        `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col"
        and "replicate_group_col". "profile_col" is the column name that stores
        identifiers for each profile (can be unique), while "replicate_group_col" is the
        column name indicating a higher order replicate information. E.g.
        "replicate_group_col" can be a gene column in a CRISPR experiment with multiple
        guides targeting the same genes. See also
        :py:func:`cytominer_eval.operations.grit` and
        :py:func:`cytominer_eval.transform.util.check_replicate_groups`.
    operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value', 'enrichment'}, optional
        The specific evaluation metric to calculate. The default is
        "replicate_reproducibility".
    similarity_metric : {'pearson', 'spearman', 'kendall'}, optional
        How to calculate pairwise similarity. The input is passed to
        pandas.DataFrame.corr(). The default is "pearson".

    Returns
    -------
    float, pd.DataFrame
        The resulting evaluation metric. The return is either a single value or a pandas
        DataFrame summarizing the metric as specified in `operation`.

    Other Parameters
    ----------------
    replicate_reproducibility_quantile : float, optional
        Only used when `operation='replicate_reproducibility'`. This indicates the
        percentile of the non-replicate pairwise similarity to consider a reproducible
        phenotype. Defaults to 0.95.
    replicate_reproducibility_return_median_cor : bool, optional
        Only used when `operation='replicate_reproducibility'`. If True, also
        return pairwise correlations as defined by `replicate_groups` and the
        similarity metric.
    precision_recall_k : int or list of int, optional
        Only used when `operation='precision_recall'`. Used to calculate precision and
        recall considering the top k profiles according to pairwise similarity.
    grit_control_perts : list, optional
        Only used when `operation='grit'`. Specific profile identifiers used as a
        reference when calculating grit. The list entries must be found in the
        `replicate_groups["profile_col"]` column.
    grit_replicate_summary_method : {"mean", "median"}, optional
        Only used when `operation='grit'`. Defines how the replicate z-scores are
        summarized. See
        :py:func:`cytominer_eval.operations.util.calculate_grit`.
    mp_value_params : dict, optional
        Only used when `operation='mp_value'`. Key, value pairs of optional
        parameters for calculating mp value. See also
        :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`.
    enrichment_percentile : float or list of floats, optional
        Only used when `operation='enrichment'`. Determines the percentage of top connections
        used for the enrichment calculation.
    """
    # Check replicate groups input
    check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)

    if operation != "mp_value":
        # Melt the input profiles to long format
        similarity_melted_df = metric_melt(
            df=profiles,
            features=features,
            metadata_features=meta_features,
            similarity_metric=similarity_metric,
            eval_metric=operation,
        )

    # Perform the input operation
    if operation == "replicate_reproducibility":
        metric_result = replicate_reproducibility(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            quantile_over_null=replicate_reproducibility_quantile,
            return_median_correlations=replicate_reproducibility_return_median_cor,
        )
    elif operation == "precision_recall":
        metric_result = precision_recall(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            k=precision_recall_k,
        )
    elif operation == "grit":
        metric_result = grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=grit_control_perts,
            profile_col=replicate_groups["profile_col"],
            replicate_group_col=replicate_groups["replicate_group_col"],
            replicate_summary_method=grit_replicate_summary_method,
        )
    elif operation == "mp_value":
        metric_result = mp_value(
            df=profiles,
            control_perts=grit_control_perts,
            replicate_id=replicate_groups,
            features=features,
            params=mp_value_params,
        )
    elif operation == "enrichment":
        metric_result = enrichment(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            percentile=enrichment_percentile,
        )

    return metric_result
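
# Hedged usage sketch for the current evaluate() signature. Inputs and column
# names are illustrative assumptions; for operation="grit" the replicate_groups
# dict now uses the "profile_col" / "replicate_group_col" keys described in the
# docstring above.
grit_scores = evaluate(
    profiles=profiles,  # assumed: DataFrame with metadata + feature columns
    features=features,  # assumed: feature column names
    meta_features=meta_features,  # assumed: metadata column names
    replicate_groups={
        "profile_col": "Metadata_pert_name",  # assumed column name
        "replicate_group_col": "Metadata_gene_name",  # assumed column name
    },
    operation="grit",
    grit_control_perts=["Luc-1", "Luc-2"],  # assumed control identifiers
    grit_replicate_summary_method="median",
)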