示例#1
0
def grit(
    similarity_melted_df: pd.DataFrame,
    control_perts: List[str],
    profile_col: str,
    replicate_group_col: str,
    replicate_summary_method: str = "mean",
) -> pd.DataFrame:
    r"""Calculate grit

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        a long pandas dataframe output from cytominer_eval.transform.metric_melt
    control_perts : list
        a list of control perturbations to calculate a null distribution
    profile_col : str
        the metadata column storing profile ids. The column can have unique or replicate
        identifiers.
    replicate_group_col : str
        the metadata column indicating a higher order structure (group) than the
        profile column. E.g. target gene vs. guide in a CRISPR experiment.
    replicate_summary_method : {'mean', 'median'}, optional
        how replicate z-scores to control perts are summarized. Defaults to "mean".

    Returns
    -------
    pandas.DataFrame
        A dataframe of grit measurements per perturbation
    """
    # Check if we support the provided summary method
    check_replicate_summary_method(replicate_summary_method)

    # Determine pairwise replicates
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[profile_col, replicate_group_col],
    )

    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="grit")

    # Extract out specific columns
    pair_ids = set_pair_ids()
    profile_col_name = "{x}{suf}".format(
        x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"])

    # Define the columns to use in the calculation
    column_id_info = set_grit_column_info(
        profile_col=profile_col, replicate_group_col=replicate_group_col)

    # Calculate grit for each perturbation
    grit_df = (similarity_melted_df.groupby(profile_col_name).apply(
        lambda x: calculate_grit(
            replicate_group_df=x,
            control_perts=control_perts,
            column_id_info=column_id_info,
            replicate_summary_method=replicate_summary_method,
        )).reset_index(drop=True))

    return grit_df
示例#2
0
def test_set_pair_ids():
    pair_a = "pair_a"
    pair_b = "pair_b"

    result = set_pair_ids()

    assert result[pair_a]["index"] == "{pair_a}_index".format(pair_a=pair_a)
    assert result[pair_a]["index"] == "{pair_a}_index".format(pair_a=pair_a)
    assert result[pair_b]["suffix"] == "_{pair_b}".format(pair_b=pair_b)
    assert result[pair_b]["suffix"] == "_{pair_b}".format(pair_b=pair_b)
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    k: Union[int, List[int]],
) -> pd.DataFrame:
    """Determine the precision and recall at k for all unique replicate groups
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as
        replicate columns.
    k : List of ints or int
        an integer indicating how many pairwise comparisons to threshold.

    Returns
    -------
    pandas.DataFrame
        precision and recall metrics for all replicate groups given k
    """
    # Determine pairwise replicates and make sure to sort based on the metric!
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    ).sort_values(by="similarity_metric", ascending=False)

    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="precision_recall")

    # Extract out specific columns
    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]
    # iterate over all k
    precision_recall_df = pd.DataFrame()
    if type(k) == int:
        k = [k]
    for k_ in k:
        # Calculate precision and recall for all groups
        precision_recall_df_at_k = similarity_melted_df.groupby(
            replicate_group_cols
        ).apply(lambda x: calculate_precision_recall(x, k=k_))
        precision_recall_df = precision_recall_df.append(precision_recall_df_at_k)

    # Rename the columns back to the replicate groups provided
    rename_cols = dict(zip(replicate_group_cols, replicate_groups))

    return precision_recall_df.reset_index().rename(rename_cols, axis="columns")
示例#4
0
def test_calculate_precision_recall():
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups).sort_values(
                                   by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        name=("BRD-A38592941-001-02-7"))

    assert example_group.shape[
        0] == 383 * 6  # number of pairwise comparisons per dose

    # Assert that the similarity metrics are sorted
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(calculate_precision_recall(example_group, k=10),
                          columns=["result"])

    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    assert_frame_equal(result, expected_result, check_less_precise=True)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )

    assert result.loc["recall", "result"] == 1
示例#5
0
def process_melt(
    df: pd.DataFrame,
    meta_df: pd.DataFrame,
    eval_metric: str = "replicate_reproducibility",
) -> pd.DataFrame:
    """Helper function to annotate and process an input similarity matrix

    Parameters
    ----------
    df : pandas.DataFrame
        A similarity matrix output from
        :py:func:`cytominer_eval.transform.transform.get_pairwise_metric`
    meta_df : pandas.DataFrame
        A wide matrix of metadata information where the index aligns to the similarity
        matrix index
    eval_metric : str, optional
        Which metric to ultimately calculate. Determines whether or not to keep the full
        similarity matrix or only one diagonal. Defaults to "replicate_reproducibility".

    Returns
    -------
    pandas.DataFrame
        A pairwise similarity matrix
    """
    # Confirm that the user formed the input arguments properly
    assert df.shape[0] == df.shape[1], "Matrix must be symmetrical"
    check_eval_metric(eval_metric)

    # Get identifiers for pairing metadata
    pair_ids = set_pair_ids()

    # Subset the pairwise similarity metric depending on the eval metric given:
    #   "replicate_reproducibility" - requires only the upper triangle of a symmetric matrix
    #   "precision_recall" - requires the full symmetric matrix (no diagonal)
    # Remove pairwise matrix diagonal and redundant pairwise comparisons
    if eval_metric == "replicate_reproducibility":
        upper_tri = get_upper_matrix(df)
        df = df.where(upper_tri)
    else:
        np.fill_diagonal(df.values, np.nan)

    # Convert pairwise matrix to melted (long) version based on index value
    metric_unlabeled_df = (
        pd.melt(
            df.reset_index(),
            id_vars="index",
            value_vars=df.columns,
            var_name=pair_ids["pair_b"]["index"],
            value_name="similarity_metric",
        )
        .dropna()
        .reset_index(drop=True)
        .rename({"index": pair_ids["pair_a"]["index"]}, axis="columns")
    )

    # Merge metadata on index for both comparison pairs
    output_df = meta_df.merge(
        meta_df.merge(
            metric_unlabeled_df,
            left_index=True,
            right_on=pair_ids["pair_b"]["index"],
        ),
        left_index=True,
        right_on=pair_ids["pair_a"]["index"],
        suffixes=[pair_ids["pair_a"]["suffix"], pair_ids["pair_b"]["suffix"]],
    ).reset_index(drop=True)

    return output_df
def assign_replicates(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
) -> pd.DataFrame:
    """Determine which profiles should be considered replicates.

    Given an elongated pairwise correlation matrix with metadata annotations, determine
    how to assign replicate information.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        Long pandas DataFrame of annotated pairwise correlations output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : list
        a list of metadata column names in the original profile dataframe used to
        indicate replicate profiles.

    Returns
    -------
    pd.DataFrame
        A similarity_melted_df but with added columns indicating whether or not the
        pairwise similarity metric is comparing replicates or not. Used in most eval
        operations.
    """
    pair_ids = set_pair_ids()
    replicate_col_names = {
        x: "{x}_replicate".format(x=x)
        for x in replicate_groups
    }

    compare_dfs = []
    for replicate_col in replicate_groups:
        replicate_cols_with_suffix = [
            "{col}{suf}".format(col=replicate_col, suf=pair_ids[x]["suffix"])
            for x in pair_ids
        ]

        assert all([
            x in similarity_melted_df.columns
            for x in replicate_cols_with_suffix
        ]), "replicate_group not found in melted dataframe columns"

        replicate_col_name = replicate_col_names[replicate_col]

        compare_df = similarity_melted_df.loc[:, replicate_cols_with_suffix]
        compare_df.loc[:, replicate_col_name] = False

        compare_df.loc[np.where(compare_df.iloc[:,
                                                0] == compare_df.iloc[:,
                                                                      1])[0],
                       replicate_col_name, ] = True
        compare_dfs.append(compare_df)

    compare_df = pd.concat(compare_dfs, axis="columns").reset_index(drop=True)
    compare_df = compare_df.assign(
        group_replicate=compare_df.loc[:, replicate_col_names.values()].min(
            axis="columns")).loc[:,
                                 list(replicate_col_names.values()) +
                                 ["group_replicate"]]

    similarity_melted_df = similarity_melted_df.merge(compare_df,
                                                      left_index=True,
                                                      right_index=True)
    return similarity_melted_df
]
features = df.drop(meta_features, axis="columns").columns.tolist()

similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="grit",
)

control_perts = ["Luc-2", "LacZ-2", "LacZ-3"]
profile_col = "Metadata_pert_name"
replicate_group_col = "Metadata_gene_name"

pair_ids = set_pair_ids()
replicate_col_name = "{x}{suf}".format(
    x=profile_col, suf=pair_ids[list(pair_ids)[0]]["suffix"])

column_id_info = set_grit_column_info(profile_col=profile_col,
                                      replicate_group_col=replicate_group_col)


def test_get_grit_entry():
    with pytest.raises(AssertionError) as ae:
        result = get_grit_entry(df=similarity_melted_df,
                                col=replicate_col_name)
    assert "grit is calculated for each perturbation independently" in str(
        ae.value)

    expected_result = "EMPTY"
示例#8
0
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    groupby_columns: List[str],
    k: Union[int, List[int], str],
) -> pd.DataFrame:
    """Determine the precision and recall at k for all unique groupby_columns samples
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as replicate columns.
    groupby_columns : List of str
        Column by which the similarity matrix is grouped and by which the precision/recall is calculated.
        For example, if groupby_column = Metadata_sample then the precision is calculated for each sample.
        Calculating the precision by sample is the default
        but it is mathematically not incorrect to calculate the precision at the MOA level.
        This is just less intuitive to understand.
    k : List of ints or int
        an integer indicating how many pairwise comparisons to threshold.
        if k = 'R' then precision at R will be calculated where R is the number of other replicates

    Returns
    -------
    pandas.DataFrame
        precision and recall metrics for all groupby_column groups given k
    """
    # Check for correct k input
    assert Union[int, List[int], str]
    # Determine pairwise replicates and make sure to sort based on the metric!
    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups).sort_values(by="similarity_metric",
                                                       ascending=False)

    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="precision_recall")

    # Extract out specific columns
    pair_ids = set_pair_ids()
    groupby_cols_suffix = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in groupby_columns
    ]
    # iterate over all k
    precision_recall_df = pd.DataFrame()
    if type(k) == int:
        k = [k]
    for k_ in k:
        # Calculate precision and recall for all groups
        precision_recall_df_at_k = similarity_melted_df.groupby(
            groupby_cols_suffix).apply(
                lambda x: calculate_precision_recall(x, k=k_))
        precision_recall_df = precision_recall_df.append(
            precision_recall_df_at_k)

    # Rename the columns back to the replicate groups provided
    rename_cols = dict(zip(groupby_cols_suffix, groupby_columns))

    return precision_recall_df.reset_index().rename(rename_cols,
                                                    axis="columns")
示例#9
0
def hitk(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    groupby_columns: List[str],
    percent_list: Union[int, List[int]],
) -> pd.DataFrame:
    """Calculate the hit@k hits list and percent scores.
    This function groups the similarity matrix by each sample (group_col) and by similarity score. It then determines the rank of each correct hit.
    A correct hit is a connection to another sample with the same replicate attributes (replicate_groups), for example the same MOA.

    Hit@k records all hits/indexes in a long list which can be used to create histogram plots or similar visualizations.

    The percent scores contain the number of hits above the expected random distribution at a given percentage.
    For example, at 5 percent we calculate how many hits are within the first 5 percent of classes (number of neighbors) and then subtract the expected number of hits.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        An elongated symmetrical matrix indicating pairwise correlations between
        samples. Importantly, it must follow the exact structure as output from
        :py:func:`cytominer_eval.transform.transform.metric_melt`.

    replicate_groups : list or int
        a list of metadata column names in the original profile dataframe to use as replicate columns.

    groupby_columns: str
        group columns determine the columns over which the similarity_melted_df is grouped.
        Usually groupby_columns will span the full space of the input data
        such that drop_duplicates by the groupby_cols would not change the data.
        If you group over Metadata_plate for examples, you will get meaningless results.
        This can easily be seen from the fact that the percent score at 100 will be nonzero.

    percent_list : list or "all"
        A list of percentages at which to calculate the percent scores, ie the amount of hits below this percentage.
        If percent_list == "all" a full dict with the length of classes will be created.
        Percentages are given as integers, ie 50 means 50 %.

    Returns
    -------
    hits_list : list
        full list of all hits. Can be used for histogram plotting.
    percent_scores: dict
        dictionary of the percentage list and their corresponding percent scores (see percent score function).
    """
    # make sure percent_list is a list
    if type(percent_list) == int:
        percent_list = [percent_list]
    # check for correct input
    assert type(percent_list) == list or percent_list == "all", "input is incorrect"
    if type(percent_list) == list:
        assert max(percent_list) <= 100, "percentages must be smaller than 100"

    similarity_melted_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )
    # Check to make sure that the melted dataframe is full
    assert_melt(similarity_melted_df, eval_metric="hitk")

    # Extract the name of the columns in the sim_df
    pair_ids = set_pair_ids()
    groupby_cols_suffix = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in groupby_columns
    ]

    # group the sim_df by the groupby_columns
    grouped = similarity_melted_df.groupby(groupby_cols_suffix)
    nr_of_groups = grouped.ngroups
    # Within each group, add the ranks of each connection to a new column
    similarity_melted_with_rank = grouped.apply(lambda x: add_hit_rank(x))

    # make a list of the ranks of correct connection (hits), ie where the group_replicate is true
    hits_list = similarity_melted_with_rank[
        similarity_melted_with_rank["group_replicate"] == True
    ]["rank"].tolist()

    # calculate the scores at each percentage
    percent_scores = percentage_scores(hits_list, percent_list, nr_of_groups)

    return hits_list, percent_scores