def test_set_pair_ids():
    """Check that set_pair_ids() emits an index and suffix entry for both pairs."""
    pair_a = "pair_a"
    pair_b = "pair_b"

    result = set_pair_ids()

    # One assertion per (pair, key) combination. The original test duplicated
    # the pair_a index and pair_b suffix checks, leaving pair_b's index and
    # pair_a's suffix completely unverified.
    assert result[pair_a]["index"] == "{pair_a}_index".format(pair_a=pair_a)
    assert result[pair_b]["index"] == "{pair_b}_index".format(pair_b=pair_b)
    assert result[pair_a]["suffix"] == "_{pair_a}".format(pair_a=pair_a)
    assert result[pair_b]["suffix"] == "_{pair_b}".format(pair_b=pair_b)
def assign_replicates(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
) -> pd.DataFrame:
    """
    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    replicate_groups - a list of metadata column names in the original profile
                       dataframe to use as replicate columns

    Output:
    Adds columns to the similarity metric dataframe to indicate whether or not
    the pairwise similarity metric is comparing replicates or not
    """
    pair_ids = set_pair_ids()
    replicate_col_names = {x: "{x}_replicate".format(x=x) for x in replicate_groups}

    compare_dfs = []
    for replicate_col in replicate_groups:
        # Suffixed column names, e.g. "<col>_pair_a" / "<col>_pair_b"
        replicate_cols_with_suffix = [
            "{col}{suf}".format(col=replicate_col, suf=pair_ids[x]["suffix"])
            for x in pair_ids
        ]

        assert all(
            [x in similarity_melted_df.columns for x in replicate_cols_with_suffix]
        ), "replicate_group not found in melted dataframe columns"

        replicate_col_name = replicate_col_names[replicate_col]

        # .copy() is required: .loc slicing yields a view, and assigning a new
        # column to a view triggers SettingWithCopyWarning (or is silently
        # dropped under pandas copy-on-write)
        compare_df = similarity_melted_df.loc[:, replicate_cols_with_suffix].copy()
        compare_df.loc[:, replicate_col_name] = False

        # Rows where both pair columns match are replicate comparisons
        compare_df.loc[
            np.where(compare_df.iloc[:, 0] == compare_df.iloc[:, 1])[0],
            replicate_col_name,
        ] = True
        compare_dfs.append(compare_df)

    compare_df = pd.concat(compare_dfs, axis="columns").reset_index(drop=True)

    # group_replicate is True only when every replicate column is True
    # (min over booleans behaves like a row-wise AND)
    compare_df = compare_df.assign(
        group_replicate=compare_df.loc[:, replicate_col_names.values()].min(
            axis="columns"
        )
    ).loc[:, list(replicate_col_names.values()) + ["group_replicate"]]

    similarity_melted_df = similarity_melted_df.merge(
        compare_df, left_index=True, right_index=True
    )
    return similarity_melted_df
def test_calculate_precision_recall():
    """Validate precision and recall at k for one example replicate group."""
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    ).sort_values(by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        name=("BRD-A38592941-001-02-7")
    )
    # number of pairwise comparisons per dose
    assert example_group.shape[0] == 383 * 6

    # Assert that the similarity metrics are sorted
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=10), columns=["result"]
    )
    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result, index=["result"]).transpose()

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
    # an absolute tolerance handles the rounded expected recall (0.1333)
    assert_frame_equal(result, expected_result, check_exact=False, atol=1e-3)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )
    assert result.loc["recall", "result"] == 1
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    k: int,
) -> pd.DataFrame:
    """
    Determine the precision and recall at k for all unique replicate groups
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    replicate_groups - a list of metadata column names in the original profile
                       dataframe to use as replicate columns
    k - an integer indicating how many pairwise comparisons to threshold

    Output:
    pandas DataFrame of precision and recall metrics for all replicate groups
    """
    # Flag replicate pairs, then rank all comparisons by similarity (descending)
    annotated_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    ).sort_values(by="similarity_metric", ascending=False)

    # The melted dataframe must represent the full pairwise matrix
    assert_melt(annotated_df, eval_metric="precision_recall")

    # Build the suffixed column names used to group comparisons
    pair_ids = set_pair_ids()
    suffix = pair_ids[list(pair_ids)[0]]["suffix"]
    replicate_group_cols = [
        "{x}{suf}".format(x=group, suf=suffix) for group in replicate_groups
    ]

    # Compute precision and recall within each replicate group
    grouped = annotated_df.groupby(replicate_group_cols)
    precision_recall_df = grouped.apply(
        lambda group_df: calculate_precision_recall(group_df, k=k)
    )

    # Restore the caller-provided (unsuffixed) column names
    rename_cols = dict(zip(replicate_group_cols, replicate_groups))
    return precision_recall_df.reset_index().rename(rename_cols, axis="columns")
def grit(
    similarity_melted_df: pd.DataFrame,
    control_perts: List[str],
    replicate_id: str,
    group_id: str,
) -> pd.DataFrame:
    """
    Calculate grit

    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    control_perts - a list of control perturbations to calculate a null distribution
    replicate_id - the metadata identifier marking which column tracks replicate perts
    group_id - the metadata identifier marking which column tracks a higher order
               groups for all perturbations

    Output:
    A dataframe of grit measurements per perturbation
    """
    # Flag which pairwise comparisons are replicates
    annotated_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[replicate_id, group_id],
    )

    # The melted dataframe must represent the full pairwise matrix
    assert_melt(annotated_df, eval_metric="grit")

    # Column identifying each perturbation in the melted frame
    pair_ids = set_pair_ids()
    suffix = pair_ids[list(pair_ids)[0]]["suffix"]
    replicate_col_name = "{x}{suf}".format(x=replicate_id, suf=suffix)

    # Columns consumed by the per-perturbation grit calculation
    column_id_info = set_grit_column_info(
        replicate_id=replicate_id, group_id=group_id
    )

    # Grit is computed independently for each perturbation
    grouped = annotated_df.groupby(replicate_col_name)
    grit_df = grouped.apply(
        lambda pert_df: calculate_grit(pert_df, control_perts, column_id_info)
    ).reset_index(drop=True)

    return grit_df
] features = df.drop(meta_features, axis="columns").columns.tolist() similarity_melted_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", eval_metric="grit", ) control_perts = ["Luc-2", "LacZ-2", "LacZ-3"] replicate_id = "Metadata_pert_name" group_id = "Metadata_gene_name" pair_ids = set_pair_ids() replicate_col_name = "{x}{suf}".format( x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"]) column_id_info = set_grit_column_info(replicate_id=replicate_id, group_id=group_id) def test_get_grit_entry(): with pytest.raises(AssertionError) as ae: result = get_grit_entry(df=similarity_melted_df, col=replicate_col_name) assert "grit is calculated for each perturbation independently" in str( ae.value) expected_result = "EMPTY"