def test_assert_melt():
    """assert_melt passes for the metric a dataframe was melted for and fails otherwise."""
    for metric in ["precision_recall", "replicate_reproducibility", "grit"]:
        melted = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric=metric,
        )
        melted = assign_replicates(
            similarity_melted_df=melted, replicate_groups=replicate_groups
        )

        # The matching metric must be accepted without error
        assert_melt(melted, eval_metric=metric)

        # Note, not all alternative dummy metrics are provided, since many require
        # the same melted dataframe
        if metric == "precision_recall":
            mismatched_metrics = ["replicate_reproducibility"]
        elif metric == "replicate_reproducibility":
            mismatched_metrics = ["precision_recall", "grit"]
        elif metric == "grit":
            mismatched_metrics = ["replicate_reproducibility"]

        # Every incompatible metric must trigger the assertion error
        for mismatched in mismatched_metrics:
            with pytest.raises(AssertionError) as ve:
                output = assert_melt(melted, eval_metric=mismatched)
            assert (
                "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
                in str(ve.value)
            )
def test_metric_melt():
    """metric_melt returns expected correlations and tolerates inconsistent indices."""
    melted = metric_melt(df, features, meta_features, similarity_metric="pearson")
    assert round(melted.similarity_metric[0], 3) == round(example_sample_corr, 3)
    assert melted.shape[0] == 73536

    # The index ID is extremely important for aligning the dataframe
    # make sure the method is robust to indeces labeled inconsistently
    dup_index_df = df.copy()
    dup_index_df.index = [3] * dup_index_df.shape[0]
    melted = metric_melt(
        dup_index_df, features, meta_features, similarity_metric="pearson"
    )
    assert round(melted.similarity_metric[0], 3) == round(example_sample_corr, 3)
    assert melted.shape[0] == 73536

    # An unknown eval_metric must be rejected with a descriptive message
    with pytest.raises(AssertionError) as ve:
        output = metric_melt(
            df,
            features,
            meta_features,
            similarity_metric="pearson",
            eval_metric="NOT SUPPORTED",
        )
    assert "not supported. Available evaluation metrics:" in str(ve.value)

    # Metrics requiring the full (non-deduplicated) melt produce twice the rows
    for full_metric in ["precision_recall", "grit"]:
        melted = metric_melt(
            dup_index_df,
            features,
            meta_features,
            similarity_metric="pearson",
            eval_metric=full_metric,
        )
        assert round(melted.similarity_metric[0], 3) == round(example_sample_corr, 3)
        assert melted.shape[0] == 147072
def test_calculate_precision_recall():
    """Check precision/recall on one sorted example group.

    Fix: ``check_less_precise`` was deprecated in pandas 1.1 and removed in
    pandas 2.0; replaced with an explicit absolute tolerance
    (``check_less_precise=True`` compared roughly 3 decimal places).
    """
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    ).sort_values(by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        name=("BRD-A38592941-001-02-7")
    )
    # number of pairwise comparisons per dose
    assert example_group.shape[0] == 383 * 6

    # Assert that the similarity metrics are sorted (strictly non-increasing)
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=10), columns=["result"]
    )
    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result, index=["result"]).transpose()

    # atol=1e-3 reproduces the old check_less_precise=True (~3 decimals) tolerance
    assert_frame_equal(result, expected_result, check_exact=False, atol=1e-3)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )
    assert result.loc["recall", "result"] == 1
def evaluate(
    profiles: pd.DataFrame,
    features: List[str],
    meta_features: List[str],
    replicate_groups: Union[List[str], dict],
    operation: str = "percent_strong",
    similarity_metric: str = "pearson",
    # Fix: annotation was `np.float`, which NumPy deprecated in 1.20 and
    # removed in 1.24 — importing this module fails on modern NumPy.
    percent_strong_quantile: float = 0.95,
    precision_recall_k: int = 10,
    grit_control_perts: List[str] = ["None"],
):
    """Evaluate profile quality with the metric named by ``operation``.

    Parameters
    ----------
    profiles : pandas.DataFrame
        Profile samples as rows; contains both metadata and feature columns.
    features : list of str
        Feature measurement column names found in `profiles`.
    meta_features : list of str
        Metadata column names found in `profiles`.
    replicate_groups : list or dict
        Metadata columns denoting replicate information. For
        ``operation="grit"`` this is a dict with keys "replicate_id" and
        "group_id" (validated by check_replicate_groups).
    operation : str, default "percent_strong"
        One of {"percent_strong", "precision_recall", "grit"}.
    similarity_metric : str, default "pearson"
        Pairwise similarity method used when melting profiles.
    percent_strong_quantile : float, default 0.95
        Non-replicate similarity quantile cutoff (percent_strong only).
    precision_recall_k : int, default 10
        Number of top profiles considered (precision_recall only).
    grit_control_perts : list of str, default ["None"]
        Control perturbation identifiers (grit only). NOTE: mutable default
        is retained for interface compatibility; it is never mutated here.

    Returns
    -------
    The metric result produced by the selected operation.
    """
    # Check replicate groups input
    check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)

    # Melt the input profiles to long format
    similarity_melted_df = metric_melt(
        df=profiles,
        features=features,
        metadata_features=meta_features,
        similarity_metric=similarity_metric,
        eval_metric=operation,
    )

    # Perform the input operation
    if operation == "percent_strong":
        metric_result = percent_strong(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            quantile=percent_strong_quantile,
        )
    elif operation == "precision_recall":
        metric_result = precision_recall(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            k=precision_recall_k,
        )
    elif operation == "grit":
        metric_result = grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=grit_control_perts,
            replicate_id=replicate_groups["replicate_id"],
            group_id=replicate_groups["group_id"],
        )

    return metric_result
df = pd.read_csv(example_file) # Clean the dataframe for convenience df.loc[(df["Metadata_moa"].isna()) & (df["Metadata_broad_sample"] == "DMSO"), "Metadata_moa", ] = "none" df = df[~df["Metadata_moa"].isna()] meta_features = [ x for x in df.columns if (x.startswith("Metadata_") or x.startswith("Image_")) ] features = df.drop(meta_features, axis="columns").columns.tolist() similarity_melted_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", eval_metric="hitk", ) # compute the normal index_list replicate_group = ["Metadata_moa"] groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"] percent_list = [2, 5, 10, 100] index_list, percent_results = hitk( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_group, groupby_columns=groupby_columns, percent_list=percent_list, )
# Path to the compound example data shipped alongside the test suite
example_file = "SQ00015054_normalized_feature_select.csv.gz"
example_file = pathlib.Path(
    "{file}/../../example_data/compound/{eg}".format(
        file=os.path.dirname(__file__), eg=example_file
    )
)

df = pd.read_csv(example_file)

# Split columns: Metadata_-prefixed columns are metadata, the rest are features
meta_features = [x for x in df.columns if x.startswith("Metadata_")]
features = df.drop(meta_features, axis="columns").columns.tolist()

# Shared fixture: pairwise Pearson similarity in long (melted) format
similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
)


def test_percent_strong():
    # Percent strong at the 95th percentile of the non-replicate null distribution
    replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]
    output = percent_strong(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        quantile=0.95,
    )
    # Expected value from a previously validated run on this dataset
    expected_result = 0.4583
    assert np.round(output, 4) == expected_result
similarity_metric = "pearson" operation = "percent_strong" replicate_groups = [ "Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name" ] control_ids = ["Chr2", "Luc", "LacZ"] # In[4]: # Melt the input profiles to long format similarity_melted_df = metric_melt( df=cell_health_df, features=features, metadata_features=meta_features, similarity_metric=similarity_metric, eval_metric=operation, ) similarity_melted_df = assign_replicates( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups) print(similarity_melted_df.shape) similarity_melted_df.head() # In[5]: non_replicate_cor_95th = ( similarity_melted_df.query("not group_replicate").groupby(
figure_output_dir = os.path.join(figure_dir, batch, plate) os.makedirs(figure_output_dir, exist_ok=True) audit_output_file = os.path.join(audit_output_dir, "{}_audit.csv".format(plate)) df = pd.read_csv(plate_files[plate]) # Determine feature class features = infer_cp_features(df) meta_features = infer_cp_features(df, metadata=True) # Calculate and process pairwise similarity matrix audit_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", eval_metric="replicate_reproducibility", ) audit_df = assign_replicates(similarity_melted_df=audit_df, replicate_groups=audit_cols) # What is 95% of the non replicate null distribution cutoff = audit_df.query( "not group_replicate").similarity_metric.quantile(0.95) # Calculate a single number for percent strong percent_strong = evaluate( profiles=df, features=features, meta_features=meta_features,
example_file = "SQ00015054_normalized_feature_select.csv.gz" example_file = pathlib.Path("{file}/../../example_data/compound/{eg}".format( file=os.path.dirname(__file__), eg=example_file)) df = pd.read_csv(example_file) df = df.assign(Metadata_profile_id=[ "Metadata_profile_{x}".format(x=x) for x in range(0, df.shape[0]) ]) meta_features = [x for x in df.columns if x.startswith("Metadata_")] features = df.drop(meta_features, axis="columns").columns.tolist() similarity_melted_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", ) similarity_melted_full_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", eval_metric="grit", ) def test_assign_replicates(): replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"] result = assign_replicates(similarity_melted_df=similarity_melted_df,
example_file = "SQ00014610_normalized_feature_select.csv.gz" example_file = pathlib.Path("{file}/../../example_data/gene/{eg}".format( file=os.path.dirname(__file__), eg=example_file)) df = pd.read_csv(example_file) meta_features = [ x for x in df.columns if (x.startswith("Metadata_") or x.startswith("Image_")) ] features = df.drop(meta_features, axis="columns").columns.tolist() similarity_melted_df = metric_melt( df=df, features=features, metadata_features=meta_features, similarity_metric="pearson", eval_metric="precision_recall", ) replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"] groupby_columns = ["Metadata_pert_name"] def test_precision_recall(): result_list = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, groupby_columns=groupby_columns, k=[5, 10],
def evaluate( profiles: pd.DataFrame, features: List[str], meta_features: List[str], replicate_groups: Union[List[str], dict], operation: str = "replicate_reproducibility", similarity_metric: str = "pearson", replicate_reproducibility_quantile: float = 0.95, replicate_reproducibility_return_median_cor: bool = False, precision_recall_k: Union[int, List[int]] = 10, grit_control_perts: List[str] = ["None"], grit_replicate_summary_method: str = "mean", mp_value_params: dict = {}, enrichment_percentile: Union[float, List[float]] = 0.99, ): r"""Evaluate profile quality and strength. For a given profile dataframe containing both metadata and feature measurement columns, use this function to calculate profile quality metrics. The function contains all the necessary arguments for specific evaluation operations. Parameters ---------- profiles : pandas.DataFrame profiles must be a pandas DataFrame with profile samples as rows and profile features as columns. The columns should contain both metadata and feature measurements. features : list A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. meta_features : list A list of strings corresponding to metadata column names in the `profiles` DataFrame. All features listed must be found in `profiles`. replicate_groups : {str, list, dict} An important variable indicating which metadata columns denote replicate information. All metric operations require replicate profiles. `replicate_groups` indicates a str or list of columns to use. For `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col" and "replicate_group_col". "profile_col" is the column name that stores identifiers for each profile (can be unique), while "replicate_group_col" is the column name indicating a higher order replicate information. E.g. "replicate_group_col" can be a gene column in a CRISPR experiment with multiple guides targeting the same genes. 
See also :py:func:`cytominer_eval.operations.grit` and :py:func:`cytominer_eval.transform.util.check_replicate_groups`. operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional The specific evaluation metric to calculate. The default is "replicate_reproducibility". similarity_metric: {'pearson', 'spearman', 'kendall'}, optional How to calculate pairwise similarity. Defaults to "pearson". We use the input in pandas.DataFrame.cor(). The default is "pearson". Returns ------- float, pd.DataFrame The resulting evaluation metric. The return is either a single value or a pandas DataFrame summarizing the metric as specified in `operation`. Other Parameters ----------------------------- replicate_reproducibility_quantile : {0.95, ...}, optional Only used when `operation='replicate_reproducibility'`. This indicates the percentile of the non-replicate pairwise similarity to consider a reproducible phenotype. Defaults to 0.95. replicate_reproducibility_return_median_cor : bool, optional Only used when `operation='replicate_reproducibility'`. If True, then also return pairwise correlations as defined by replicate_groups and similarity metric precision_recall_k : int or list of ints {10, ...}, optional Only used when `operation='precision_recall'`. Used to calculate precision and recall considering the top k profiles according to pairwise similarity. grit_control_perts : {None, ...}, optional Only used when `operation='grit'`. Specific profile identifiers used as a reference when calculating grit. The list entries must be found in the `replicate_groups[replicate_id]` column. grit_replicate_summary_method : {"mean", "median"}, optional Only used when `operation='grit'`. Defines how the replicate z scores are summarized. see :py:func:`cytominer_eval.operations.util.calculate_grit` mp_value_params : {{}, ...}, optional Only used when `operation='mp_value'`. A key, item pair of optional parameters for calculating mp value. 
See also :py:func:`cytominer_eval.operations.util.default_mp_value_parameters` enrichment_percentile : float or list of floats, optional Only used when `operation='enrichment'`. Determines the percentage of top connections used for the enrichment calculation. """ # Check replicate groups input check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups) if operation != "mp_value": # Melt the input profiles to long format similarity_melted_df = metric_melt( df=profiles, features=features, metadata_features=meta_features, similarity_metric=similarity_metric, eval_metric=operation, ) # Perform the input operation if operation == "replicate_reproducibility": metric_result = replicate_reproducibility( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, quantile_over_null=replicate_reproducibility_quantile, return_median_correlations= replicate_reproducibility_return_median_cor, ) elif operation == "precision_recall": metric_result = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, k=precision_recall_k, ) elif operation == "grit": metric_result = grit( similarity_melted_df=similarity_melted_df, control_perts=grit_control_perts, profile_col=replicate_groups["profile_col"], replicate_group_col=replicate_groups["replicate_group_col"], replicate_summary_method=grit_replicate_summary_method, ) elif operation == "mp_value": metric_result = mp_value( df=profiles, control_perts=grit_control_perts, replicate_id=replicate_groups, features=features, params=mp_value_params, ) elif operation == "enrichment": metric_result = enrichment( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, percentile=enrichment_percentile, ) return metric_result