def test_evaluate_precision_recall():
    """Check precision/recall hit counts returned by ``evaluate`` for several k.

    For each k, the number of result rows with ``precision == 1`` and with
    ``recall == 1`` must match the hard-coded counts below. The gene profiles
    are evaluated with a scalar ``precision_recall_k``; the compound profiles
    are evaluated with the same k wrapped in a one-element list.
    """
    ks = [1, 5, 10, 50, 5000]

    # Expected row counts, keyed by dataset -> metric -> str(k).
    expected_result = {
        "gene": {
            "precision": {"1": 1, "5": 0, "10": 0, "50": 0, "5000": 0},
            "recall": {"1": 0, "5": 3, "10": 4, "50": 14, "5000": 118},
        },
        "compound": {
            "precision": {"1": 18, "5": 9, "10": 5, "50": 2, "5000": 0},
            "recall": {"1": 0, "5": 0, "10": 0, "50": 1, "5000": 58},
        },
    }

    gene_groupby_columns = ["Metadata_pert_name"]
    compound_groupby_columns = ["Metadata_broad_sample"]

    for k in ks:
        k_key = str(k)

        # Scalar k on the gene profiles.
        result = evaluate(
            profiles=gene_profiles,
            features=gene_features,
            meta_features=gene_meta_features,
            replicate_groups=gene_groups,
            groupby_columns=gene_groupby_columns,
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=k,
        )
        perfect_precision_rows = result.query("precision == 1").shape[0]
        assert perfect_precision_rows == expected_result["gene"]["precision"][k_key]
        perfect_recall_rows = result.query("recall == 1").shape[0]
        assert perfect_recall_rows == expected_result["gene"]["recall"][k_key]

        # k passed as a one-element list on the compound profiles; the list
        # form is expected to be handled like the scalar form.
        result = evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=["Metadata_broad_sample"],
            groupby_columns=compound_groupby_columns,
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=[k],
        )
        perfect_precision_rows = result.query("precision == 1").shape[0]
        assert (
            perfect_precision_rows == expected_result["compound"]["precision"][k_key]
        )
        perfect_recall_rows = result.query("recall == 1").shape[0]
        assert perfect_recall_rows == expected_result["compound"]["recall"][k_key]
def test_evaluate_replicate_reproducibility():
    """Check the replicate-reproducibility score for every metric and quantile.

    ``evaluate`` is called with ``operation="replicate_reproducibility"`` and
    ``replicate_reproducibility_return_median_cor=False`` on both the gene and
    the compound profiles. The result, rounded to three decimals, must equal
    the hard-coded value for the given similarity metric and quantile.
    """
    similarity_metrics = get_available_similarity_metrics()
    replicate_reproducibility_quantiles = [0.5, 0.95]

    # Expected scores, keyed by dataset -> similarity metric -> str(quantile).
    expected_result = {
        "gene": {
            "pearson": {"0.5": 0.431, "0.95": 0.056},
            "kendall": {"0.5": 0.429, "0.95": 0.054},
            "spearman": {"0.5": 0.429, "0.95": 0.055},
        },
        "compound": {
            "pearson": {"0.5": 0.681, "0.95": 0.458},
            "kendall": {"0.5": 0.679, "0.95": 0.463},
            "spearman": {"0.5": 0.679, "0.95": 0.466},
        },
    }

    # Dataset-specific arguments that stay the same for every combination.
    gene_inputs = {
        "profiles": gene_profiles,
        "features": gene_features,
        "meta_features": gene_meta_features,
        "replicate_groups": gene_groups,
    }
    compound_inputs = {
        "profiles": compound_profiles,
        "features": compound_features,
        "meta_features": compound_meta_features,
        "replicate_groups": compound_groups,
    }

    for sim_metric in similarity_metrics:
        for quant in replicate_reproducibility_quantiles:
            # Arguments shared by both datasets for this combination.
            operation_kwargs = {
                "operation": "replicate_reproducibility",
                "replicate_reproducibility_return_median_cor": False,
                "similarity_metric": sim_metric,
                "replicate_reproducibility_quantile": quant,
            }
            gene_res = evaluate(**gene_inputs, **operation_kwargs)
            compound_res = evaluate(**compound_inputs, **operation_kwargs)

            quant_key = str(quant)
            assert np.round(gene_res, 3) == expected_result["gene"][sim_metric][quant_key]
            assert (
                np.round(compound_res, 3)
                == expected_result["compound"][sim_metric][quant_key]
            )
def test_evaluate_replicate_reprod_return_cor_true():
    """Check both return values when the median correlations are requested.

    With ``replicate_reproducibility_return_median_cor=True`` ``evaluate``
    returns a pair: the reproducibility score and a table of per-group median
    similarities. The test pins the score, the five top-ranked gene names, the
    maximum similarity, and the set of columns in the table.
    """
    reprod, med_cor_df = evaluate(
        profiles=gene_profiles,
        features=gene_features,
        meta_features=gene_meta_features,
        replicate_groups=gene_groups,
        operation="replicate_reproducibility",
        replicate_reproducibility_return_median_cor=True,
        similarity_metric="pearson",
        replicate_reproducibility_quantile=0.95,
    )

    # Rank groups from most to least similar before inspecting the top rows.
    med_cor_df = med_cor_df.sort_values(by="similarity_metric", ascending=False)

    assert np.round(reprod, 3) == 0.056

    top_genes = med_cor_df["Metadata_gene_name"][0:5].tolist()
    assert top_genes == ["CDK2", "CCNE1", "ATF4", "KIF11", "CCND1"]

    highest_similarity = med_cor_df["similarity_metric"].max()
    assert np.round(highest_similarity, 3) == 0.949

    # Column order is not part of the contract, so compare sorted names.
    expected_columns = [
        "Metadata_gene_name",
        "Metadata_pert_name",
        "similarity_metric",
    ]
    assert sorted(med_cor_df.columns.tolist()) == sorted(expected_columns)
def test_compare_functions():
    """Check that ``evaluate`` and a direct ``enrichment`` call agree.

    ``evaluate(operation="enrichment")`` on the profiles must produce a result
    equal to calling ``enrichment`` on ``similarity_melted_df`` with the same
    replicate groups and percentiles.
    """
    # NOTE(review): ``similarity_melted_df`` is defined outside this test;
    # presumably it is built from the same ``df``/``features`` -- confirm.
    percent_list = [0.95, 0.9]

    via_evaluate = evaluate(
        profiles=df,
        features=features,
        meta_features=meta_features,
        replicate_groups=replicate_groups,
        operation="enrichment",
        similarity_metric="pearson",
        enrichment_percentile=percent_list,
    )
    via_enrichment = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=percent_list,
    )

    assert via_enrichment.equals(via_evaluate)
def test_evaluate_hitk():
    """Check the first and last percent scores of the ``hitk`` operation.

    ``evaluate(operation="hitk")`` returns a pair; only the second element
    (``percent_scores``) is inspected. Its first entry must be within 0.1 of
    150.75 and its last entry within 0.1 of 0.
    """
    hitk_hits_list, percent_scores = evaluate(
        profiles=compound_profiles,
        features=compound_features,
        meta_features=compound_meta_features,
        replicate_groups=["Metadata_moa"],
        operation="hitk",
        groupby_columns=["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"],
        hitk_percent_list="all",
    )

    first_score = percent_scores[0]
    assert isclose(first_score, 150.75, abs_tol=1e-1)

    # Index the last entry explicitly by position count, as the container
    # type of ``percent_scores`` is not visible from this test.
    final_position = len(percent_scores) - 1
    assert isclose(percent_scores[final_position], 0, abs_tol=1e-1)
similarity_metric="pearson", eval_metric="replicate_reproducibility", ) audit_df = assign_replicates(similarity_melted_df=audit_df, replicate_groups=audit_cols) # What is 95% of the non replicate null distribution cutoff = audit_df.query( "not group_replicate").similarity_metric.quantile(0.95) # Calculate a single number for percent strong percent_strong = evaluate( profiles=df, features=features, meta_features=meta_features, replicate_groups=audit_cols, operation="replicate_reproducibility", similarity_metric="pearson", replicate_reproducibility_quantile=0.95, ) grid_string = "~{}".format("+".join( [f"{x}_pair_a" for x in audit_cols])) # Visualize the audit - output two plots for each plate output_base = os.path.join( figure_output_dir, "{}_{}_replicate_correlation".format(batch, plate)) plot_replicate_correlation( df=audit_df, batch=batch,
use_features = subset_df.drop(meta_features, axis="columns").columns.tolist() else: subset_df = df.loc[:, meta_features + compartment_features] dropped_or_exclusive = "exclusive" use_features = compartment_features result = evaluate( profiles=subset_df.query("Metadata_cell_line == @cell_line"), features=use_features, meta_features=[barcode_col, gene_col], replicate_groups=replicate_group_grit, operation="grit", similarity_metric="pearson", grit_control_perts=control_barcodes, ).assign( cell_line=cell_line, barcode_control="cutting_control", cor_method="pearson", compartment=compartment, channel="all", feature_group="all", num_features=len(compartment_features), dropped_or_exclusive=dropped_or_exclusive, ) grit_compartment_results.append(result) grit_compartment_results = pd.concat(grit_compartment_results).reset_index( drop=True) print(grit_compartment_results.shape)
def test_evaluate_grit():
    """Check the top grit result for both datasets and the input validation.

    For the gene and the compound profiles, the row with the highest ``grit``
    value must match the pinned score, group and perturbation. Passing a
    non-dict ``replicate_groups`` must raise an ``AssertionError`` whose
    message mentions that a dict is required.
    """
    # --- Gene profiles -----------------------------------------------------
    grit_gene_control_perts = [
        "Chr2-1",
        "Chr2-2",
        "Chr2-3",
        "Chr2-4",
        "Chr2-5",
        "Chr2-6",
        "Luc-1",
        "Luc-2",
        "LacZ-2",
        "LacZ-3",
    ]
    grit_gene_replicate_groups = {
        "replicate_id": "Metadata_pert_name",
        "group_id": "Metadata_gene_name",
    }
    grit_results_df = evaluate(
        profiles=gene_profiles,
        features=gene_features,
        meta_features=gene_meta_features,
        replicate_groups=grit_gene_replicate_groups,
        operation="grit",
        grit_control_perts=grit_gene_control_perts,
    )

    # Highest grit first; the first row is the top result.
    ranked = grit_results_df.sort_values(by="grit", ascending=False)
    top_result = ranked.reset_index(drop=True).iloc[0]

    assert np.round(top_result.grit, 4) == 2.2597
    assert top_result.group == "PTK2"
    assert top_result.perturbation == "PTK2-2"

    # --- Compound profiles -------------------------------------------------
    grit_compound_replicate_groups = {
        "replicate_id": "Metadata_broad_sample",
        "group_id": "Metadata_moa",
    }
    grit_compound_control_perts = ["DMSO"]
    grit_results_df = evaluate(
        profiles=compound_profiles,
        features=compound_features,
        meta_features=compound_meta_features,
        replicate_groups=grit_compound_replicate_groups,
        operation="grit",
        grit_control_perts=grit_compound_control_perts,
    )

    ranked = grit_results_df.sort_values(by="grit", ascending=False)
    top_result = ranked.reset_index(drop=True).iloc[0]

    assert np.round(top_result.grit, 4) == 0.9990
    assert top_result.group == "ATPase inhibitor"
    assert top_result.perturbation == "BRD-A94756469-001-04-7"

    # --- Validation: replicate_groups that is not a dict ---------------------
    with pytest.raises(AssertionError) as ae:
        evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=compound_groups,
            operation="grit",
            grit_control_perts=grit_compound_control_perts,
        )
    # Checked after the ``with`` block so the assertion actually runs.
    assert "For grit, replicate_groups must be a dict" in str(ae.value)
def test_evaluate_precision_recall():
    """Check precision/recall hit counts returned by ``evaluate`` for several k.

    For each k, the number of result rows with ``precision == 1`` and with
    ``recall == 1`` must match the hard-coded counts below, for both the gene
    and the compound profiles. No ``groupby_columns`` argument is passed here.
    """
    # NOTE(review): a function with this same name also appears earlier in
    # this view; if both live in one module, only the later definition is
    # collected by pytest -- confirm and rename one of them if so.
    ks = [1, 5, 10, 50, 5000]

    # Expected row counts, keyed by dataset -> metric -> str(k).
    expected_result = {
        "gene": {
            "precision": {"1": 1, "5": 0, "10": 0, "50": 0, "5000": 0},
            "recall": {"1": 0, "5": 3, "10": 4, "50": 14, "5000": 118},
        },
        "compound": {
            "precision": {"1": 18, "5": 9, "10": 5, "50": 2, "5000": 0},
            "recall": {"1": 0, "5": 0, "10": 0, "50": 1, "5000": 58},
        },
    }

    for k in ks:
        k_key = str(k)

        # Gene profiles.
        result = evaluate(
            profiles=gene_profiles,
            features=gene_features,
            meta_features=gene_meta_features,
            replicate_groups=gene_groups,
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=k,
        )
        perfect_precision_rows = result.query("precision == 1").shape[0]
        assert perfect_precision_rows == expected_result["gene"]["precision"][k_key]
        perfect_recall_rows = result.query("recall == 1").shape[0]
        assert perfect_recall_rows == expected_result["gene"]["recall"][k_key]

        # Compound profiles.
        result = evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=["Metadata_broad_sample"],
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=k,
        )
        perfect_precision_rows = result.query("precision == 1").shape[0]
        assert (
            perfect_precision_rows == expected_result["compound"]["precision"][k_key]
        )
        perfect_recall_rows = result.query("recall == 1").shape[0]
        assert perfect_recall_rows == expected_result["compound"]["recall"][k_key]
# In[17]:

# Display the metadata column names passed to evaluate() below.
meta_features


# In[18]:

# Count how many rows share each treatment profile id.
data_df.Metadata_treatment_profile_id.value_counts()


# In[19]:

# Get replicate correlation: replicates are rows that share both the clone
# number and the treatment. With return_median_cor=True evaluate() returns a
# pair, unpacked here as (percent_strong, corr_df).
percent_strong, corr_df = evaluate(
    profiles=data_df,
    features=features,
    meta_features=meta_features,
    replicate_groups=["Metadata_clone_number", "Metadata_treatment"],
    operation="replicate_reproducibility",
    replicate_reproducibility_return_median_cor=True,
)


# In[20]:

percent_strong


# In[21]:

corr_df.head()


# In[22]:

# Get technical grit for batch
# Column names identifying the guide (replicate level) and gene (group level).
barcode_col = "Metadata_guide_identity"
gene_col = "Metadata_gene_identity"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

# Guides whose identity contains "neg_ctrl" serve as the grit controls.
neg_controls = [
    guide_id
    for guide_id in bulk_subset_df.Metadata_guide_identity
    if "neg_ctrl" in guide_id
]

neg_controls


# In[9]:

# Calculate grit against the negative-control guides.
result = evaluate(
    profiles=bulk_df,
    features=genes_to_retain,
    meta_features=[barcode_col, gene_col],
    replicate_groups=replicate_group_grit,
    operation="grit",
    grit_control_perts=neg_controls,
)

# Drop rows with missing values and rank from highest to lowest grit.
result = (
    result.dropna()
    .sort_values(by="grit", ascending=False)
    .reset_index(drop=True)
)

print(result.shape)
result.head(3)


# In[10]:

# Merge with activity results and output file
output_results_file = pathlib.Path(f"results/{gse_id}_grit.tsv")
# Now calculate sc-Grit per guide for guide in guides: if guide in control_group_guides_cut: continue subset_guide_df = pd.concat([ subset_sc_df.query("Metadata_pert_name == @guide"), neg_controls_df ]).reset_index(drop=True) # Calculate Grit sc_grit_result = evaluate( profiles=subset_guide_df, features=morph_features, meta_features=["Metadata_pert_name", "Metadata_cell_identity"], replicate_groups=replicate_group_grit, operation="grit", grit_control_perts=sc_neg_control_cells, ).assign(gene=gene, guide=guide) all_sc_grit_results.append( sc_grit_result.assign(grit_gene=gene, grit_guide=guide)) # Output results all_sc_umap_embeddings = pd.concat(all_sc_umap_embeddings).reset_index( drop=True) output_results_file = pathlib.Path( f"results/cellhealth_single_cell_umap_embeddings_{plate}_chr2.tsv.gz") all_sc_umap_embeddings.to_csv(output_results_file, sep="\t", compression="gzip",
def test_evaluate_mp_value():
    """Check ``evaluate(operation="mp_value")`` on both datasets.

    For the gene and the compound profiles the test asserts that every
    control perturbation has ``mp_value >= 0.05``, that a known strong
    perturbation has ``mp_value == 0``, and that the result has exactly the
    replicate-group column followed by ``mp_value``. Finally, an unknown key
    in ``mp_value_params`` must raise an ``AssertionError``.
    """
    # Permutations in mp_value could lead to some edge cases
    np.random.seed(2020)

    # --- Tests on genetic screen dataset -------------------------------------
    mp_value_gene_control_perts = [
        "Chr2-1",
        "Chr2-2",
        "Chr2-3",
        "Chr2-4",
        "Chr2-5",
        "Chr2-6",
        "Luc-1",
        "Luc-2",
        "LacZ-2",
        "LacZ-3",
    ]
    mp_value_gene_replicate_groups = "Metadata_pert_name"

    mp_value_results_df = evaluate(
        profiles=gene_profiles,
        features=gene_features,
        meta_features=gene_meta_features,
        replicate_groups=mp_value_gene_replicate_groups,
        operation="mp_value",
        grit_control_perts=mp_value_gene_control_perts,
    )

    # Negative controls should be "close to themselves"
    is_control = mp_value_results_df.Metadata_pert_name.isin(
        mp_value_gene_control_perts
    )
    control_mp_values = mp_value_results_df.loc[is_control, "mp_value"]
    assert (control_mp_values >= 0.05).all()

    # Strong perturbation should differ from controls
    zero_mp_perts = mp_value_results_df.loc[
        mp_value_results_df.mp_value == 0, "Metadata_pert_name"
    ].tolist()
    assert "PTK2-2" in zero_mp_perts

    assert all(
        mp_value_results_df.columns == [mp_value_gene_replicate_groups, "mp_value"]
    )

    # --- Tests on chemical screen dataset ------------------------------------
    mp_value_compound_control_perts = ["DMSO"]
    mp_value_compound_replicate_groups = "Metadata_broad_sample"

    mp_value_results_df = evaluate(
        profiles=compound_profiles,
        features=compound_features,
        meta_features=compound_meta_features,
        replicate_groups=mp_value_compound_replicate_groups,
        operation="mp_value",
        grit_control_perts=mp_value_compound_control_perts,
    )

    # Negative controls should be "close to themselves"
    is_control = mp_value_results_df.Metadata_broad_sample.isin(
        mp_value_compound_control_perts
    )
    control_mp_values = mp_value_results_df.loc[is_control, "mp_value"]
    assert (control_mp_values >= 0.05).all()

    # Strong perturbation should differ from controls
    zero_mp_perts = mp_value_results_df.loc[
        mp_value_results_df.mp_value == 0, "Metadata_broad_sample"
    ].tolist()
    assert "BRD-A94756469-001-04-7" in zero_mp_perts

    assert all(
        mp_value_results_df.columns
        == [mp_value_compound_replicate_groups, "mp_value"]
    )

    # --- Validation: unknown mp_value parameter ------------------------------
    with pytest.raises(AssertionError) as ae:
        evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=mp_value_compound_replicate_groups,
            operation="mp_value",
            grit_control_perts=mp_value_compound_control_perts,
            mp_value_params={"something else": 1},
        )
    # Checked after the ``with`` block so the assertion actually runs.
    assert "Unknown parameters provided. Only" in str(ae.value)