def test_evaluate_precision_recall():
    """Precision/recall hit counts at several k thresholds match reference
    values, for k passed as a scalar (gene data) and as a one-element list
    (compound data)."""
    top_k_values = [1, 5, 10, 50, 5000]
    expected_result = {
        "gene": {
            "precision": {"1": 1, "5": 0, "10": 0, "50": 0, "5000": 0},
            "recall": {"1": 0, "5": 3, "10": 4, "50": 14, "5000": 118},
        },
        "compound": {
            "precision": {"1": 18, "5": 9, "10": 5, "50": 2, "5000": 0},
            "recall": {"1": 0, "5": 0, "10": 0, "50": 1, "5000": 58},
        },
    }

    gene_groupby_columns = ["Metadata_pert_name"]
    compound_groupby_columns = ["Metadata_broad_sample"]

    for top_k in top_k_values:
        key = str(top_k)

        # Gene data: precision_recall_k given as a plain scalar.
        gene_result = evaluate(
            profiles=gene_profiles,
            features=gene_features,
            meta_features=gene_meta_features,
            replicate_groups=gene_groups,
            groupby_columns=gene_groupby_columns,
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=top_k,
        )

        n_perfect_precision = gene_result.query("precision == 1").shape[0]
        n_perfect_recall = gene_result.query("recall == 1").shape[0]
        assert n_perfect_precision == expected_result["gene"]["precision"][key]
        assert n_perfect_recall == expected_result["gene"]["recall"][key]

        # Compound data: precision_recall_k given as a one-element list.
        compound_result = evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=["Metadata_broad_sample"],
            groupby_columns=compound_groupby_columns,
            operation="precision_recall",
            similarity_metric="pearson",
            precision_recall_k=[top_k],
        )

        assert (
            compound_result.query("precision == 1").shape[0]
            == expected_result["compound"]["precision"][key]
        )
        assert (
            compound_result.query("recall == 1").shape[0]
            == expected_result["compound"]["recall"][key]
        )
def test_evaluate_replicate_reproducibility():
    """Replicate reproducibility scores match reference values for every
    available similarity metric and quantile, on both datasets."""
    expected_result = {
        "gene": {
            "pearson": {"0.5": 0.431, "0.95": 0.056},
            "kendall": {"0.5": 0.429, "0.95": 0.054},
            "spearman": {"0.5": 0.429, "0.95": 0.055},
        },
        "compound": {
            "pearson": {"0.5": 0.681, "0.95": 0.458},
            "kendall": {"0.5": 0.679, "0.95": 0.463},
            "spearman": {"0.5": 0.679, "0.95": 0.466},
        },
    }

    # (profiles, features, meta_features, replicate_groups) per dataset.
    dataset_inputs = {
        "gene": (gene_profiles, gene_features, gene_meta_features, gene_groups),
        "compound": (
            compound_profiles,
            compound_features,
            compound_meta_features,
            compound_groups,
        ),
    }

    for metric in get_available_similarity_metrics():
        for quantile in (0.5, 0.95):
            for name, (profiles, feats, meta, groups) in dataset_inputs.items():
                score = evaluate(
                    profiles=profiles,
                    features=feats,
                    meta_features=meta,
                    replicate_groups=groups,
                    operation="replicate_reproducibility",
                    replicate_reproducibility_return_median_cor=False,
                    similarity_metric=metric,
                    replicate_reproducibility_quantile=quantile,
                )
                expected = expected_result[name][metric][str(quantile)]
                assert np.round(score, 3) == expected
# Exemplo n.º 3
# 0
def test_evaluate_replicate_reprod_return_cor_true():
    """With return_median_cor=True, evaluate yields both the reproducibility
    score and a per-perturbation median-correlation frame."""
    reprod, med_cor_df = evaluate(
        profiles=gene_profiles,
        features=gene_features,
        meta_features=gene_meta_features,
        replicate_groups=gene_groups,
        operation="replicate_reproducibility",
        replicate_reproducibility_return_median_cor=True,
        similarity_metric="pearson",
        replicate_reproducibility_quantile=0.95,
    )

    # Rank perturbations from strongest to weakest median correlation.
    med_cor_df = med_cor_df.sort_values(
        by="similarity_metric", ascending=False
    )

    assert np.round(reprod, 3) == 0.056

    # The five best-correlated genes, in descending order.
    top_genes = med_cor_df.Metadata_gene_name.iloc[0:5].tolist()
    assert top_genes == ["CDK2", "CCNE1", "ATF4", "KIF11", "CCND1"]

    assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949

    expected_columns = [
        "Metadata_gene_name",
        "Metadata_pert_name",
        "similarity_metric",
    ]
    assert sorted(med_cor_df.columns.tolist()) == sorted(expected_columns)
def test_compare_functions():
    """evaluate(operation="enrichment") must agree exactly with calling
    enrichment() directly on the pre-computed similarity frame."""
    percentiles = [0.95, 0.9]

    via_evaluate = evaluate(
        profiles=df,
        features=features,
        meta_features=meta_features,
        replicate_groups=replicate_groups,
        operation="enrichment",
        similarity_metric="pearson",
        enrichment_percentile=percentiles,
    )

    via_direct_call = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=percentiles,
    )

    assert via_direct_call.equals(via_evaluate)
def test_evaluate_hitk():
    """Hit@k scores on the compound dataset match reference values.

    percent_list="all" requests a score at every rank; the first score
    should be ~150.75 and the final score should be ~0.
    """
    hitk_replicate_groups = ["Metadata_moa"]
    hitk_percent_list = "all"
    groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]

    hitk_hits_list, percent_scores = evaluate(
        profiles=compound_profiles,
        features=compound_features,
        meta_features=compound_meta_features,
        replicate_groups=hitk_replicate_groups,
        operation="hitk",
        groupby_columns=groupby_columns,
        hitk_percent_list=hitk_percent_list,
    )
    assert isclose(percent_scores[0], 150.75, abs_tol=1e-1)

    # Idiomatic negative indexing (was percent_scores[len(percent_scores) - 1]).
    assert isclose(percent_scores[-1], 0, abs_tol=1e-1)
            similarity_metric="pearson",
            eval_metric="replicate_reproducibility",
        )

        audit_df = assign_replicates(similarity_melted_df=audit_df,
                                     replicate_groups=audit_cols)
        # What is 95% of the non replicate null distribution
        cutoff = audit_df.query(
            "not group_replicate").similarity_metric.quantile(0.95)

        # Calculate a single number for percent strong
        percent_strong = evaluate(
            profiles=df,
            features=features,
            meta_features=meta_features,
            replicate_groups=audit_cols,
            operation="replicate_reproducibility",
            similarity_metric="pearson",
            replicate_reproducibility_quantile=0.95,
        )

        grid_string = "~{}".format("+".join(
            [f"{x}_pair_a" for x in audit_cols]))

        # Visualize the audit - output two plots for each plate
        output_base = os.path.join(
            figure_output_dir,
            "{}_{}_replicate_correlation".format(batch, plate))
        plot_replicate_correlation(
            df=audit_df,
            batch=batch,
# Exemplo n.º 7
# 0
                use_features = subset_df.drop(meta_features,
                                              axis="columns").columns.tolist()
            else:
                subset_df = df.loc[:, meta_features + compartment_features]
                dropped_or_exclusive = "exclusive"
                use_features = compartment_features

            result = evaluate(
                profiles=subset_df.query("Metadata_cell_line == @cell_line"),
                features=use_features,
                meta_features=[barcode_col, gene_col],
                replicate_groups=replicate_group_grit,
                operation="grit",
                similarity_metric="pearson",
                grit_control_perts=control_barcodes,
            ).assign(
                cell_line=cell_line,
                barcode_control="cutting_control",
                cor_method="pearson",
                compartment=compartment,
                channel="all",
                feature_group="all",
                num_features=len(compartment_features),
                dropped_or_exclusive=dropped_or_exclusive,
            )

            grit_compartment_results.append(result)

# Combine the per-compartment grit result frames accumulated in the loop
# above into one frame with a clean sequential index.
grit_compartment_results = pd.concat(grit_compartment_results).reset_index(
    drop=True)

# Sanity check: report the combined (rows, columns) shape.
print(grit_compartment_results.shape)
def test_evaluate_grit():
    """Grit on both datasets reproduces reference top scores, and a
    non-dict replicate_groups argument raises an informative error."""

    def _top_row(grit_df):
        # Row with the highest grit score.
        ranked = grit_df.sort_values(by="grit", ascending=False)
        return ranked.reset_index(drop=True).iloc[0]

    # --- Genetic screen ---
    gene_control_perts = [
        "Chr2-1",
        "Chr2-2",
        "Chr2-3",
        "Chr2-4",
        "Chr2-5",
        "Chr2-6",
        "Luc-1",
        "Luc-2",
        "LacZ-2",
        "LacZ-3",
    ]
    gene_replicate_groups = {
        "replicate_id": "Metadata_pert_name",
        "group_id": "Metadata_gene_name",
    }

    gene_top = _top_row(
        evaluate(
            profiles=gene_profiles,
            features=gene_features,
            meta_features=gene_meta_features,
            replicate_groups=gene_replicate_groups,
            operation="grit",
            grit_control_perts=gene_control_perts,
        )
    )
    assert np.round(gene_top.grit, 4) == 2.2597
    assert gene_top.group == "PTK2"
    assert gene_top.perturbation == "PTK2-2"

    # --- Chemical screen ---
    compound_replicate_groups = {
        "replicate_id": "Metadata_broad_sample",
        "group_id": "Metadata_moa",
    }
    compound_control_perts = ["DMSO"]

    compound_top = _top_row(
        evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=compound_replicate_groups,
            operation="grit",
            grit_control_perts=compound_control_perts,
        )
    )
    assert np.round(compound_top.grit, 4) == 0.9990
    assert compound_top.group == "ATPase inhibitor"
    assert compound_top.perturbation == "BRD-A94756469-001-04-7"

    # Grit requires replicate_groups to be a dict; a list must fail loudly.
    with pytest.raises(AssertionError) as ae:
        evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=compound_groups,
            operation="grit",
            grit_control_perts=compound_control_perts,
        )
    assert "For grit, replicate_groups must be a dict" in str(ae.value)
def test_evaluate_precision_recall():
    """Precision/recall hit counts at several k thresholds match reference
    values for both the gene and compound datasets."""
    expected = {
        "gene": {
            "precision": {1: 1, 5: 0, 10: 0, 50: 0, 5000: 0},
            "recall": {1: 0, 5: 3, 10: 4, 50: 14, 5000: 118},
        },
        "compound": {
            "precision": {1: 18, 5: 9, 10: 5, 50: 2, 5000: 0},
            "recall": {1: 0, 5: 0, 10: 0, 50: 1, 5000: 58},
        },
    }

    # Per-dataset keyword arguments shared by every evaluate() call.
    dataset_kwargs = {
        "gene": dict(
            profiles=gene_profiles,
            features=gene_features,
            meta_features=gene_meta_features,
            replicate_groups=gene_groups,
        ),
        "compound": dict(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups=["Metadata_broad_sample"],
        ),
    }

    for k in (1, 5, 10, 50, 5000):
        for name, kwargs in dataset_kwargs.items():
            result = evaluate(
                operation="precision_recall",
                similarity_metric="pearson",
                precision_recall_k=k,
                **kwargs,
            )

            n_perfect_precision = result.query("precision == 1").shape[0]
            n_perfect_recall = result.query("recall == 1").shape[0]
            assert n_perfect_precision == expected[name]["precision"][k]
            assert n_perfect_recall == expected[name]["recall"][k]
# Exemplo n.º 10
# 0
# In[17]:

# Inspect the metadata feature list (defined in an earlier cell).
meta_features

# In[18]:

# Replicate counts per treatment profile id.
data_df.Metadata_treatment_profile_id.value_counts()

# In[19]:

# Get replicate correlation
# NOTE(review): returns (percent strong, per-group median correlation frame)
# because replicate_reproducibility_return_median_cor=True.
percent_strong, corr_df = evaluate(
    profiles=data_df,
    features=features,
    meta_features=meta_features,
    replicate_groups=["Metadata_clone_number", "Metadata_treatment"],
    operation="replicate_reproducibility",
    replicate_reproducibility_return_median_cor=True)

# In[20]:

percent_strong

# In[21]:

corr_df.head()

# In[22]:

# Get technical grit for batch
barcode_col = "Metadata_guide_identity"
gene_col = "Metadata_gene_identity"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

# Guides whose identity string marks them as negative controls.
neg_controls = [
    x for x in bulk_subset_df.Metadata_guide_identity if "neg_ctrl" in x
]
neg_controls

# In[9]:

# Grit per guide, using the negative-control guides as the null.
result = evaluate(
    profiles=bulk_df,
    features=genes_to_retain,
    meta_features=[barcode_col, gene_col],
    replicate_groups=replicate_group_grit,
    operation="grit",
    grit_control_perts=neg_controls,
)

# Drop guides without a grit score and rank by descending grit.
result = result.dropna().sort_values(by="grit",
                                     ascending=False).reset_index(drop=True)

print(result.shape)
result.head(3)

# In[10]:

# Merge with activity results and output file
output_results_file = pathlib.Path(f"results/{gse_id}_grit.tsv")
        # Now calculate sc-Grit per guide
        for guide in guides:
            if guide in control_group_guides_cut:
                continue

            subset_guide_df = pd.concat([
                subset_sc_df.query("Metadata_pert_name == @guide"),
                neg_controls_df
            ]).reset_index(drop=True)

            # Calculate Grit
            sc_grit_result = evaluate(
                profiles=subset_guide_df,
                features=morph_features,
                meta_features=["Metadata_pert_name", "Metadata_cell_identity"],
                replicate_groups=replicate_group_grit,
                operation="grit",
                grit_control_perts=sc_neg_control_cells,
            ).assign(gene=gene, guide=guide)

            all_sc_grit_results.append(
                sc_grit_result.assign(grit_gene=gene, grit_guide=guide))

    # Output results
    all_sc_umap_embeddings = pd.concat(all_sc_umap_embeddings).reset_index(
        drop=True)
    output_results_file = pathlib.Path(
        f"results/cellhealth_single_cell_umap_embeddings_{plate}_chr2.tsv.gz")
    all_sc_umap_embeddings.to_csv(output_results_file,
                                  sep="\t",
                                  compression="gzip",
def test_evaluate_mp_value():
    """mp_value behaves sensibly on both datasets and rejects unknown
    mp_value_params keys."""
    # mp_value uses permutations; pin the RNG to avoid edge-case flakiness.
    np.random.seed(2020)

    def _check(results_df, replicate_col, control_perts, strong_pert):
        # Negative controls should be "close to themselves".
        control_mask = results_df[replicate_col].isin(control_perts)
        assert (results_df.loc[control_mask, "mp_value"] >= 0.05).all()

        # A known strong perturbation should fully separate from controls.
        zero_perts = results_df.loc[results_df.mp_value == 0, replicate_col]
        assert strong_pert in zero_perts.tolist()

        # Output schema: the replicate column plus the score column.
        assert results_df.columns.tolist() == [replicate_col, "mp_value"]

    # --- Genetic screen ---
    gene_control_perts = [
        "Chr2-1",
        "Chr2-2",
        "Chr2-3",
        "Chr2-4",
        "Chr2-5",
        "Chr2-6",
        "Luc-1",
        "Luc-2",
        "LacZ-2",
        "LacZ-3",
    ]

    gene_results = evaluate(
        profiles=gene_profiles,
        features=gene_features,
        meta_features=gene_meta_features,
        replicate_groups="Metadata_pert_name",
        operation="mp_value",
        grit_control_perts=gene_control_perts,
    )
    _check(gene_results, "Metadata_pert_name", gene_control_perts, "PTK2-2")

    # --- Chemical screen ---
    compound_control_perts = ["DMSO"]

    compound_results = evaluate(
        profiles=compound_profiles,
        features=compound_features,
        meta_features=compound_meta_features,
        replicate_groups="Metadata_broad_sample",
        operation="mp_value",
        grit_control_perts=compound_control_perts,
    )
    _check(
        compound_results,
        "Metadata_broad_sample",
        compound_control_perts,
        "BRD-A94756469-001-04-7",
    )

    # Unknown keys in mp_value_params must be rejected.
    with pytest.raises(AssertionError) as ae:
        evaluate(
            profiles=compound_profiles,
            features=compound_features,
            meta_features=compound_meta_features,
            replicate_groups="Metadata_broad_sample",
            operation="mp_value",
            grit_control_perts=compound_control_perts,
            mp_value_params={"something else": 1},
        )
    assert "Unknown parameters provided. Only" in str(ae.value)