Example No. 1
def test_TCRpublic_with_neighborhood_dif():
    """
    Use values from neighborhood_diff
    """
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.public import TCRpublic
    fn = os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_55_524_ALRKVPTDNYITTY_KVPTDNYITTY.tcrdist3.radius.csv')
    df = pd.read_csv(fn)
    tr = TCRrep(cell_df=df[[
        'cohort', 'subject', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'radius'
    ]],
                organism="human",
                chains=["beta"])

    from tcrdist.rep_diff import neighborhood_diff
    ndif = neighborhood_diff(clone_df=tr.clone_df,
                             pwmat=tr.pw_beta,
                             count_col='count',
                             x_cols=['cohort'],
                             knn_radius=25,
                             test_method="chi2")
    # Add neighbors and other columns of interest
    # from the neighborhood_diff result to the clone_df
    tr.clone_df = pd.concat([
        tr.clone_df,
        ndif[['neighbors', 'K_neighbors', 'val_0', 'ct_0', 'pvalue']]
    ],
                            axis=1)
    # Because 'neighbors' and 'K_neighbors' are already added to the clone_df,
    # TCRpublic.report() uses those instead of finding new ones.
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones_with_ndif.html")
    # Add any neighborhood_diff columns that you want
    # to display in the final report
    tp.labels.append('val_0')
    tp.labels.append('ct_0')
    tp.labels.append('pvalue')
    # change the sort column to pvalue instead of publicity
    tp.sort_columns = ['pvalue']
    # because you are sorting by pvalue, sort in ascending order
    tp.sort_ascending = True
    tp.report()
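
A quick follow-up sketch (assuming the same `tr` built above): because the neighborhood_diff columns were concatenated onto `clone_df`, enriched neighborhoods can also be pulled out directly, without the HTML report. The 0.001 cutoff here is illustrative, not from the original.

# Hedged sketch: list neighborhoods enriched at p < 0.001 using the
# 'pvalue' and 'K_neighbors' columns added to tr.clone_df above.
significant = tr.clone_df[tr.clone_df['pvalue'] < 0.001]
print(significant[['cdr3_b_aa', 'K_neighbors', 'ct_0', 'pvalue']]
      .sort_values('pvalue')
      .head(10))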
Example No. 2
def find_centers_beta(background_filename,
                      target_filename,
                      ncpus,
                      min_nsubject,
                      ctrl_bkgd=10**-5,
                      prefilter=False):
    import os
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrdist.neighbors import compute_ecdf, bkgd_cntl_nn2
    from tcrdist.automate import auto_pgen
    from tcrdist.rep_diff import neighborhood_diff
    from tcrdist.summarize import _summ, _dist_summ, _select, filter_gt, filter_is, test_for_subsets, test_for_almost_subsets
    import scipy.sparse

    df_background = pd.read_csv(background_filename)
    print(df_background)
    tr_background = TCRrep(cell_df=df_background.copy(),
                           organism="human",
                           chains=['beta'],
                           compute_distances=False)

    df_mira = pd.read_csv(target_filename)
    df_mira = df_mira[[
        'subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa'
    ]]
    print(df_mira)
    tr = TCRrep(cell_df=df_mira.copy(),
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                store_all_cdr=False,
                compute_distances=True)

    if prefilter:
        # We can greatly cut down on the number of searches
        # if we drop centroids without minimum publicity
        nn_df = neighborhood_diff(clone_df=tr.clone_df,
                                  pwmat=tr.pw_beta,
                                  count_col='count',
                                  x_cols=['cell_type'],
                                  knn_radius=37)

        def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
            # Tabulate the number of unique subjects at each node
            neighbor_df['nsubject'] = neighbor_df[col_nn].apply(lambda x: len(
                set(_select(clone_df, iloc_rows=x, col='subject'))))
            return neighbor_df

        print(f"TABULATING PUBLIC CLUSTERS")
        nn_df = tabulate_publicity(nn_df, tr.clone_df)
        nn_df = filter_gt(nn_df, 'nsubject', min_nsubject)

        if nn_df.shape[0] == 0:
            centers_df = pd.DataFrame(
                {},
                columns=[
                    'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'max_radi',
                    'target_hits', 'bkgd_hits', 'bkgd_hits_weighted',
                    'bkgd_total', 'ctrl', 'ctrl_weighted', 'target_misses',
                    'TR', 'TR2', 'BR_weighted', 'RR_weighted', 'OR_weighted',
                    'chi2dist', 'target_neighbors', 'target_seqs',
                    'background_neighbors', 'background_seqs', 'background_v',
                    'background_j', 'regex', 'target_re_hits', 'bkgd_re_hits',
                    'bkgd_re_weighted_hits', 'TR_re', 'BR_re_weighted',
                    'RR_re_weighted', 'OR_re_weighted', 'chi2re', 'chi2joint',
                    'nsubject'
                ])
            tr.pw_beta[tr.pw_beta == 0] = 1  # encode true zero distances as 1
            tr.pw_beta[tr.pw_beta > 50] = 0  # drop distances > 50 (become sparse zeros)
            pw_beta_sparse = scipy.sparse.csr_matrix(tr.pw_beta)
            return centers_df, pw_beta_sparse

        tr.clone_df = tr.clone_df.loc[nn_df.index, :].reset_index(drop=True)
        del nn_df
        # Compute pairwise again with filtered set
        tr.compute_distances()
        # compute pgens automatically, currently parmap will max out cpus on this step

    print("COMPUTING PROBABILITY OF GENERATION")
    auto_pgen(tr)
    print(
        f"COMPUTING RECT DIST {tr.clone_df.shape[0]}x{tr_background.clone_df.shape[0]}"
    )
    tr.compute_rect_distances(df=tr.clone_df,
                              df2=tr_background.clone_df,
                              store=False)

    assert tr.rw_beta.shape[0] == tr.clone_df.shape[0]

    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_background,
        ctrl_bkgd=ctrl_bkgd,  # e.g., 2*10**-5
        weights=tr_background.clone_df.weights,
        col='cdr3_b_aa',
        ncpus=ncpus,
        thresholds=list(range(0, 38, 2)),  # radii 0..36 in steps of 2
        generate_regex=True,
        test_regex=True)

    # Redefined here so the helper also exists when prefilter=False
    def tabulate_publicity(neighbor_df, clone_df, col_nn='neighbors'):
        # Tabulate the number of unique subjects at each node
        neighbor_df['nsubject'] = neighbor_df[col_nn].apply(
            lambda x: len(set(_select(clone_df, iloc_rows=x, col='subject'))))
        return neighbor_df

    centers_df = tabulate_publicity(neighbor_df=centers_df,
                                    clone_df=tr.clone_df,
                                    col_nn='target_neighbors')

    tr.rw_beta[tr.rw_beta == 0] = 1  # encode true zero distances as 1
    tr.rw_beta[tr.rw_beta > 50] = 0  # drop distances > 50 (become sparse zeros)
    rw_beta_sparse = scipy.sparse.csr_matrix(tr.rw_beta)
    # scipy.sparse.save_npz(output_matrix_filename, rw_beta_sparse)
    return centers_df, rw_beta_sparse
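
A minimal usage sketch for the function above; the file paths and argument values are placeholders, not from the original. It also shows how the sparse distance matrix round-trips through scipy.sparse.

# Hedged usage sketch: placeholder paths/values, not from the original.
import scipy.sparse

centers_df, rw_sparse = find_centers_beta(
    background_filename='background.csv',  # placeholder path
    target_filename='target.csv',          # placeholder path
    ncpus=2,
    min_nsubject=2)
# Persist and reload the rectangular distance matrix. Remember the encoding
# used above: a stored 1 means a true distance of 0, and distances > 50
# were zeroed out (i.e., treated as missing).
scipy.sparse.save_npz('rw_beta.npz', rw_sparse)
rw_sparse = scipy.sparse.load_npz('rw_beta.npz')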
Example No. 3
import os
import numpy as np
import pandas as pd
import scipy.spatial.distance
import scipy.cluster.hierarchy as sch
from tcrdist.repertoire import TCRrep
from tcrdist.automate import auto_pgen
from tcrdist.neighbors import compute_ecdf
# NOTE: _pwrect, a rectangular pairwise-distance helper, is assumed to be
# defined elsewhere in the original module.


def run_one(ref_fn, rep_fn, ss=-1, ncpus=1):
    ref_df = pd.read_csv(ref_fn)
    ref_df.columns = [{
        'v_b_name': 'v_b_gene',
        'j_b_name': 'j_b_gene',
        'cdr3_b_aa': 'cdr3_b_aa'
    }.get(c, c) for c in ref_df.columns]
    ref_df.loc[:, 'count'] = 1
    if ss == -1:
        ref_tr = TCRrep(cell_df=ref_df,
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)
    else:
        ref_tr = TCRrep(cell_df=ref_df.sample(n=ss, replace=False),
                        organism='human',
                        chains=['beta'],
                        compute_distances=False,
                        store_all_cdr=False)

    rep_df = pd.read_csv(rep_fn).assign(count=1)
    tr = TCRrep(cell_df=rep_df[[
        'v_b_gene', 'j_b_gene', 'cdr3_b_aa', 'epitope', 'experiment',
        'subject', 'count'
    ]],
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

    if tr.clone_df.shape[0] > 6000:
        """Limit size of MIRA set to 6000"""
        tr.clone_df = tr.clone_df.sample(n=6000,
                                         replace=False,
                                         random_state=110820)

    auto_pgen(tr)

    out = []
    print(rep_fn)
    for metric in ['tcrdist', 'tcrdist-cdr3', 'edit']:
        if 'tcr' in metric:
            metric_thresholds = np.arange(76)
            fcluster_thresholds = [0, 25, 50]
        else:
            metric_thresholds = np.arange(9)
            fcluster_thresholds = [0, 1, 2]
        """Enforce no clustering analysis"""
        fcluster_thresholds = [0]

        epitope_name = os.path.split(rep_fn)[1].split('.')[0]
        epitope_name = epitope_name.replace('mira_epitope_', 'M')

        # rep_fn = opj(_fg_data, 'ncov_tcrs/adaptive_bio_r2/tcrs_by_mira_epitope/pw_computed', rep_fn)
        print(f'\t{metric}')
        """with open(rep_fn, 'rb') as fh:
            tr = dill.load(fh)"""
        """Compute repertoire PW distances and create flat clusters"""
        rep_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed MIRA set pwrect.')

        ref_pwmat = _pwrect(tr,
                            clone_df1=tr.clone_df,
                            clone_df2=ref_tr.clone_df,
                            metric=metric,
                            ncpus=ncpus)
        print('Computed reference pwrect.')

        for fclust_thresh in fcluster_thresholds:
            if fclust_thresh > 0:
                rep_pwvec = scipy.spatial.distance.squareform(rep_pwmat,
                                                              force='tovector')
                Z = sch.linkage(rep_pwvec, method='complete')
                labels = sch.fcluster(Z, t=fclust_thresh, criterion='distance')
            else:
                labels = np.arange(1, rep_pwmat.shape[0] + 1)
            """Compute ECDF for each cluster within the repertoire"""
            # rep_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                rep_ecdf = compute_ecdf(np.mean(
                    rep_pwmat[lab_ind, :][:, ~lab_ind], axis=0),
                                        thresholds=metric_thresholds)
                tmp_df = pd.DataFrame({
                    'ecdf': rep_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='rep',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)
            """Compute distances to the reference for each cluster and compute ECDF vs reference"""
            # ref_ecdf = np.zeros((int(np.max(labels)), len(metric_thresholds)))
            for lab in range(1, np.max(labels) + 1):
                lab_ind = labels == lab
                ref_ecdf = compute_ecdf(np.mean(ref_pwmat[lab_ind, :], axis=0),
                                        thresholds=metric_thresholds,
                                        weights=ref_tr.clone_df['weights'])
                tmp_df = pd.DataFrame({
                    'ecdf': ref_ecdf,
                    'thresholds': metric_thresholds
                })
                tmp_df = tmp_df.assign(
                    metric=metric,
                    fclust_thresh=fclust_thresh,
                    label=lab,
                    name=epitope_name,
                    versus='ref',
                    pgen=np.median(
                        tr.clone_df['pgen_cdr3_b_aa'].values[lab_ind]),
                    K=lab_ind.sum())
                out.append(tmp_df)
    out = pd.concat(out, axis=0)
    return out
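
A hedged plotting sketch for the tidy frame run_one returns (matplotlib assumed; the file paths are placeholders, not from the original):

# Hedged sketch: plot per-clone ECDF curves against the reference set.
import matplotlib.pyplot as plt

res = run_one('ref_background.csv', 'mira_epitope_55.csv', ncpus=2)  # placeholder paths
for metric, grp in res[res['versus'] == 'ref'].groupby('metric'):
    fig, ax = plt.subplots()
    for lab, g in grp.groupby('label'):
        ax.plot(g['thresholds'], g['ecdf'], color='k', alpha=0.2)
    ax.set_xlabel(f'{metric} distance threshold')
    ax.set_ylabel('ECDF vs. reference')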
Example No. 4
import numpy as np
import pandas as pd
import matplotlib as mpl
from tcrdist.repertoire import TCRrep
from tcrdist.pgen import OlgaModel

# dash_fn is assumed to point to a CSV of paired alpha/beta clones
df = pd.read_csv(dash_fn)
tr = TCRrep(cell_df=df,
            organism='human',
            chains=['alpha', 'beta'],
            compute_distances=False)
"""Compute pgen of each epitope-specific sequence"""
olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
olga_alpha = OlgaModel(chain_folder="human_T_alpha", recomb_type="VJ")

tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_b_aa)
tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
    tr.clone_df.cdr3_a_aa)
"""Force pgen > 0: there were 7 CDR3 alphas with pgen = 0"""
tr.clone_df = tr.clone_df.loc[(tr.clone_df['pgen_cdr3_a_aa'] > 0)
                              & (tr.clone_df['pgen_cdr3_b_aa'] > 0)]

norm_pgen = mpl.colors.LogNorm(vmin=1e-10, vmax=1e-6)
norm_a = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_a_aa'].min(),
                            vmax=tr.clone_df['pgen_cdr3_a_aa'].max())

norm_b = mpl.colors.LogNorm(vmin=tr.clone_df['pgen_cdr3_b_aa'].min(),
                            vmax=tr.clone_df['pgen_cdr3_b_aa'].max())


def color_lu(norm, colors, pgen):
    """Look up a discrete color for a pgen value via the given norm,
    clamping the index into the valid range."""
    i = int(np.floor(norm(pgen) * len(colors)))
    if i >= len(colors):
        i = len(colors) - 1
    if i < 0:
        i = 0
    return colors[i]
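
A short usage sketch for color_lu; the viridis palette and its size are assumptions, not from the original:

# Hedged sketch: discretize a colormap into a palette, then look up the
# color for one clone's beta-chain pgen.
colors = [mpl.cm.viridis(v) for v in np.linspace(0, 1, 12)]
c = color_lu(norm_pgen, colors, tr.clone_df['pgen_cdr3_b_aa'].iloc[0])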