def test_TCRsampler_build():
    """After build_background(), ref_dict is a dict whose values are DataFrames."""
    sampler = TCRsampler()
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background()

    assert isinstance(sampler.ref_dict, dict)
    # popitem() removes one (key, value) pair; only the value's type is checked.
    _, background_frame = sampler.ref_dict.popitem()
    assert isinstance(background_frame, pd.DataFrame)
def test_prob_sampler_sample_key_warn():
    """
    Sampling a V/J pair that is absent from the background warns and
    returns a ``[None]`` placeholder for that pair.
    """
    t = TCRsampler()
    fn = os.path.join('tcrsampler', 'tests', 'pmbc_mixcr_example_data.txt')
    t.clean_mixcr(filename=fn)
    t.build_background()
    # `pytest.warns(None)` was deprecated in pytest 7 and is rejected by
    # pytest 8. `Warning` is the base class, so any warning category
    # satisfies the check.
    # NOTE(review): this now requires that sample() actually emits a warning
    # for the unknown 'TRBV999*01' gene (as the test name implies) - confirm.
    with pytest.warns(Warning):
        r = t.sample([['TRBV999*01', 'TRBJ2-7*01', 2]])
    assert r == [[None]]
def test_prob_sampler_sample_background():
    """Pin the ten sequences drawn for TRBV9*01 / TRBJ2-7*01 from the example data."""
    expected = [
        'CASSRTGSLADEQYF',
        'CASSATGVVSAQYF',
        'CASSAWGQVYEQYF',
        'CASSVSGSPYEQYF',
        'CASSAWGQVYEQYF',
        'CASSAWGQVYEQYF',
        'CASRWGEQYF',
        'CASSGDDWEQYF',
        'CASSATGTSGPYEQYF',
        'CASSSRTSGSNSEQYF',
    ]

    sampler = TCRsampler()
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background()

    drawn = sampler.sample_background('TRBV9*01', 'TRBJ2-7*01', n=10)
    assert drawn == expected
def test_TCRsampler_build_vj_components():
    """The values of each gene-usage frequency dict sum to (approximately) 1."""
    sampler = TCRsampler()
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background()

    # Checked in the same order as before: count-weighted tables first,
    # then the occurrence-based ones.
    freq_attrs = (
        'vj_freq',
        'j_freq',
        'v_freq',
        'vj_occur_freq',
        'v_occur_freq',
        'j_occur_freq',
    )
    for attr in freq_attrs:
        total = np.sum(list(getattr(sampler, attr).values()))
        assert np.isclose(total, 1.0)
def test_ex12():
    """Smoke test: build a background from a CSV reference and sample one V/J pair."""
    import pandas as pd
    import os
    from tcrsampler.sampler import TCRsampler

    # The full reference is 'britanova_chord_blood.csv'; the 5000-row sample
    # below is the test-only file.
    reference_csv = os.path.join('tcrdist', 'test_files',
                                 'britanova_chord_blood_sample_5000.csv')

    sampler = TCRsampler()
    sampler.ref_df = pd.read_csv(reference_csv)
    sampler.build_background()

    # Touch each frequency table so a missing attribute fails the test.
    for attr in ('v_freq', 'j_freq', 'vj_freq'):
        getattr(sampler, attr)

    sampler.sample_background(
        v='TRBV10-1*01',
        j='TRBJ1-1*01',
        n=3,
        depth=1,
        seed=1,
        use_frequency=True,
    )
def test_prob_sampler_sample():
    """Pin sample() output for one pair, the flattened form, and two pairs."""
    sampler = TCRsampler()
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background()

    trbv9_request = ['TRBV9*01', 'TRBJ2-7*01', 2]
    trbv9_expected = ['CASSRTGSLADEQYF', 'CASSATGVVSAQYF']

    # Default: one inner list per requested (v, j, n) triple.
    nested = sampler.sample([trbv9_request])
    assert nested == [trbv9_expected]

    # flatten=True: a single flat list.
    flat = sampler.sample([trbv9_request], flatten=True)
    assert flat == trbv9_expected

    # Two triples: results come back in request order.
    both = sampler.sample([trbv9_request, ['TRBV7-7*01', 'TRBJ2-4*01', 4]])
    assert both == [
        trbv9_expected,
        [
            'CASSLGQAARGIQYF',
            'CASSLGQAARGIQYF',
            'CASSLGQAARGIQYF',
            'CASSLGQAARGIQYF',
        ],
    ]
mixcr exportClones -cloneId -count -fraction -vGene -jGene -vHit -jHit -vHits -jHits -aaFeature CDR3 -nFeature CDR3 SRR2079522.1.clns SRR2079522.1.clns.best.txt -f mixcr exportAlignments SRR2079522.1.vdjca SRR2079522.1.vdjca.txt -f ``` #### Files Available For Download Beta: [SRR2079522.1.clns.best.txt](https://www.dropbox.com/s/czcewp7x7auwdsu/SRR2079522.1.clns.best.txt?dl=1) Alpha: [SRR2079521.1.clns.best.txt](https://www.dropbox.com/s/k4i0mt0cwhcn1h7/SRR2079521.1.clns.best.txt?dl=1) """ from tcrsampler.sampler import TCRsampler fn = 'SRR2079522.1.clns.best.subject.txt' t = TCRsampler() t.clean_mixcr(fn) t.build_background() t.ref_df t.ref_df.to_csv('ruggiero_mouse_beta_t.tsv.sampler.tsv', sep="\t", index=False) fn = 'SRR2079521.1.clns.best.subject.txt' t = TCRsampler() t.clean_mixcr(fn) t.build_background() t.ref_df t.ref_df.to_csv('ruggiero_mouse_alpha_t.tsv.sampler.tsv', sep="\t", index=False)
def test_quick_pipeline_with_fragmented_compute():
    """
    How can I use tcrdist3 to test for TCRs that may be HLA restricted?

    Pipeline:
      1. Load a MIRA beta-chain clone table and compute a sparse pairwise
         distance matrix in row fragments (out of memory).
      2. Label each clone "B*07" / "NOTB*07 " from its two hla-b columns and
         tally neighbors (knn_radius = 25) against that label.
      3. Run an association test per neighborhood and keep the
         'mostly_unique' ones.
      4. For the first 25 neighborhoods, build palmotif logos (with and
         without a sampled OLGA background) and write them, plus a details
         table, to 'svgs_in_line.html'.
    """
    import os
    import pandas as pd
    import numpy as np
    from scipy import sparse
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory
    from hierdiff.association_testing import cluster_association_test
    import ast
    from tcrdist.summarize import test_for_almost_subsets, filter_is, filter_gt
    from tcrsampler.sampler import TCRsampler
    from palmotif import compute_pal_motif, svg_logo
    from tcrdist.summarize import _select
    from tcrdist.rep_diff import member_summ

    f = 'mira_epitope_67_382_APHGVVFL_APHGVVFLHV_GVVFLHVTY_VVFLHVTYV.tcrdist3.csv'
    f = os.path.join('tcrdist', 'data', 'covid19', f)
    assert os.path.isfile(f)
    df = pd.read_csv(f)
    df = df[['subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
             'cdr3_b_nucseq', 'cohort', 'hla-a', 'hla-a_1', 'hla-b',
             'hla-b_1']]
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False,
                store_all_cdr=False)

    S, fragments = compute_pw_sparse_out_of_memory(
        tr=tr,
        row_size=100,
        pm_processes=2,
        pm_pbar=True,
        max_distance=1000,
        matrix_name='rw_beta',
        reassemble=True,
        cleanup=False)

    # A clone counts as B*07 if either hla-b column starts with "B*07".
    tr.clone_df['B07'] = (tr.clone_df['hla-b'].str.startswith("B*07") |
                          tr.clone_df['hla-b_1'].str.startswith("B*07"))
    tr.clone_df['B07'] = ["B*07" if (x) else "NOTB*07 "
                          for x in tr.clone_df['B07']]
    # Optional: sparse.save_npz("S.npz", S)

    nn_tally_df_cohort = compute_n_tally_out_of_memory(
        fragments,
        matrix_name="rw_beta",
        pm_processes=6,
        to_file=False,
        to_memory=True,
        knn_radius=25,
        x_cols=['B07'])

    nn_associations = cluster_association_test(res=nn_tally_df_cohort,
                                               y_col='cmember',
                                               method='fishers')
    nn_associations = nn_associations.sort_values('pvalue', ascending=True)

    # 'neighbors' is parsed with literal_eval into a Python object per row.
    nn_associations['neighbors_i'] = \
        nn_associations.neighbors.apply(ast.literal_eval)
    nn_associations['mostly_unique'] = test_for_almost_subsets(
        nn_associations['neighbors_i'], thr=5)
    nr_nn_associations = filter_is(nn_associations, 'mostly_unique', 1).copy()
    # Optional: nr_nn_associations = filter_gt(nr_nn_associations, 'K_neighbors', 25).copy()

    # MOTIF GENERATION
    t = TCRsampler()
    # Download the OLGA backgrounds only if they are not already present.
    if 'olga_human_beta_t.sampler.tsv' not in t.currently_available_backgrounds():
        t.download_background_file('olga_sampler.zip')
    tcrsampler_beta = TCRsampler(default_background='olga_human_beta_t.sampler.tsv')
    tcrsampler_beta.build_background(max_rows=1000)

    # SEE PALMOTIF DOCS (https://github.com/agartland/palmotif)
    # GENERATE SVG GRAPHIC FOR EACH NODE OF THE TREE
    cdr3_name = 'cdr3_b_aa'
    gene_names = ['v_b_gene', 'j_b_gene']
    svgs_beta = list()
    svgs_beta_raw = list()
    info_list = list()

    summary = member_summ(res_df=nr_nn_associations,
                          clone_df=tr.clone_df,
                          addl_cols=['cohort', 'hla-a', 'hla-a_1', 'hla-b',
                                     'hla-b_1', 'subject'])
    nr_nn_associations = pd.concat([nr_nn_associations, summary],
                                   axis=1).reset_index()

    for i, r in nr_nn_associations.head(25).iterrows():
        dfnode = tr.clone_df.iloc[r['neighbors_i'], :].copy()
        # <pwnode> Pairwise Matrix for node sequences
        pwnode = S[r['neighbors_i'], :][:, r['neighbors_i']].todense()
        if dfnode.shape[0] > 2:
            # Centroid: the member with the smallest summed distance.
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]
        else:
            centroid = dfnode[cdr3_name].to_list()[0]
        print(f"CENTROID: {centroid}")

        gene_usage_beta = dfnode.groupby(gene_names).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True,
            depth=max(100, 1000 // dfnode.shape[0]))
        # Drop the None placeholders returned by sample().
        sampled_rep = [x for x in sampled_rep if x is not None]

        node_seqs = _select(df=tr.clone_df,
                            iloc_rows=r['neighbors_i'],
                            col=cdr3_name)
        motif, stat = compute_pal_motif(seqs=node_seqs,
                                        refs=sampled_rep,
                                        centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))
        # The original rebound `sampled_rep` to the result of list.append(),
        # which is always None. The value was never read again, so the
        # statement is dropped rather than "fixed".

        motif_raw, _ = compute_pal_motif(
            seqs=_select(df=tr.clone_df,
                         iloc_rows=r['neighbors_i'],
                         col=cdr3_name),
            centroid=centroid)
        svgs_beta_raw.append(svg_logo(motif_raw, return_str=True))
        info_list.append(r)

    # Kept as a list: a tuple would be read by pandas as a single
    # (multi-level) key in r[vals].
    detail_columns = ['ct_columns', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1',
                      'val_0', 'ct_0', 'val_2', 'ct_2', 'K_neighbors',
                      'R_radius', 'pvalue', 'FDRq', 'cdr3_b_aa', 'v_b_gene',
                      'j_b_gene', 'cohort', 'subject']

    def to_html_table(r, vals=None):
        """Render the selected fields of one result row as a one-row HTML table."""
        vals = detail_columns if vals is None else vals
        return pd.DataFrame(r[vals]).transpose().to_html()

    def shrink(html_str):
        """Scale an SVG string down by rewriting 100% height/width to 10%."""
        return html_str.replace('height="100%"', 'height="10%"')\
                       .replace('width="100%"', 'width="10%"')

    with open('svgs_in_line.html', 'w') as fh:
        fh.write("<html><body>\n")
        for svg, svg_raw, details in zip(svgs_beta, svgs_beta_raw, info_list):
            fh.write(f"{shrink(svg_raw)}{shrink(svg)}")
            try:
                fh.write(to_html_table(details))
            except Exception as err:
                # Best effort: a row that cannot be tabulated (e.g. a missing
                # column) must not abort the report, but say why it failed.
                print(f"F: {err!r}")
            fh.write("<div></div>")
        # Close tags in the reverse of the order they were opened.
        fh.write("</body></html>\n")
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler

# Build a background (capped at 1000 rows) from the Britanova cord-blood
# reference, sample two requests at depth 10, then report the row count of
# each entry in ref_dict.
t = TCRsampler()
fn = 'britanova_chord_blood.csv'
t.ref_df = pd.read_csv(fn)
t.build_background(max_rows=1000)

vj_requests = [
    ['TRBV10-2*01', 'TRBV10-2*01*01', 1],
    ['TRBV27*01', 'TRBV27*01*01', 4],
]
t.sample(vj_requests, depth=10)

for gene_key, gene_frame in t.ref_dict.items():
    print(gene_key, gene_frame.shape[0])
def test_dash_ecdf():
    """
    Plot empirical distance distributions (ECDFs) for a set of target TCRs.

    For a target TCR and a reference set of TCRs, the ECDF is the proportion
    of reference TCRs that lie within a distance D of the target, evaluated
    over a range of distances. Plotted against increasing D it shows how
    dense TCR space is, in the reference set, in the neighborhood of the
    target.

    This helps to identify dense antigen-specific clusters in an
    antigen-enriched repertoire, where the "reference" is itself an
    experimentally enriched repertoire (e.g. pMHC:tetramer or AIM sorting).
    It also helps to choose a radius around a TCR that retains high antigen
    specificity, by showing that the neighborhood is extremely sparse in a
    large unsorted/bulk repertoire.
    """
    import pandas as pd
    import numpy as np
    from tcrdist.repertoire import TCRrep
    from tcrsampler.sampler import TCRsampler
    from tcrdist.ecdf import distance_ecdf, make_ecdf_step
    from tcrdist.background import (make_gene_usage_counter,
                                    make_vj_matched_background,
                                    make_flat_vj_background,
                                    get_gene_frequencies,
                                    calculate_adjustment)
    import matplotlib.pyplot as plt

    def _new_axes(figsize, ylabel, xlabel, **scales):
        """Open a new figure and return its single, labelled axes."""
        fig = plt.figure(figsize=figsize)
        axes = fig.add_axes([0.15, 0.15, 0.6, 0.7], **scales)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        return axes

    dash = pd.read_csv('dash.csv')
    pb1 = dash.loc[dash['epitope'] == 'PB1']
    target = TCRrep(cell_df=pb1,
                    organism='mouse',
                    chains=['beta'],
                    db_file='alphabeta_gammadelta_db.tsv')

    TCRsampler.download_background_file(download_file='wiraninha_sampler.zip')

    gene_cols = ['v_b_gene', 'j_b_gene']
    reference_frames = []
    background_files = [
        f'wirasinha_mouse_beta_s_{i}.tsv.sampler.tsv' for i in '48'
    ]
    for background_fn in background_files:
        sampler = TCRsampler(default_background=background_fn)
        sampler.build_background(stratify_by_subject=True,
                                 use_frequency=False)

        # Sanitize the alleles to *01 for TCRSampler
        sanitized = pb1[gene_cols].applymap(
            lambda s: s.partition('*')[0] + '*01')
        vj_counts = sanitized.groupby(gene_cols).size()
        vj_records = list(vj_counts.to_frame().to_records())
        sampled = sampler.sample(vj_records, depth=10, seed=110820)
        per_pair_frames = [
            pd.DataFrame({'cdr3_b_aa': sampled[i]}).assign(v_b_gene=v,
                                                           j_b_gene=j)
            for i, (v, j, _) in enumerate(vj_records)
        ]
        matched = pd.concat(per_pair_frames)

        # Assigns pV, pJ and pVJ to the sampled frame
        matched = get_gene_frequencies(ts=sampler, df=matched)

        pair_freq = vj_counts.reset_index()
        pair_freq.columns = ['v_b_gene', 'j_b_gene', 'n']
        # For each V,J pairing compute frequency in this reference
        pair_freq = pair_freq.assign(
            ref_freq=pair_freq['n'] / pair_freq['n'].sum())
        matched = matched.merge(pair_freq, how='left',
                                on=gene_cols).reset_index()

        # Assign weights to ref sequences: Pr_actual / Pr_sampling
        matched = matched.assign(
            weights=matched['pVJ'] / matched['ref_freq'])
        reference_frames.append(matched)

        # Add uniformly sampled sequences
        uniform = sampler.ref_df.sample(100, random_state=1)
        reference_frames.append(uniform)

    reference = pd.concat(reference_frames, axis=0)
    reference_rep = TCRrep(
        cell_df=reference[gene_cols + ['cdr3_b_aa', 'weights']],
        organism='mouse',
        chains=['beta'],
        compute_distances=False,
        store_all_cdr=False)

    target.compute_rect_distances(df=target.clone_df,
                                  df2=reference_rep.clone_df,
                                  store=False)

    thresholds = np.arange(1, 50)
    ref_weights = (reference_rep.clone_df['weights'] *
                   reference_rep.clone_df['count'])
    thresholds, ref_ecdf = distance_ecdf(target.rw_beta,
                                         thresholds=thresholds,
                                         weights=ref_weights)
    thresholds, target_ecdf = distance_ecdf(target.pw_beta,
                                            thresholds=thresholds,
                                            weights=None)

    # ECDF of each target clone against the reference; mean curve in red.
    axh = _new_axes((5, 5),
                    f'Proportion of reference TCRs',
                    f'Distance from target TCR clone',
                    yscale='log')
    for row in range(ref_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, ref_ecdf[row, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(ref_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)

    # ECDF of each target clone against the other targets; mean curve in red.
    axh = _new_axes((5, 5),
                    f'Proportion of target TCRs',
                    f'Distance from target TCR clone',
                    yscale='log')
    for row in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(thresholds, target_ecdf[row, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(thresholds, np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)

    # Make an "ROC" plot combining the ECDF against the target (sensitivity)
    # vs. ECDF against the reference (specificity)
    axh = _new_axes((7, 5),
                    f'Proportion of target TCRs',
                    f'Proportion of reference TCRs',
                    yscale='log',
                    xscale='log')
    for row in range(target_ecdf.shape[0]):
        x, y = make_ecdf_step(ref_ecdf[row, :], target_ecdf[row, :])
        axh.plot(x, y, color='k', alpha=0.2)
    x, y = make_ecdf_step(np.mean(ref_ecdf, axis=0),
                          np.mean(target_ecdf, axis=0))
    axh.plot(x, y, color='r', alpha=1)

    # Draw the diagonal, then restore the limits it may have stretched.
    y_limits = plt.ylim()
    x_limits = plt.xlim()
    # Alternative fixed range: y_limits = (1e-6, 0.3)
    plt.plot(y_limits, y_limits, '--', color='gray')
    plt.xlim(x_limits)
    plt.ylim(y_limits)
def test_TCRsampler_build_stratified():
    """Smoke test: a subject-stratified background can be built and sampled.

    The sampled result is not asserted on; the test only checks that the
    calls complete.
    """
    sampler = TCRsampler()
    mixcr_path = os.path.join('tcrsampler', 'tests',
                              'pmbc_mixcr_example_data.txt')
    sampler.clean_mixcr(filename=mixcr_path)
    sampler.build_background(stratify_by_subject=True)
    sampler.sample_background('TRBV9*01', 'TRBJ2-7*01', n=10)
def test_v_j_freq_estimates():
    """Occurrence frequencies for a five-clone, single-subject reference.

    Each V gene appears once (1/5 each); TRBJ2-1 appears once and TRBJ2-3
    and TRBJ2-5 twice each.
    """
    columns = {
        'Unnamed: 0': [0, 1, 2, 3, 4],
        'v_reps': [
            'TRBV24-1*01', 'TRBV5-1*01', 'TRBV7-2*01', 'TRBV3-1*01',
            'TRBV7-3*01'
        ],
        'j_reps': [
            'TRBJ2-1*01', 'TRBJ2-5*01', 'TRBJ2-3*01', 'TRBJ2-5*01',
            'TRBJ2-3*01'
        ],
        'cdr3': [
            'CATRQDNEQFF', 'CASSLEETQYF', 'CASSLADTQYF', 'CASSQETQYF',
            'CASSLAGGTDTQYF'
        ],
        'count': [252, 166, 113, 98, 89],
        'freq': [
            0.0003726818302818776, 0.0002454967612174273,
            0.00016711526516608003, 0.00014493182288739684,
            0.00013162175752018694
        ],
        'subject': ['A5-S11.txt'] * 5,
    }
    # Expand to the {column: {row_index: value}} layout with keys 0..4.
    records = {
        name: dict(enumerate(values)) for name, values in columns.items()
    }

    sampler = TCRsampler()
    sampler.ref_df = pd.DataFrame(records)
    sampler.build_background()

    expected_v = {
        'TRBV3-1*01': 0.2,
        'TRBV5-1*01': 0.2,
        'TRBV7-2*01': 0.2,
        'TRBV7-3*01': 0.2,
        'TRBV24-1*01': 0.2,
    }
    expected_j = {
        'TRBJ2-1*01': 0.2,
        'TRBJ2-3*01': 0.4,
        'TRBJ2-5*01': 0.4,
    }
    assert sampler.v_occur_freq == expected_v
    assert sampler.j_occur_freq == expected_j
dfd['freq'] = dfd['freq_x'] / dfd['freq_y']
print(dfd[['freq', 'subject']].groupby(['subject']).sum())

# Test that these will work with TCRsampler
from tcrsampler.sampler import TCRsampler
from tcrdist import repertoire_db

ref = repertoire_db.RefGeneSet(db_file='alphabeta_gammadelta_db.tsv')
ref.generate_all_genes()
known_human_genes = ref.all_genes['human'].keys()

tsd = TCRsampler()
tsd.ref_df = dfd
tsd.build_background()

# Find potential missing genes: every V and J gene in the background must be
# present in the human reference gene set.
for freq_attr in ('v_freq', 'j_freq'):
    background_genes = list(getattr(tsd, freq_attr).keys())
    print(background_genes)
    missing_genes = [
        gene for gene in background_genes if gene not in known_human_genes
    ]
    print(missing_genes)
    assert len(missing_genes) == 0

tsg = TCRsampler()
tsg.ref_df = dfg
tsg.build_background()
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler

# Build a subject-stratified background (capped at 100 rows) from the
# Emerson CMV-negative reference, sample two requests at depth 10, then
# report the row count of each entry in ref_dict.
t = TCRsampler()
fn = 'emerson_cmv_negative.csv'
t.ref_df = pd.read_csv(fn)
t.build_background(max_rows=100, stratify_by_subject=True)

vj_requests = [
    ['TRBV10-2*01', 'TRBV10-2*01*01', 1],
    ['TRBV27*01', 'TRBV27*01*01', 4],
]
t.sample(vj_requests, depth=10)

for gene_key, gene_frame in t.ref_dict.items():
    print(gene_key, gene_frame.shape[0])
from tcrdist.rep_diff import hcluster_diff tr.hcluster_df, tr.Z =\ hcluster_diff(clone_df = tr.clone_df, pwmat = tr.pw_alpha, x_cols = ['cohort'], count_col = 'count') """ SEE TCRSAMPLER (https://github.com/kmayerb/tcrsampler/blob/master/docs/tcrsampler.md) Here we used olga human alpha synthetic sequences for best coverage """ from tcrsampler.sampler import TCRsampler t = TCRsampler() #t.download_background_file('olga_sampler.zip') # ONLY IF NOT ALREADY DONE tcrsampler_alpha = TCRsampler(default_background = 'olga_human_alpha_t.sampler.tsv') tcrsampler_alpha.build_background(max_rows = 1000) """SEE PALMOTIF DOCS (https://github.com/agartland/palmotif)""" from palmotif import compute_pal_motif, svg_logo from tcrdist.summarize import _select """GENERATE SVG GRAPHIC FOR EACH NODE OF THE TREE""" pwmat_str = 'pw_alpha' cdr3_name = 'cdr3_a_aa' gene_names = ['v_a_gene','j_a_gene'] svgs_alpha = list() svgs_alpha_raw = list() for i,r in tr.hcluster_df.iterrows(): dfnode = tr.clone_df.iloc[r['neighbors_i'],].copy() # <pwnode> Pairwise Matrix for node sequences pwnode = getattr(tr, pwmat_str)[r['neighbors_i'],:][:,r['neighbors_i']].copy()