def test_ex12():
    """Smoke-test background construction and sampling on the bundled sample file.

    Loads the 5000-row cord-blood sample CSV into a ``TCRsampler``, builds the
    background, touches the three frequency attributes, and draws a
    frequency-weighted background sample for one V/J gene pair. The test makes
    no assertions; it passes as long as none of these steps raises.
    """
    import pandas as pd
    import os
    from tcrsampler.sampler import TCRsampler

    # The real (full-size) reference file is 'britanova_chord_blood.csv';
    # the path below points at the reduced copy used for testing only.
    sample_path = os.path.join(
        'tcrdist',
        'test_files',
        'britanova_chord_blood_sample_5000.csv',
    )

    sampler = TCRsampler()
    sampler.ref_df = pd.read_csv(sample_path)
    sampler.build_background()

    # Read each frequency attribute once, in the original order, so a missing
    # attribute surfaces as an error here.
    _ = (sampler.v_freq, sampler.j_freq, sampler.vj_freq)

    sampler.sample_background(
        v='TRBV10-1*01',
        j='TRBJ1-1*01',
        n=3,
        depth=1,
        seed=1,
        use_frequency=True,
    )
import os
import pandas as pd
from tcrsampler.sampler import TCRsampler

# Example: build a background from the reference CSV, sample for two
# gene-usage entries, then report the size of each background table.
sampler = TCRsampler()
reference_csv = os.path.join('britanova_chord_blood.csv')
sampler.ref_df = pd.read_csv(reference_csv)
sampler.build_background(max_rows=1000)

# Each entry is [gene name, gene name, integer]; the trailing integer is
# presumably a count for that pair -- confirm against TCRsampler.sample.
gene_usage = [
    ['TRBV10-2*01', 'TRBV10-2*01*01', 1],
    ['TRBV27*01', 'TRBV27*01*01', 4],
]
sampler.sample(gene_usage, depth=10)

# Print each ref_dict key alongside the first dimension of its value's shape.
for key, table in sampler.ref_dict.items():
    print(key, table.shape[0])
def test_v_j_freq_estimates():
    """Check V- and J-gene occurrence frequencies on a five-row reference table.

    Every V gene appears once in the fixture (expected 0.2 each); among the J
    genes, one appears once (0.2) and two appear twice (0.4 each).
    """
    # Column -> {row index -> value}; dict(enumerate(...)) yields the 0..4 keys.
    columns = {
        'Unnamed: 0': dict(enumerate([0, 1, 2, 3, 4])),
        'v_reps': dict(enumerate([
            'TRBV24-1*01',
            'TRBV5-1*01',
            'TRBV7-2*01',
            'TRBV3-1*01',
            'TRBV7-3*01',
        ])),
        'j_reps': dict(enumerate([
            'TRBJ2-1*01',
            'TRBJ2-5*01',
            'TRBJ2-3*01',
            'TRBJ2-5*01',
            'TRBJ2-3*01',
        ])),
        'cdr3': dict(enumerate([
            'CATRQDNEQFF',
            'CASSLEETQYF',
            'CASSLADTQYF',
            'CASSQETQYF',
            'CASSLAGGTDTQYF',
        ])),
        'count': dict(enumerate([252, 166, 113, 98, 89])),
        'freq': dict(enumerate([
            0.0003726818302818776,
            0.0002454967612174273,
            0.00016711526516608003,
            0.00014493182288739684,
            0.00013162175752018694,
        ])),
        'subject': dict(enumerate(['A5-S11.txt'] * 5)),
    }

    sampler = TCRsampler()
    sampler.ref_df = pd.DataFrame(columns)
    sampler.build_background()

    expected_v = {
        'TRBV3-1*01': 0.2,
        'TRBV5-1*01': 0.2,
        'TRBV7-2*01': 0.2,
        'TRBV7-3*01': 0.2,
        'TRBV24-1*01': 0.2,
    }
    expected_j = {
        'TRBJ2-1*01': 0.2,
        'TRBJ2-3*01': 0.4,
        'TRBJ2-5*01': 0.4,
    }
    assert sampler.v_occur_freq == expected_v
    assert sampler.j_occur_freq == expected_j
on='subject') dfd['freq'] = dfd['freq_x'] / dfd['freq_y'] print(dfd[['freq', 'subject']].groupby(['subject']).sum()) # Test that these will work with TCRsampler from tcrsampler.sampler import TCRsampler from tcrdist import repertoire_db ref = repertoire_db.RefGeneSet(db_file='alphabeta_gammadelta_db.tsv') ref.generate_all_genes() ref.all_genes ref.all_genes['human'].keys() tsd = TCRsampler() tsd.ref_df = dfd tsd.build_background() # find potential missing: print([x for x in tsd.v_freq.keys()]) print([x for x in tsd.v_freq.keys() if x not in ref.all_genes['human'].keys()]) assert len([ x for x in tsd.v_freq.keys() if x not in ref.all_genes['human'].keys() ]) == 0 print([x for x in tsd.j_freq.keys()]) print([x for x in tsd.j_freq.keys() if x not in ref.all_genes['human'].keys()]) assert len([ x for x in tsd.j_freq.keys() if x not in ref.all_genes['human'].keys() ]) == 0 tsg = TCRsampler() tsg.ref_df = dfg