def test_background_generation_in_mira_60(fn=os.path.join(
        'tcrdist', 'data', 'covid19',
        'mira_epitope_60_436_MWSFNPETNI_SFNPETNIL_SMWSFNPET.tcrdist3.csv')):
    """
    Build a two-part, 200,000-row background for the TCRs stored in <fn>.

    Parameters
    ----------
    fn : str
        Path to a CSV file providing (at least) the columns
        'v_b_gene', 'j_b_gene' and 'cdr3_b_aa'.

    Returns
    -------
    pandas.DataFrame
        The V,J-matched rows (source == "vj_matched") followed by the
        Britanova rows (source == "stratified_random"), re-indexed from 0.

    Notes
    -----
    The background is assembled in four steps:

    1. Create a TCRsampler and replace its gene occurrence frequencies with
       subject-stratified estimates (replace=True).
    2. Make a V,J gene usage matched background that matches the usage
       counted in the target TCRs.
    3. Draw 100,000 TCRs with <sample_britanova> and annotate them with gene
       frequencies from the same sampler; each gets a weight of 1.
    4. Concatenate both parts and check that 200,000 rows were produced.
    """
    import sys
    import os
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (
        calculate_adjustment,
        get_gene_frequencies,
        get_stratified_gene_usage_frequency,
        make_flat_vj_background,
        make_gene_usage_counter,
        make_vj_matched_background,
        sample_britanova,
    )

    beta_columns = ['v_b_gene', 'j_b_gene', 'cdr3_b_aa']

    # 1 -- sampler with subject-stratified gene usage frequencies
    sampler = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    sampler = get_stratified_gene_usage_frequency(ts=sampler, replace=True)

    # Target repertoire: only the beta-chain columns are needed
    target = pd.read_csv(fn)[beta_columns]
    usage_counter = make_gene_usage_counter(target)

    # 2 -- V,J-matched part
    vj_part = make_vj_matched_background(
        ts=sampler,
        gene_usage_counter=usage_counter,
        # Ask for a few extra as Olga can return none if it makes too many
        # non-productive CDR3s
        size=150000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    vj_part = vj_part.sample(100000)
    vj_part = vj_part.reset_index(drop=True)
    vj_part['weights'] = calculate_adjustment(df=vj_part, adjcol="pVJ")
    vj_part['source'] = "vj_matched"

    # 3 -- Britanova part, unit weights
    cord_part = sample_britanova(size=100000)
    cord_part = get_gene_frequencies(ts=sampler, df=cord_part)
    cord_part['weights'] = 1
    cord_part['source'] = "stratified_random"

    # 4 -- stack the two parts into a single frame
    stacked = pd.concat([vj_part, cord_part], axis=0)
    background = stacked.reset_index(drop=True)
    assert background.shape[0] == 200000
    return background
def synthesize_vj_matched_background(self, ts=None, chain="beta"):
    """
    Synthesize a V,J gene usage matched background for <self.clone_df>.

    Parameters
    ----------
    ts : TCRsampler or None
        Sampler supplying gene usage frequencies. If None, the default
        sampler for (<self.organism>, <chain>) is obtained from
        <_default_sampler>. In both cases the sampler is passed through
        <get_stratified_gene_usage_frequency> with replace=True before use.
    chain : str
        'beta' or 'alpha'

    Returns
    -------
    vj_background
        The value returned by the organism- and chain-specific
        `_synthesize_<organism>_<chain>_vj_background` helper, called with
        the sampler and <self.clone_df>.

    Raises
    ------
    ValueError
        If <chain> is not 'beta' or 'alpha', or if <self.organism> is not
        'human' or 'mouse'.
    """
    if chain not in ["beta", "alpha"]:
        raise ValueError("Invalid <chain> argument.")
    # Fail fast: previously an unsupported organism fell through every
    # branch and surfaced as an UnboundLocalError at the return statement.
    if self.organism not in ("human", "mouse"):
        raise ValueError(
            f"Invalid <organism> '{self.organism}'; "
            "only 'human' and 'mouse' are supported.")

    if ts is None:
        ts = _default_sampler(organism=self.organism, chain=chain)()
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)

    # Helper names are looked up only inside the branch that is taken, so a
    # helper that is unavailable for one combination cannot break the others.
    if chain == "beta":
        if self.organism == "human":
            synthesize = _synthesize_human_beta_vj_background
        else:
            synthesize = _synthesize_mouse_beta_vj_background
    else:
        if self.organism == "human":
            synthesize = _synthesize_human_alpha_vj_background
        else:
            synthesize = _synthesize_mouse_alpha_vj_background

    return synthesize(ts=ts, df=self.clone_df)
def test_example_with_report():
    """
    Example of TCR radii defined for each TCR in an
    antigen enriched repertoire, and logo-motif report.

    Side effects: reads "dash.csv" from the working directory and writes
    "quasi_public_clones.html", "PA1.png" and "PA2.png".
    """
    import os
    import numpy as np
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.sample import _default_sampler
    from tcrdist.background import get_stratified_gene_usage_frequency
    from tcrdist.centers import calc_radii
    from tcrdist.public import _neighbors_sparse_variable_radius, _neighbors_variable_radius
    from tcrdist.public import TCRpublic
    from tcrdist.ecdf import _plot_manuscript_ecdfs, distance_ecdf
    import matplotlib.pyplot as plt

    # ANTIGEN ENRICHED REPERTOIRE
    # Load all TCRs tetramer-sorted for the influenza PA epitope
    df = pd.read_csv("dash.csv").query('epitope == "PA"').\
        reset_index(drop=True)
    # Load <df> into a TCRrep instance, to infer CDR1, CDR2, and CDR2.5
    # region of each clone
    tr = TCRrep(cell_df=df.copy(),
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=True)

    # UN-ENRICHED REPERTOIRE
    # For illustration we pull a default sampler for mouse beta chains.
    # This is used to estimate the gene usage
    # probabilities P(TRBV = V, TRBJ = J)
    ts = _default_sampler(organism="mouse", chain="beta")()
    ts = get_stratified_gene_usage_frequency(ts=ts, replace=True)
    # Then we synthesize a background using Olga (Sethna et al. 2019),
    # using the P(TRBV = V, TRBJ = J) for inverse probability weighting.
    df_vj_background = tr.synthesize_vj_matched_background(ts=ts, chain='beta')
    # Load <df_vj_background> into a TCRrep instance, to infer CDR1,CDR2,CDR2.5
    trb = TCRrep(cell_df=df_vj_background.copy(),
                 organism='mouse',
                 chains=['beta'],
                 db_file='alphabeta_gammadelta_db.tsv',
                 compute_distances=False)

    # Take advantage of multiple CPUs
    tr.cpus = 4
    # Compute radii for each TCR that controls neighbor-discovery in the
    # background at estimate of 1/10^5 inverse probability weighted TCRs.
    # Note we set <use_sparse> to True, which allows us to take advantage of
    # multiple cpus and only store distances less than or equal to <max_radius>
    radii, thresholds, ecdfs = \
        calc_radii(tr=tr,
                   tr_bkgd=trb,
                   chain='beta',
                   ctrl_bkgd=10**-5,
                   use_sparse=True,
                   max_radius=50)

    # Optional, set a maximum radius.
    # A single .loc assignment is used instead of chained indexing
    # (df['radius'][mask] = 26), which may modify a temporary copy and is
    # silently ignored when pandas Copy-on-Write is active.
    tr.clone_df['radius'] = radii
    tr.clone_df.loc[tr.clone_df['radius'] > 26, 'radius'] = 26

    # Tabulate index of neighboring clones in the ANTIGEN ENRICHED REPERTOIRE,
    # at each TCR-specific radius
    tr.clone_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta,
        radius_list=tr.clone_df['radius'])
    # Tabulate neighboring sequences in background
    tr.clone_df['background_neighbors'] = _neighbors_sparse_variable_radius(
        csrmat=tr.rw_beta,
        radius_list=tr.clone_df['radius'])
    # Tabulate number of unique subjects
    tr.clone_df['nsubject'] = tr.clone_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    # Score Quasi(Publicity) : True (Quasi-Public), False (private)
    tr.clone_df['qpublic'] = tr.clone_df['nsubject'].\
        apply(lambda x: x > 1)

    # OPTIONAL: HTML Report
    # Note: you can call TCRpublic() with fixed radius or directly
    # after tr.clone_df['radius'] is defined.
    tp = TCRpublic(tcrrep=tr,
                   output_html_name="quasi_public_clones.html")
    tp.fixed_radius = False
    # Generates the HTML report (the returned value is not needed here)
    tp.report()

    # OPTIONAL: ECDF Figure, against reference
    f1 = _plot_manuscript_ecdfs(thresholds=thresholds,
                                ecdf_mat=ecdfs,
                                ylab='Proportion of Background TCRs',
                                cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
                                min_freq=1E-10)
    f1.savefig(os.path.join("", "PA1.png"))

    # It is straightforward to make an ECDF between antigen enriched TCRs
    # as well. The thresholds returned by distance_ecdf are discarded; the
    # ones from calc_radii are reused for plotting.
    _, antigen_enriched_ecdf = distance_ecdf(pwrect=tr.pw_beta,
                                             thresholds=thresholds,
                                             weights=None,
                                             pseudo_count=0,
                                             skip_diag=False,
                                             absolute_weight=True)
    # Replace the minimum value with the plotting floor <min_freq>
    antigen_enriched_ecdf[antigen_enriched_ecdf == antigen_enriched_ecdf.min()] = 1E-10
    f2 = _plot_manuscript_ecdfs(thresholds=thresholds,
                                ecdf_mat=antigen_enriched_ecdf,
                                ylab='Proportion of Antigen Enriched PA TCRs',
                                cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
                                min_freq=1E-10)
    f2.savefig(os.path.join("", "PA2.png"))
def test_background_generation_toy_example():
    """
    Build a two-part, 200,000-row background for a toy target repertoire.

    SUPPOSE WE HAVE SOME REPERTOIRE WITH THE FOLLOWING GENE USAGE SPECIFIED
    BY <ix>. For testing we use a set of 25 TCRs generated from rare and
    semi-rare V,J pairings. We use 25 only because we will be computing
    distances against 4.6 million seqs.

    1. TCRsampler, replacing gene occurrence frequencies with subject
       stratified estimates.
       NOTE: with replace = True .vj_occur_freq will now be the stratified
       value.
    2. Make a V,J gene usage matched background to match usage in the target.
    3. Use a subject-stratified random draw from the Britanova cord blood
       samples.
    4. Combine both parts and compare V,J usage between target and
       V,J-matched background.

    Returns
    -------
    pandas.DataFrame
        The combined background (V,J-matched rows first, then Britanova rows).
    """
    import sys
    import os
    import numpy as np
    import pandas as pd
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import (
        calculate_adjustment,
        get_gene_frequencies,
        get_stratified_gene_usage_frequency,
        make_flat_vj_background,
        make_gene_usage_counter,
        make_vj_matched_background,
        sample_britanova,
    )

    # 1 -- sampler with subject-stratified gene usage frequencies
    sampler = TCRsampler(
        default_background='britanova_human_beta_t_cb.tsv.sampler.tsv')
    sampler = get_stratified_gene_usage_frequency(ts=sampler, replace=True)

    # Each entry is [V gene, J gene, number of TCRs to draw]
    ix = [['TRBV19*01', 'TRBJ2-5*01', 3],
          ['TRBV24-1*01', 'TRBJ2-4*01', 3],
          ['TRBV25-1*01', 'TRBJ2-4*01', 3],
          ['TRBV30*01', 'TRBJ2-3*01', 2],
          ['TRBV5-4*01', 'TRBJ2-3*01', 2],
          ['TRBV11-2*01', 'TRBJ2-2*01', 2],
          ['TRBV2*01', 'TRBJ1-5*01', 1],
          ['TRBV12-5*01', 'TRBJ2-7*01', 1],
          ['TRBV4-1*01', 'TRBJ1-6*01', 1],
          ['TRBV6-5*01', 'TRBJ1-6*01', 1],
          ['TRBV13*01', 'TRBJ2-3*01', 1],
          ['TRBV18*01', 'TRBJ2-3*01', 1],
          ['TRBV14*01', 'TRBJ2-7*01', 1],
          ['TRBV6-6*01', 'TRBJ2-7*01', 1],
          ['TRBV10-3*01', 'TRBJ2-3*01', 1],
          ['TRBV7-2*01', 'TRBJ2-1*01', 1],
          ['TRBV5-1*01', 'TRBJ2-1*01', 1]]

    # Draw the requested number of CDR3s per V,J pair; the sampler returns
    # a list of lists, which is flattened into a single column.
    per_pair_frames = []
    for v_gene, j_gene, n_draws in ix:
        drawn = sampler.sample([[v_gene, j_gene, n_draws]])
        cdr3s = [cdr3 for group in drawn for cdr3 in group]
        per_pair_frames.append(pd.DataFrame({
            'cdr3_b_aa': cdr3s,
            'v_b_gene': v_gene,
            'j_b_gene': j_gene}))
    target = pd.concat(per_pair_frames).reset_index(drop=True)

    usage_counter = make_gene_usage_counter(target)

    # 2 -- V,J-matched part
    vj_part = make_vj_matched_background(
        ts=sampler,
        gene_usage_counter=usage_counter,
        # Ask for a few extra as Olga can return none if it makes too many
        # non-productive CDR3s
        size=101000,
        recomb_type="VDJ",
        chain_folder="human_T_beta",
        cols=['v_b_gene', 'j_b_gene', 'cdr3_b_aa'])
    vj_part = vj_part.sample(100000)
    vj_part = vj_part.reset_index(drop=True)
    vj_part['weights'] = calculate_adjustment(df=vj_part, adjcol="pVJ")
    vj_part['source'] = "vj_matched"

    # 3 -- Britanova part, unit weights
    cord_part = sample_britanova(size=100000)
    cord_part = get_gene_frequencies(ts=sampler, df=cord_part)
    cord_part['weights'] = 1
    cord_part['source'] = "stratified_random"

    # 4 -- stack the two parts into a single frame
    stacked = pd.concat([vj_part, cord_part], axis=0)
    background = stacked.reset_index(drop=True)
    assert background.shape[0] == 200000

    # Inspect the gene usage between target seqs and vj-matched background:
    # column 0 holds background V,J frequencies, column 1 the target ones.
    vj_keys = ['v_b_gene', 'j_b_gene']
    bkgd_freq = vj_part.groupby(vj_keys).size() / vj_part.shape[0]
    target_freq = target.groupby(vj_keys).size() / target.shape[0]
    usage_check = pd.concat([bkgd_freq, target_freq], axis=1)
    deviation = (usage_check[0] - usage_check[1]).abs()
    assert np.all(deviation < 0.001)

    return background
def _write_metaclonotype_html(ranked_centers_df, tr, ts, output_html_name):
    """Add motif-logo SVG columns to <ranked_centers_df> and write one HTML entry per row."""
    from progress.bar import IncrementalBar
    from tcrdist.public import make_motif_logo

    def shrink(s):
        return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')

    labels = ['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'radius', 'regex',
              'nsubject', 'K_neighbors', 'bkgd_hits_weighted',
              'chi2dist', 'chi2re', 'chi2joint']

    svgs = []
    svgs_raw = []
    bar = IncrementalBar('Processing', max=ranked_centers_df.shape[0])
    for _, r in ranked_centers_df.iterrows():
        bar.next()
        svg, svg_raw = make_motif_logo(
            tcrsampler=ts,
            pwmat=tr.pw_beta,
            clone_df=tr.clone_df,
            centroid=r['cdr3_b_aa'],
            v_gene=r['v_b_gene'],
            radius=r['radius'],
            pwmat_str='pw_beta',
            cdr3_name='cdr3_b_aa',
            v_name='v_b_gene',
            gene_names=['v_b_gene', 'j_b_gene'])
        svgs.append(svg)
        svgs_raw.append(svg_raw)
    # The bar is advanced exactly once per row, so it finishes at <max>.
    bar.finish()
    ranked_centers_df['svg'] = svgs
    ranked_centers_df['svg_raw'] = svgs_raw

    with open(output_html_name, 'w') as output_handle:
        for _, r in ranked_centers_df.iterrows():
            output_handle.write("<br></br>")
            output_handle.write(shrink(r['svg']))
            output_handle.write(shrink(r['svg_raw']))
            output_handle.write("<br></br>")
            output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
            output_handle.write("<br></br>")


def _find_metaclonotypes_at_level(tr, tr_bkgd, ts, project_path,
                                  antigen_enriched_file, ctrl_bkgd,
                                  level_tag, ncpus):
    """Find, rank and write out meta-clonotypes for one background-control level."""
    from tcrdist.neighbors import bkgd_cntl_nn2
    from tcrdist.public import _neighbors_variable_radius
    from tcrdist.centers import rank_centers

    # To ascertain which meta-clonotypes are likely to be most specific,
    # take advantage of an existing function <bkgd_cntl_nn2>.
    centers_df = bkgd_cntl_nn2(
        tr=tr,
        tr_background=tr_bkgd,
        weights=tr_bkgd.clone_df.weights,
        ctrl_bkgd=ctrl_bkgd,
        col='cdr3_b_aa',
        add_cols=['v_b_gene', 'j_b_gene'],
        ncpus=ncpus,
        include_seq_info=True,
        thresholds=list(range(0, 50, 2)),
        generate_regex=True,
        test_regex=True,
        forced_max_radius=36)

    # ALL meta-clonotypes .tsv file: saved right away as a checkpoint, then
    # rewritten below once the neighbor columns have been added.
    centers_file = os.path.join(
        project_path,
        f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv')
    centers_df.to_csv(centers_file, sep="\t")

    # Add neighbors, K_neighbors, and nsubject columns
    centers_df['neighbors'] = _neighbors_variable_radius(
        pwmat=tr.pw_beta, radius_list=centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(len)
    # We determine how many <nsubjects> are in the set of neighbors
    centers_df['nsubject'] = centers_df['neighbors'].\
        apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv(centers_file, sep="\t")

    # Many of the meta-clonotypes contain redundant information.
    # We can winnow down to a less-redundant list. We do this
    # by ranking clonotypes from most to least specific.
    # <min_nsubject> is minimum publicity of the meta-clonotype,
    # <min_nr> is minimum non-redundancy
    ranked_centers_df = rank_centers(
        centers_df=centers_df,
        rank_column='chi2joint',
        min_nsubject=2,
        min_nr=1)

    # NR meta-clonotypes .tsv file. Output, ready to search bulk data.
    ranked_centers_df.to_csv(
        os.path.join(
            project_path,
            f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'),
        sep="\t")

    # Meta-clonotypes HTML summary: a svg logo for each NR meta-clonotype.
    # Nothing is written when no meta-clonotype survived the ranking.
    if ranked_centers_df.shape[0] > 0:
        _write_metaclonotype_html(
            ranked_centers_df=ranked_centers_df,
            tr=tr,
            ts=ts,
            output_html_name=os.path.join(
                project_path,
                f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html'))


def find_metaclonotypes(
    project_path = "tutorial48",
    source_path = os.path.join(path_to_base,'tcrdist','data','covid19'),
    antigen_enriched_file = 'mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
    ncpus = 4,
    seed = 3434):
    """
    This function encapsulates a complete workflow for finding
    meta-clonotypes in antigen-enriched data.

    Parameters
    ----------
    project_path : str
        Output directory; created (including parents) if it does not exist.
    source_path : str
        Directory that contains <antigen_enriched_file>.
    antigen_enriched_file : str
        CSV file with the antigen-enriched TCRs. The columns 'subject',
        'cell_type', 'v_b_gene', 'j_b_gene' and 'cdr3_b_aa' are used.
        The file name also prefixes every output file.
    ncpus : int
        Assigned to <TCRrep.cpus> and passed as <ncpus> to <bkgd_cntl_nn2>.
    seed : int
        Seed passed to <np.random.seed>.

    Returns
    -------
    None
        Results are written to <project_path>: the background .csv, the
        sparse rectangular distances .npz, two ECDF .png figures and, for
        each of the levels '1E5' and '1E6', the centers .tsv, the ranked
        centers .tsv and (when any ranked centers exist) an .html summary.

    Raises
    ------
    AssertionError
        If the input file does not exist, or the combined background does
        not have exactly 200,000 rows.
    """
    import scipy.sparse
    from tcrdist.repertoire import TCRrep
    from tcrdist.automate import auto_pgen
    from tcrsampler.sampler import TCRsampler
    from tcrdist.background import get_stratified_gene_usage_frequency
    from tcrdist.background import get_gene_frequencies, sample_britanova
    from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs

    np.random.seed(seed)
    # makedirs (rather than mkdir) also supports nested project paths
    os.makedirs(project_path, exist_ok=True)

    ############################################################################
    # Step 1: Select and load an antigen-enriched (sub)repertoire.          ####
    ############################################################################
    print(f"INITIATING A TCRrep() with {antigen_enriched_file}")
    input_file = os.path.join(source_path, antigen_enriched_file)
    assert os.path.isfile(input_file)
    # Read file into a Pandas DataFrame <df>
    df = pd.read_csv(input_file)
    # Drop cells without any gene usage information
    df = df[(df['v_b_gene'].notna()) & (df['j_b_gene'].notna())]
    # Initialize a TCRrep class, using ONLY columns that are complete and
    # uniquely define a clone.
    # Class provides a 'count' column if none is present.
    # Counts of identical subject:VCDR3 'clones' will be aggregated into a
    # TCRrep.clone_df.
    tr = TCRrep(cell_df = df[['subject','cell_type','v_b_gene', 'j_b_gene', 'cdr3_b_aa']],
                organism = "human",
                chains = ['beta'],
                compute_distances = True)
    tr.cpus = ncpus

    ############################################################################
    # Step 1.1: Estimate Probability of Generation                          ####
    ############################################################################
    # It will be useful later to know the pgen of each clone
    print("COMPUTING PGEN WITH OLGA (Sethna et al 2018)")
    print("FOR ANTIGEN-ENRICHED CLONES TO BE USED FOR SUBSEQUENT ANALYSES")
    auto_pgen(tr)

    # Tip: Users of tcrdist3 should be aware that by default a <TCRrep.clone_df>
    # DataFrame is created out of non-redundant cells in the cell_df, and
    # pairwise distance matrices automatically computed.
    # Notice that attributes <tr.clone_df> and <tr.pw_beta>, <tr.pw_cdr3_b_aa>,
    # are immediately accessible.
    # Attributes <tr.pw_pmhc_b_aa>, <tr.pw_cdr2_b_aa>, and <tr.pw_cdr1_b_aa>
    # are also available if <TCRrep.store_all_cdr> is set to True.
    # For large datasets, i.e., >15,000 clones, this approach may consume too
    # much memory so <TCRrep.compute_distances> is automatically set to False.

    ############################################################################
    # Step 2: Synthesize an Inverse Probability Weighted VJ Matched Background #
    ############################################################################
    # Generating an appropriate set of unenriched reference TCRs is important;
    # for each set of antigen-associated TCRs, discovered by MIRA, we created a
    # two part background. One part consists of 100,000 synthetic TCRs whose
    # V-gene and J-gene frequencies match those in the antigen-enriched
    # repertoire, using the software OLGA (Sethna et al. 2019; Marcou et al.
    # 2018). The other part consists of 100,000 umbilical cord blood TCRs
    # sampled uniformly from 8 subjects (Britanova et al., 2017). This mix
    # balances dense sampling of sequences near the biochemical neighborhoods
    # of interest with broad sampling of TCRs from an antigen-naive repertoire.
    # Importantly, we adjust for the biased sampling by using the V- and J-gene
    # frequencies observed in the cord-blood data (see Methods for details
    # about inverse probability weighting adjustment). Using this approach we
    # are able to estimate the abundance of TCRs similar to a centroid TCR in
    # an unenriched background repertoire of ~1,000,000 TCRs, using a
    # comparatively modest background dataset of 200,000 TCRs. While this
    # estimate may underestimate the true specificity, since some of the
    # neighborhood TCRs in the unenriched background repertoire may in fact
    # recognize the antigen of interest, it is useful for prioritizing
    # neighborhoods and selecting a radius for each neighborhood that balances
    # sensitivity and specificity.

    # Initialize a TCRsampler -- human, beta, umbilical cord blood from 8 people.
    print("USING tcrsampler TO CONSTRUCT A CUSTOM V-J MATCHED BACKGROUND")
    ts = TCRsampler(default_background = 'britanova_human_beta_t_cb.tsv.sampler.tsv')
    # Stratify sample so that each subject contributes similarly to estimate
    # of gene usage frequency
    ts = get_stratified_gene_usage_frequency(ts = ts, replace = True)
    # Synthesize an inverse probability weighted V,J gene background that
    # matches usage in your enriched repertoire
    df_vj_background = tr.synthesize_vj_matched_background(ts = ts, chain = 'beta')
    # Get a randomly drawn stratified sample of beta, cord blood from
    # Britanova et al. 2016
    # Dynamics of Individual T Cell Repertoires: From Cord Blood to Centenarians
    df_britanova_100K = sample_britanova(size = 100000)
    # Append frequency columns, using the sampler above
    df_britanova_100K = get_gene_frequencies(ts = ts, df = df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"
    # Combine the two parts of the background into a single DataFrame
    df_bkgd = pd.concat([df_vj_background.copy(), df_britanova_100K.copy()], axis = 0).\
        reset_index(drop = True)
    # Assert that the backgrounds have the expected number of rows.
    assert df_bkgd.shape[0] == 200000
    # Save the background for future use
    background_outfile = os.path.join(project_path, f"{antigen_enriched_file}.olga100K_brit100K_bkgd.csv")
    print(f'WRITING {background_outfile}')
    df_bkgd.to_csv(background_outfile, index = False)
    # Load the background to a TCRrep without computing pairwise distances
    # (i.e., compute_distances = False)
    tr_bkgd = TCRrep(
        cell_df = df_bkgd,
        organism = "human",
        chains = ['beta'],
        compute_distances = False)

    ############################################################################
    # Step 4: Calculate Distances                                          #####
    ############################################################################
    # Compute rectangular distances. Those are distances between each clone
    # in the antigen-enriched repertoire and each TCR in the background.
    # Dense alternative, <tr.compute_rect_distances(..., store = False)>:
    # with a single CPU and < 10GB RAM, 5E2x2E5 = 100 million pairwise
    # distances across CDR1, CDR2, CDR2.5, and CDR3 took 1min 34s.
    # The sparse implementation used here only keeps distances <= radius
    # (30.8 s with tr.cpus = 6 and chunk_size = 85).
    print("COMPUTING RECTANGULARE DISTANCE")
    tr.compute_sparse_rect_distances(
        df = tr.clone_df,
        df2 = tr_bkgd.clone_df,
        radius = 50,
        chunk_size = 100)
    scipy.sparse.save_npz(os.path.join(project_path, f"{antigen_enriched_file}.rw_beta.npz"), tr.rw_beta)

    ############################################################################
    # Step 5: Examine Density ECDFS                                        #####
    ############################################################################
    # Investigate the density of neighbors to each TCR, based on expanding
    # distance radius.
    # Compute empirical cumulative density function (ecdf)
    # Compare Antigen Enriched TCRs (against itself).
    thresholds, antigen_enriched_ecdf = distance_ecdf(
        tr.pw_beta,
        thresholds = range(0,50,2))
    # Compute empirical cumulative density function (ecdf)
    # Compare Antigen Enriched TCRs (against) 200K probability
    # inverse weighted background
    thresholds, background_ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds = range(0,50,2),
        weights = tr_bkgd.clone_df['weights'],
        absolute_weight = True)
    # plot_ecdf similar to tcrdist3 manuscript
    f1 = _plot_manuscript_ecdfs(
        thresholds,
        antigen_enriched_ecdf,
        ylab = 'Proportion of Antigen Enriched TCRs',
        cdr3_len = tr.clone_df.cdr3_b_aa.str.len(),
        min_freq = 1E-10)
    f1.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_AER_plot.png'))
    f2 = _plot_manuscript_ecdfs(
        thresholds,
        background_ecdf,
        ylab = 'Proportion of Reference TCRs',
        cdr3_len = tr.clone_df.cdr3_b_aa.str.len(),
        min_freq = 1E-10)
    f2.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_BUR_plot.png'))

    ############################################################################
    # Step 6: Find optimal radii, write .tsv and .html outputs             #####
    #         Steps 6-6.4 use theta = 1E5, steps 6.5-6.8 use theta = 1E6   #####
    ############################################################################
    for level_tag, ctrl_bkgd in (('1E5', 10**-5), ('1E6', 10**-6)):
        _find_metaclonotypes_at_level(
            tr = tr,
            tr_bkgd = tr_bkgd,
            ts = ts,
            project_path = project_path,
            antigen_enriched_file = antigen_enriched_file,
            ctrl_bkgd = ctrl_bkgd,
            level_tag = level_tag,
            ncpus = ncpus)