def make_motif_logo_from_index(tcrsampler,
                               ind,
                               clone_df,
                               centroid,
                               cdr3_name='cdr3_b_aa',
                               v_name='v_b_gene',
                               gene_names=['v_b_gene', 'j_b_gene']):
    """
    Build a background-adjusted and a raw motif logo for the clones at `ind`.

    Parameters
    ----------
    tcrsampler : object
        Must expose ``sample(gene_usage, flatten=..., depth=...)``.
    ind : positional row selector
        Passed to ``clone_df.iloc`` to pick the clones in the node.
    clone_df : pd.DataFrame
    centroid : str
        Sequence the motif is aligned to.
    cdr3_name : str
        Column holding the CDR3 sequences.
    v_name : str
        Accepted for signature compatibility; not read by this function.
    gene_names : list
        Two column names (V gene, J gene) used to tabulate gene usage.

    Returns
    -------
    svg : str
        Logo computed against the sampled background.
    svg_raw : str
        Logo computed without any background.
    """
    node = clone_df.iloc[ind, :].copy()

    # Normalise both gene columns through allele_01 before counting usage.
    first_gene, second_gene = gene_names[0], gene_names[1]
    node[first_gene] = node[first_gene].apply(allele_01)
    node[second_gene] = node[second_gene].apply(allele_01)

    # One [gene, gene, count] row per observed gene combination.
    usage_rows = node.groupby(gene_names).size().reset_index().to_dict(
        'split')['data']

    drawn = tcrsampler.sample(usage_rows, flatten=True, depth=100)
    background = [seq for seq in drawn if seq is not None]

    node_seqs = node[cdr3_name]

    adjusted, _ = compute_pal_motif(seqs=node_seqs,
                                    refs=background,
                                    centroid=centroid)
    svg = svg_logo(adjusted, return_str=True)

    unadjusted, _ = compute_pal_motif(seqs=node_seqs, centroid=centroid)
    svg_raw = svg_logo(unadjusted, return_str=True)

    return svg, svg_raw
def motif_creation_human_betas():
    """
    Write beta-chain motif logos for each centroid cluster of dash_human.csv.

    For every row of ``tr.centroids_df`` (until the first cluster with fewer
    than five neighbors) three logos are written to ``test_3.svg``:

    1. the cluster motif computed against OLGA-generated references,
    2. the raw cluster motif (no references),
    3. the motif of the references themselves (plus the centroid).

    The references are generated per (V, J) pair observed in the cluster,
    with ``count * depth`` sequences requested for each pair.
    """
    import pandas as pd
    import palmotif
    from tcrdist.repertoire import TCRrep
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')
    get_basic_centroids(tr, max_dist=75)

    # Number of generated reference sequences per observed clone.
    depth = 3

    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # NOTE(review): `break` (not `continue`) stops at the first small
            # cluster; presumably centroids_df is ordered by cluster size --
            # confirm.
            if len(r['neighbors']) < 5:
                break

            node = tr.clone_df.iloc[r['neighbors'], ]
            seqs = node['cdr3_b_aa'].to_list()

            # Rows of [v_gene, j_gene, count] for this cluster.
            gene_usages = node[['v_b_gene', 'j_b_gene'
                                ]].value_counts().reset_index().to_dict(
                                    'split')['data']

            # Previously this iterated over an undefined `combos_alpha`;
            # the gene usage computed just above is the intended source.
            refs = [
                seq for v, j, n in gene_usages
                for seq in omb.gen_cdr3s(allele_01(v), allele_01(j), n * depth)
                if seq is not None
            ]

            matrix, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                   refs=refs,
                                                   centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(
                seqs=seqs, centroid=r['cdr3_b_aa'])

            # The background logo is aligned to the centroid, so the centroid
            # is added to the sequences it is computed from.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]
            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')
        oh.write('</body>')
def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
    """
    Compute, draw and export the alpha- and beta-chain motifs of one cluster.

    For each chain ('a' then 'b') a motif is computed with palmotif from the
    cluster's CDR3 sequences, written as an SVG logo and as a CSV table under
    ``self.result_path``, and registered as report outputs.

    Args:
        tcr_rep: object exposing ``clone_df``; rows are selected positionally
            with ``row['neighbors_i']``.
        index: zero-based cluster number (file names and labels use index + 1).
        row: mapping-like cluster record that provides ``'neighbors_i'``.
        negative_examples: per-chain reference sequences keyed by 'a'/'b';
            only read when ``self.use_reference_sequences`` is truthy.

    Returns:
        (figure_outputs, table_outputs): two lists of ReportOutput, one entry
        per chain in each.
    """
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif
    from palmotif import svg_logo

    neighbor_rows = row['neighbors_i']
    dfnode = tcr_rep.clone_df.iloc[neighbor_rows,]

    logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors.")

    figure_outputs = []
    table_outputs = []

    for chain in ('a', 'b'):
        cdr3_col = f'cdr3_{chain}_aa'

        if dfnode.shape[0] <= 2:
            # Too few members to compute a centroid: take the first sequence.
            centroid = dfnode[cdr3_col].to_list()[0]
        else:
            # NOTE(review): the call is identical for both chains -- confirm
            # which CDR3 column get_centroid_seq uses by default.
            centroid, *_ = get_centroid_seq(df=dfnode)

        sequences = _select(df=tcr_rep.clone_df, iloc_rows=neighbor_rows, col=cdr3_col)
        references = negative_examples[chain] if self.use_reference_sequences else None
        motif, _ = compute_pal_motif(seqs=sequences, centroid=centroid, refs=references)

        figure_path = self.result_path / f"motif_{chain}_{index + 1}.svg"
        svg_logo(motif, filename=figure_path)

        motif_data_path = self.result_path / f"motif_{chain}_{index + 1}.csv"
        motif.to_csv(motif_data_path)

        chain_label = Chain.get_chain(chain.upper()).name.lower()
        figure_outputs.append(ReportOutput(figure_path, f'Motif {index + 1} ({chain_label} chain)'))
        table_outputs.append(ReportOutput(motif_data_path, f'motif {index + 1} ({chain_label} chain) csv data'))

    return figure_outputs, table_outputs
def _index_to_matrix(ind, clone_df, pwmat=None, col='cdr3_b_aa', centroid=None):
    """
    Compute the palmotif matrix for the clones at positions `ind`.

    Parameters
    ----------
    ind : list of int
        Positional row indices into `clone_df` (and into `pwmat`).
    clone_df : pd.DataFrame
        Must contain column `col`.
    pwmat : 2-D array-like or None
        Pairwise distance matrix, indexed as ``pwmat[ind, :][:, ind]``.
        Only used when `centroid` is None.
    col : str
        Column holding the sequences.
    centroid : str or None
        Sequence to align the motif to. When None, the sequence with the
        smallest column sum in the node's sub-matrix of `pwmat` is used.

    Returns
    -------
    matrix
        First element of the ``compute_pal_motif`` result.

    Raises
    ------
    ValueError
        If both `centroid` and `pwmat` are None, since no centroid can be
        derived.
    """
    if centroid is None and pwmat is None:
        raise ValueError(
            "_index_to_matrix needs either `centroid` or `pwmat` to "
            "determine the centroid sequence")

    seqs = clone_df.iloc[ind, ][col].to_list()

    if centroid is None:
        # Restrict the pairwise matrix to the node, then pick the member
        # with the smallest summed distance to the others.
        pwnode = pwmat[ind, :][:, ind]
        centroid = seqs[pwnode.sum(axis=0).argmin()]

    matrix, _ = compute_pal_motif(seqs=seqs, centroid=centroid)
    return matrix
def test_gallery_hdiff():
    """
    Gallery example: hierarchical clustering with a motif logo on every node.

    Performs a paired alpha-beta analysis on the PA / PB1 subset of dash.csv,
    clusters on the summed alpha + beta distances, attaches one SVG logo per
    chain to every tree node, and writes an interactive HTML dendrogram to
    'hierdiff_example_PA_v_PB1.html'. The same steps can be used for a single
    chain by clustering on one pairwise matrix only.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import hcluster_diff, member_summ
    from tcrsampler.sampler import TCRsampler
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    from hierdiff import plot_hclust_props

    # Paired alpha-beta mouse receptors recognising the PA or PB1 epitopes.
    df = pd.read_csv("dash.csv")
    is_pa_or_pb1 = df['epitope'].apply(lambda x: x in ['PA', 'PB1'])

    # For illustration/testing keep only 100 randomly chosen clones;
    # increase for a more informative plot.
    df = df[is_pa_or_pb1].reset_index(drop=True)
    df = df.sample(100, random_state=3).reset_index(drop=True).copy()

    # TCRrep computes .clone_df, .pw_beta and .pw_alpha on construction.
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['beta', 'alpha'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Hierarchical clustering on the summed distances. tr.pw_beta or
    # tr.pw_alpha alone could be passed for a single-chain clustering.
    combined_pw = tr.pw_beta + tr.pw_alpha
    tr.hcluster_df, tr.Z = hcluster_diff(clone_df=tr.clone_df,
                                         pwmat=combined_pw,
                                         x_cols=['epitope'],
                                         count_col='count')

    # Mouse-appropriate backgrounds, sampled by V/J usage of each node. See
    # https://github.com/kmayerb/tcrsampler/blob/master/docs/getting_default_backgrounds.md
    downloader = TCRsampler()
    downloader.download_background_file("ruggiero_mouse_sampler.zip")
    tcrsampler_beta = TCRsampler(
        default_background='ruggiero_mouse_beta_t.tsv.sampler.tsv')
    tcrsampler_alpha = TCRsampler(
        default_background='ruggiero_mouse_alpha_t.tsv.sampler.tsv')

    def _node_centroid(members, cdr3_col):
        # Nodes with more than two clones use get_centroid_seq; smaller ones
        # fall back to their first sequence.
        # NOTE(review): get_centroid_seq is called the same way for both
        # chains -- confirm which CDR3 column it uses by default.
        if members.shape[0] > 2:
            picked, *_ = get_centroid_seq(df=members)
            return picked
        return members[cdr3_col].to_list()[0]

    def _node_logo(members, neighbor_rows, centroid, gene_cols, cdr3_col,
                   sampler):
        # Sample a background matching the node's gene usage and return the
        # logo (as an SVG string) of the node aligned to its centroid.
        usage_rows = members.groupby(gene_cols).size().reset_index().to_dict(
            'split')['data']
        drawn = sampler.sample(usage_rows, flatten=True, depth=10)
        background = [seq for seq in drawn if seq is not None]
        motif, _ = compute_pal_motif(seqs=_select(df=tr.clone_df,
                                                  iloc_rows=neighbor_rows,
                                                  col=cdr3_col),
                                     refs=background,
                                     centroid=centroid)
        return svg_logo(motif, return_str=True)

    # Beta chain
    svgs_beta = []
    for _, node in tr.hcluster_df.iterrows():
        members = tr.clone_df.iloc[node['neighbors_i'],]
        centroid = _node_centroid(members, 'cdr3_b_aa')
        print(f"BETA-CHAIN: {centroid}")
        svgs_beta.append(
            _node_logo(members, node['neighbors_i'], centroid,
                       ['v_b_gene', 'j_b_gene'], 'cdr3_b_aa',
                       tcrsampler_beta))
    tr.hcluster_df['svg_beta'] = svgs_beta

    # Alpha chain
    svgs_alpha = []
    for _, node in tr.hcluster_df.iterrows():
        members = tr.clone_df.iloc[node['neighbors_i'],]
        centroid = _node_centroid(members, 'cdr3_a_aa')
        print(f"ALPHA-CHAIN: {centroid}")
        svgs_alpha.append(
            _node_logo(members, node['neighbors_i'], centroid,
                       ['v_a_gene', 'j_a_gene'], 'cdr3_a_aa',
                       tcrsampler_alpha))
    tr.hcluster_df['svg_alpha'] = svgs_alpha

    # Summary information for tooltips, e.g. the percentage of TCRs with a
    # given epitope at a given node.
    res_summary = member_summ(res_df=tr.hcluster_df,
                              clone_df=tr.clone_df,
                              addl_cols=['epitope'])
    tr.hcluster_df_detailed = pd.concat([tr.hcluster_df, res_summary],
                                        axis=1)

    # Write the D3 html for the interactive dendrogram with chosen tooltips.
    html = plot_hclust_props(tr.Z,
                             title='PA Epitope Example',
                             res=tr.hcluster_df_detailed,
                             tooltip_cols=[
                                 'cdr3_b_aa', 'v_b_gene', 'j_b_gene',
                                 'svg_alpha', 'svg_beta'
                             ],
                             alpha=0.00001,
                             colors=['blue', 'gray'],
                             alpha_col='pvalue')

    with open('hierdiff_example_PA_v_PB1.html', 'w') as fh:
        fh.write(html)
def test_quick_pipeline_with_fragmented_compute():
    """
    How can I use tcrdist3 to test for TCRs that may be HLA restricted?

    Pipeline:
      1. Load a MIRA beta-chain dataset and build a TCRrep without computing
         distances up front.
      2. Compute the sparse pairwise matrix out of memory, in fragments.
      3. Label each clone as B*07 / not B*07 and tally neighbors per label.
      4. Test each neighborhood for association (Fisher's exact test) and
         keep the mostly-unique neighborhoods.
      5. For the top 25 neighborhoods, draw a background-adjusted and a raw
         motif logo, and write them with a details table to
         'svgs_in_line.html'.
    """
    import ast
    import os
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory
    from tcrdist.rep_diff import member_summ
    from tcrdist.summarize import test_for_almost_subsets, filter_is, _select
    from hierdiff.association_testing import cluster_association_test
    from tcrsampler.sampler import TCRsampler
    # See the palmotif docs (https://github.com/agartland/palmotif)
    from palmotif import compute_pal_motif, svg_logo

    f = 'mira_epitope_67_382_APHGVVFL_APHGVVFLHV_GVVFLHVTY_VVFLHVTYV.tcrdist3.csv'
    f = os.path.join('tcrdist', 'data', 'covid19', f)
    assert os.path.isfile(f)
    df = pd.read_csv(f)
    df = df[[
        'subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
        'cdr3_b_nucseq', 'cohort', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1'
    ]]
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False,
                store_all_cdr=False)

    S, fragments = compute_pw_sparse_out_of_memory(tr=tr,
                                                   row_size=100,
                                                   pm_processes=2,
                                                   pm_pbar=True,
                                                   max_distance=1000,
                                                   matrix_name='rw_beta',
                                                   reassemble=True,
                                                   cleanup=False)

    # A clone counts as B*07 if either HLA-B allele starts with "B*07".
    has_b07 = (tr.clone_df['hla-b'].str.startswith("B*07")
               | tr.clone_df['hla-b_1'].str.startswith("B*07"))
    tr.clone_df['B07'] = ["B*07" if x else "NOTB*07 " for x in has_b07]

    nn_tally_df_cohort = compute_n_tally_out_of_memory(fragments,
                                                       matrix_name="rw_beta",
                                                       pm_processes=6,
                                                       to_file=False,
                                                       to_memory=True,
                                                       knn_radius=25,
                                                       x_cols=['B07'])

    nn_associations = cluster_association_test(res=nn_tally_df_cohort,
                                               y_col='cmember',
                                               method='fishers')
    nn_associations = nn_associations.sort_values('pvalue', ascending=True)

    # `neighbors` is read as a string and parsed back into a Python literal.
    nn_associations['neighbors_i'] = nn_associations.neighbors.apply(
        ast.literal_eval)
    nn_associations['mostly_unique'] = test_for_almost_subsets(
        nn_associations['neighbors_i'], thr=5)
    nr_nn_associations = filter_is(nn_associations, 'mostly_unique', 1).copy()

    # MOTIF GENERATION
    t = TCRsampler()
    # Download the OLGA background only if it is not already available.
    if 'olga_human_beta_t.sampler.tsv' not in t.currently_available_backgrounds():
        t.download_background_file('olga_sampler.zip')
    tcrsampler_beta = TCRsampler(
        default_background='olga_human_beta_t.sampler.tsv')
    tcrsampler_beta.build_background(max_rows=1000)

    cdr3_name = 'cdr3_b_aa'
    gene_names = ['v_b_gene', 'j_b_gene']
    svgs_beta = []
    svgs_beta_raw = []
    info_list = []

    summary = member_summ(res_df=nr_nn_associations,
                          clone_df=tr.clone_df,
                          addl_cols=[
                              'cohort', 'hla-a', 'hla-a_1', 'hla-b',
                              'hla-b_1', 'subject'
                          ])
    nr_nn_associations = pd.concat([nr_nn_associations, summary],
                                   axis=1).reset_index()

    # Generate an SVG logo pair for each of the top 25 neighborhoods.
    for _, r in nr_nn_associations.head(25).iterrows():
        neighbors = r['neighbors_i']
        dfnode = tr.clone_df.iloc[neighbors, :].copy()
        # <pwnode> pairwise matrix restricted to the node's sequences
        pwnode = S[neighbors, :][:, neighbors].todense()
        node_seqs = dfnode[cdr3_name].to_list()
        if dfnode.shape[0] > 2:
            # Member with the smallest summed distance to the others.
            centroid = node_seqs[pwnode.sum(axis=0).argmin()]
        else:
            centroid = node_seqs[0]
        print(f"CENTROID: {centroid}")

        gene_usage_beta = dfnode.groupby(gene_names).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True,
            depth=max(100, 1000 // dfnode.shape[0]))
        sampled_rep = [x for x in sampled_rep if x is not None]

        motif, _ = compute_pal_motif(seqs=_select(df=tr.clone_df,
                                                  iloc_rows=neighbors,
                                                  col=cdr3_name),
                                     refs=sampled_rep,
                                     centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

        # The former `sampled_rep = sampled_rep.append(centroid)` was dropped:
        # list.append returns None and the value was never read afterwards.
        motif_raw, _ = compute_pal_motif(seqs=_select(df=tr.clone_df,
                                                      iloc_rows=neighbors,
                                                      col=cdr3_name),
                                         centroid=centroid)
        svgs_beta_raw.append(svg_logo(motif_raw, return_str=True))
        info_list.append(r)

    detail_cols = [
        'ct_columns', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1', 'val_0', 'ct_0',
        'val_2', 'ct_2', 'K_neighbors', 'R_radius', 'pvalue', 'FDRq',
        'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'cohort', 'subject'
    ]

    def to_html_table(r):
        # One-row HTML table with the selected details of a neighborhood.
        return pd.DataFrame(r[detail_cols]).transpose().to_html()

    def shrink(html_str):
        # Scale the logos down so several fit on one page.
        return html_str.replace('height="100%"', 'height="10%"').replace(
            'width="100%"', 'width="10%"')

    with open('svgs_in_line.html', 'w') as fh:
        fh.write("<html><body>\n")
        for svg, svg_raw, details in zip(svgs_beta, svgs_beta_raw, info_list):
            fh.write(f"{shrink(svg_raw)}{shrink(svg)}")
            # Best effort: a row lacking some detail columns is skipped
            # rather than aborting the whole report.
            try:
                table_html = to_html_table(details)
            except Exception:
                print("F")
            else:
                fh.write(table_html)
            fh.write("<div></div>")
        # Close the tags in the reverse order they were opened.
        fh.write("</body></html>\n")
def make_motif_logo(tcrsampler,
                    clone_df,
                    pwmat,
                    centroid='CASSPDIEKYF',
                    v_gene='TRBV7-9*01',
                    radius=24,
                    pwmat_str='pw_delta',
                    cdr3_name='cdr3_d_aa',
                    v_name='v_d_gene',
                    gene_names=['v_d_gene', 'j_d_gene']):
    """
    Make a motif from a tcrrep clone_df, pwmat, and a tcrsampler.

    Parameters
    ----------
    tcrsampler : tcrsampler.TCRsampler
    clone_df : pd.DataFrame
    pwmat : np.array
        Pairwise distances; row/column order must match the row order of
        `clone_df`.
    centroid : str
        e.g., 'CASSPDIEKYF'
    v_gene : str
        e.g., 'TRBV7-9*01'
    radius : int
        e.g., 26; clones with distance <= radius from the centroid are used.
    pwmat_str : str
        e.g., 'pw_delta'. Accepted for signature compatibility; not read by
        this function.
    cdr3_name : str
        e.g., 'cdr3_d_aa'
    v_name : str
        e.g., 'v_d_gene'
    gene_names : list
        e.g., ['v_d_gene', 'j_d_gene']

    Returns
    -------
    svg : str
        Logo computed against the sampled background.
    svg_raw : str
        Logo computed without any background.

    Raises
    ------
    IndexError
        If no row of `clone_df` matches both `centroid` and `v_gene`.

    Notes
    -----
    Safety first, at some cost in efficiency: the neighbors are located again
    here even if they are already known. By looking up the row <irow> of the
    first matching (V, CDR3) pair, this function can be invoked without
    knowing anything about the positions of the neighbors ahead of time. This
    is particularly useful since clone_df order is not stable after groupby
    and deduplication.

    The lookup is positional, so it also works when `clone_df` does not carry
    a default 0..n-1 index.
    """
    # Position (not index label) of the first row matching centroid + V gene.
    is_centroid = ((clone_df[cdr3_name] == centroid) &
                   (clone_df[v_name] == v_gene)).to_numpy()
    hits = is_centroid.nonzero()[0]
    if len(hits) == 0:
        raise IndexError(
            f"no clone with {cdr3_name}=={centroid!r} and {v_name}=={v_gene!r}")
    irow = int(hits[0])

    # A plain boolean array selects rows by position; a boolean Series would
    # instead be aligned on clone_df's index labels.
    within_radius = (pd.Series(pwmat[irow, :]) <= radius).to_numpy()
    dfnode = clone_df.loc[within_radius].copy()

    # Normalise both gene columns through allele_01 before counting usage.
    dfnode[gene_names[0]] = dfnode[gene_names[0]].apply(allele_01)
    dfnode[gene_names[1]] = dfnode[gene_names[1]].apply(allele_01)
    gene_usage = dfnode.groupby(gene_names).size()

    sampled_rep = tcrsampler.sample(
        gene_usage.reset_index().to_dict('split')['data'],
        flatten=True,
        depth=100)
    # The sampler yields None for entries it cannot provide.
    sampled_rep = [x for x in sampled_rep if x is not None]

    motif, _ = compute_pal_motif(seqs=dfnode[cdr3_name],
                                 refs=sampled_rep,
                                 centroid=centroid)
    svg = svg_logo(motif, return_str=True)

    motif_raw, _ = compute_pal_motif(seqs=dfnode[cdr3_name], centroid=centroid)
    svg_raw = svg_logo(motif_raw, return_str=True)

    return svg, svg_raw
iloc_idx = pwnode.sum(axis = 0).argmin() centroid = dfnode[cdr3_name].to_list()[iloc_idx] else: centroid = dfnode[cdr3_name].to_list()[0] print(f"ALPHA-CHAIN CENTROID: {centroid}") gene_usage_alpha = dfnode.groupby(gene_names).size() sampled_rep = tcrsampler_alpha.sample( gene_usage_alpha.reset_index().to_dict('split')['data'], flatten = True, depth = 10) sampled_rep = [x for x in sampled_rep if x is not None] motif, stat = compute_pal_motif( seqs = _select(df = tr.clone_df, iloc_rows = r['neighbors_i'], col = cdr3_name), refs = sampled_rep, centroid = centroid) svgs_alpha.append(svg_logo(motif, return_str= True)) sampled_rep = sampled_rep.append(centroid) motif_raw, _ = compute_pal_motif( seqs =_select(df = tr.clone_df, iloc_rows = r['neighbors_i'], col = cdr3_name), centroid = centroid) svgs_alpha_raw.append(svg_logo(motif_raw, return_str= True)) """Add Alpha SVG graphics to hcluster_df""" tr.hcluster_df['svg_alpha'] = svgs_alpha tr.hcluster_df['svg_alpha_raw'] = svgs_alpha_raw
def _tcrsampler_svgs(tcrrep,
                     default_background=None,
                     default_background_if_missing=None,
                     cdr3_name='cdr3_b_aa',
                     pwmat_str='pw_cdr3_b_aa',
                     chain='beta',
                     gene_names=['v_b_gene', 'j_b_gene'],
                     combine_olga=False,
                     verbose=True):
    """
    Add motif logos and background statistics to tcrrep.hcluster_df_detailed.

    Given a TCRrep instance, this function samples a background repertoire
    for every row of ``tcrrep.hcluster_df`` using a TCRsampler and makes
    svg-logos using palmotif. The tcrrep is modified in place: svg and stats
    columns are added to the ``.hcluster_df_detailed`` DataFrame, and a
    'prune' column (all 0) is added to ``.hcluster_df`` if it is missing.

    TODO: could just output a dataframe which would then be concatenated.

    The defaults only work with the beta chain.

    Parameters
    ----------
    tcrrep : TCRrep
        Must provide ``organism``, ``clone_df``, ``hcluster_df``,
        ``hcluster_df_detailed`` and the attribute named by `pwmat_str`.
    default_background : str or None
        Forwarded to the default sampler constructor.
    default_background_if_missing : str or None
        Forwarded to the default sampler constructor.
    cdr3_name : str
        Column of ``clone_df`` holding the CDR3 sequences.
    pwmat_str : str
        Name of the pairwise-matrix attribute on `tcrrep`.
    chain : str
        Chain name; used to choose the sampler and to name output columns.
    gene_names : list
        Gene columns used to tabulate gene usage per node.
    combine_olga : bool
        If True, sequences from a second (OLGA) sampler are added to the
        background. Forced to False for mouse alpha chains.
    verbose : bool
        If True, print progress messages.

    Returns
    -------
    True

    Notes
    -----
    TCRSampler.build_background() accepts kwargs; these are fixed here as
    most users won't know what they do and won't need to change them.

    max_rows : int
        Maximum clones per v,j pair (per subject)
    stratify_by_subject : bool
        If True, max_rows will apply to v,j,subject. If False, max_rows
        applies to v,j
    use_frequency : bool
        If True, uses frequency for ranking rows. If False, uses raw counts.
    make_singleton : bool
        If True, background is still sorted by frequency or counts, but final
        frequency and counts values are overridden and set to 1.
    """
    from palmotif import compute_pal_motif, svg_logo
    import pandas as pd
    from tcrdist.summarize import _select

    if chain == 'alpha' and tcrrep.organism == "mouse":
        # Here we enforce the rule that alpha-mouse cannot use an olga-sampler
        # TODO: This should be removed as soon as TCRsampler can be updated
        # with a valid mouse-alpha simulated background.
        combine_olga = False

    # _default_sampler returns a TCRSampler based on organism and chain
    if verbose:
        print("INITIALIZING A TCRSAMPLER")
        print(tcrrep.organism, chain)
    t = _default_sampler(organism=tcrrep.organism, chain=chain)(
        default_background=default_background,
        default_background_if_missing=default_background_if_missing)

    build_kargs = {
        'max_rows': 100,
        'stratify_by_subject': True,
        'use_frequency': True,
        'make_singleton': False
    }
    build_kargs_olga = {
        'max_rows': 1000,
        'stratify_by_subject': False,
        'use_frequency': True,
        'make_singleton': False
    }

    if verbose:
        print(f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs)}")
    t.build_background(**build_kargs)

    # Olga sampler
    t_olga = None
    if combine_olga:
        t_olga = _default_sampler_olga(chain=chain,
                                       organism=tcrrep.organism)()
        if verbose:
            print(
                f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs_olga)}"
            )
        # Build the OLGA sampler itself; previously the primary sampler was
        # rebuilt with the OLGA settings and t_olga was never built.
        t_olga.build_background(**build_kargs_olga)

    if 'prune' not in tcrrep.hcluster_df.columns:
        if verbose:
            print("NO PRUNE COLUMNS USED ALL SET TO 0")
        tcrrep.hcluster_df['prune'] = 0

    def _sample_background(sampler, usage_rows, depth, n_clones):
        # Sample `depth` sequences per clone and summarise what came back:
        # (sequences, percent missing, number unique, number total).
        # The sampler returns None when a v,j pair is not present.
        seqs = [
            x for x in sampler.sample(usage_rows, flatten=True, depth=depth)
            if x is not None
        ]
        expected_depth = n_clones * depth
        percent_missing = round(100 * (1 - (len(seqs) / expected_depth)), 1)
        as_series = pd.Series(seqs)
        return (seqs, f"{percent_missing}%", str(as_series.nunique()),
                str(as_series.count()))

    if verbose:
        print("ITERATE THROUGH CLUSTERS")
    svgs = []
    svgs_raw = []
    reference_unique = []
    reference_unique_olga = []
    reference_size = []
    reference_size_olga = []
    percent_missing_sampler = []
    percent_missing_sampler_olga = []

    n_rows = tcrrep.hcluster_df.shape[0]
    bar = IncrementalBar(f'Make {chain} SVGs :',
                         max=n_rows,
                         suffix='%(percent)d%%')
    for _, r in tcrrep.hcluster_df.iterrows():
        if r['prune'] == 0:
            neighbors = r['neighbors_i']
            # <dfnode> is dataframe with all the clones at a given tree node
            dfnode = tcrrep.clone_df.iloc[neighbors, ].copy()
            n_clones = dfnode.shape[0]

            # <pwnode> pairwise matrix for the node's sequences; the centroid
            # is the member with the smallest summed distance.
            pwnode = getattr(tcrrep, pwmat_str)[neighbors, :][:, neighbors]
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]

            # Compute gene usage at the node, after converting genes with
            # allele_01.
            for gene_name in gene_names:
                dfnode[gene_name] = dfnode[gene_name].apply(allele_01)
            gene_usage_tuples = dfnode.groupby(
                gene_names).size().reset_index().to_dict('split')['data']

            # Adjust depth for small nodes (never below 10).
            adjust_depth = max(10, 10 * round(10 / n_clones))
            depth = adjust_depth * 10

            sampled_rep, pct_missing, n_unique, n_total = _sample_background(
                t, gene_usage_tuples, depth, n_clones)
            percent_missing_sampler.append(pct_missing)
            reference_unique.append(n_unique)
            reference_size.append(n_total)

            if combine_olga:
                (sampled_rep_olga, pct_missing_olga, n_unique_olga,
                 n_total_olga) = _sample_background(t_olga, gene_usage_tuples,
                                                    depth, n_clones)
                percent_missing_sampler_olga.append(pct_missing_olga)
                reference_unique_olga.append(n_unique_olga)
                reference_size_olga.append(n_total_olga)
                # Combine both into a single background.
                sampled_rep = sampled_rep + sampled_rep_olga

            # Motif against the background.
            motif, _ = compute_pal_motif(seqs=_select(df=tcrrep.clone_df,
                                                      iloc_rows=neighbors,
                                                      col=cdr3_name),
                                         refs=sampled_rep,
                                         centroid=centroid)
            svgs.append(svg_logo(motif, return_str=True))

            # Repeat without references.
            raw_motif, _ = compute_pal_motif(seqs=_select(df=tcrrep.clone_df,
                                                          iloc_rows=neighbors,
                                                          col=cdr3_name),
                                             centroid=centroid)
            svgs_raw.append(svg_logo(raw_motif, return_str=True))
        else:
            # Pruned nodes skip sampling and logo generation entirely.
            svgs.append("PRUNE")
            svgs_raw.append("PRUNE")
            reference_size.append("PRUNE")
            reference_unique.append("PRUNE")
            percent_missing_sampler.append("PRUNE")
            percent_missing_sampler_olga.append("PRUNE")
            reference_unique_olga.append("PRUNE")
            reference_size_olga.append("PRUNE")
        # Advance once per row (it was previously advanced twice).
        bar.next()
    bar.finish()

    # The standard svg_ includes background, whereas raw has no background
    tcrrep.hcluster_df_detailed[f'svg_{chain}'] = svgs
    tcrrep.hcluster_df_detailed[f'svg_raw_{chain}'] = svgs_raw
    tcrrep.hcluster_df_detailed[f'ref_size_{chain}'] = reference_size
    tcrrep.hcluster_df_detailed[f'ref_unique_{chain}'] = reference_unique
    tcrrep.hcluster_df_detailed[
        f'percent_missing_{chain}'] = percent_missing_sampler
    if combine_olga:
        tcrrep.hcluster_df_detailed[
            f'ref_size_olga_{chain}'] = reference_size_olga
        tcrrep.hcluster_df_detailed[
            f'ref_unique_olga_{chain}'] = reference_unique_olga
        tcrrep.hcluster_df_detailed[
            f'percent_missing_olga_{chain}'] = percent_missing_sampler_olga
    return True