def add_features(self, include_indep_genes=False, include_joint_genes=True, custom_pgen_model=None):
    """Generate the feature lists of a length-dependent position model.

    Length features (``['l<L>']``) and length/position/amino-acid features
    (``['l<L>', 'a<aa><i>']``) are always generated; gene features are
    optional. The collected list is passed to ``self.update_model``.

    Parameters
    ----------
    include_indep_genes : bool
        If True, one feature per V gene and one per J gene is generated.
    include_joint_genes : bool
        If True, one feature per (V, J) gene pair is generated.
    custom_pgen_model : str
        Path to the folder of a custom OLGA model. If None, the folder
        'default_models/<chain_type>' next to this file is used.

    """
    lengths = range(self.min_L, self.max_L + 1)

    features = [['l' + str(L)] for L in lengths]
    features += [['l' + str(L), 'a' + aa + str(i)]
                 for L in lengths
                 for i in range(L)
                 for aa in self.amino_acids]

    if include_indep_genes or include_joint_genes:
        # Imported lazily so the purely positional model does not need OLGA.
        import olga.load_model as olga_load_model

        if custom_pgen_model is None:
            main_folder = os.path.join(os.path.dirname(__file__),
                                       'default_models', self.chain_type)
        else:
            main_folder = custom_pgen_model

        params_file_name = os.path.join(main_folder, 'model_params.txt')
        V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

        if self.vj:
            genomic_data = olga_load_model.GenomicDataVJ()
        else:
            genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)

        # Build each gene-name set once; they do not change between the
        # independent and the joint feature families.
        v_genes = {gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV}
        j_genes = {gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ}

        if include_indep_genes:
            features += [[v] for v in v_genes]
            features += [[j] for j in j_genes]
        if include_joint_genes:
            features += [[v, j] for v in v_genes for j in j_genes]

    self.update_model(add_features=features)
def __init__(self, sonia_model=None, include_genes=True, processes=None, custom_olga_model=None):
    """Store the Sonia model and set up the OLGA generation-probability model.

    Parameters
    ----------
    sonia_model : object
        Sonia model instance. A string or None is rejected: an error is
        printed and the instance is left without any attribute set.
    include_genes : bool
        Stored on the instance as ``include_genes``.
    processes : int
        Number of processes stored as ``processes``. If None, the number of
        CPUs usable by the current process is used.
    custom_olga_model : object
        Ready-made OLGA Pgen model, stored as ``pgen_model``. If None, the
        model is loaded from ``sonia_model.custom_pgen_model`` when that
        attribute exists and is not None, otherwise from
        'default_models/<chain_type>' next to this file.

    Attributes set
    --------------
    sonia_model, include_genes, processes, pgen_model
        Always (once the Sonia model has been accepted).
    genomic_data, generative_model
        Only when the OLGA model is loaded from a folder.

    """
    if isinstance(sonia_model, str) or sonia_model is None:
        print('ERROR: you need to pass a Sonia object')
        return

    self.sonia_model = sonia_model
    self.include_genes = include_genes

    if processes is None:
        # Only count usable cpus: a plain cpu count returns the total number
        # of cpus even if not all are available (e.g. when running on a
        # cluster). sched_getaffinity is not available on every platform, so
        # fall back to the total count where it is missing.
        try:
            self.processes = len(os.sched_getaffinity(0))
        except AttributeError:
            self.processes = os.cpu_count() or 1
    else:
        self.processes = processes

    # define olga model
    if custom_olga_model is not None:
        self.pgen_model = custom_olga_model
        return

    # Sonia models without a custom_pgen_model attribute use the defaults.
    main_folder = getattr(self.sonia_model, 'custom_pgen_model', None)
    if main_folder is None:
        main_folder = os.path.join(os.path.dirname(__file__),
                                   'default_models',
                                   self.sonia_model.chain_type)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if self.sonia_model.vj:
        self.genomic_data = olga_load_model.GenomicDataVJ()
        self.generative_model = olga_load_model.GenerativeModelVJ()
        pgen_class = pgen.GenerationProbabilityVJ
    else:
        self.genomic_data = olga_load_model.GenomicDataVDJ()
        self.generative_model = olga_load_model.GenerativeModelVDJ()
        pgen_class = pgen.GenerationProbabilityVDJ

    self.genomic_data.load_igor_genomic_data(params_file_name,
                                             V_anchor_pos_file,
                                             J_anchor_pos_file)
    self.generative_model.load_and_process_igor_model(marginals_file_name)
    self.pgen_model = pgen_class(self.generative_model, self.genomic_data)
def generate_simulated_beta_seqs(
        params_file_name='tcrdist/default_models/human_T_beta/model_params.txt',
        marginals_file_name='tcrdist/default_models/human_T_beta/model_marginals.txt',
        V_anchor_pos_file='tcrdist/default_models/human_T_beta/V_gene_CDR3_anchors.csv',
        J_anchor_pos_file='tcrdist/default_models/human_T_beta/J_gene_CDR3_anchors.csv',
        output_cols=['cdr3_b_aa', "v_b_gene", 'j_b_gene'],
        n=100000):
    """Simulate ``n`` productive CDR3 sequences with an OLGA VDJ model.

    Parameters
    ----------
    params_file_name : str
        Path to the IGoR 'model_params.txt' file.
    marginals_file_name : str
        Path to the IGoR 'model_marginals.txt' file.
    V_anchor_pos_file : str
        Path to the V gene CDR3 anchor file.
    J_anchor_pos_file : str
        Path to the J gene CDR3 anchor file.
    output_cols : list
        Names of the three output columns.
    n : int
        Number of sequences to generate.

    Returns
    -------
    pandas.DataFrame
        One row per generated sequence, with the V and J indices replaced
        by the first entry of the matching ``genV`` / ``genJ`` record.

    """
    # Load the genomic data and the generative model.
    data = load_model.GenomicDataVDJ()
    data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)

    marginals = load_model.GenerativeModelVDJ()
    marginals.load_and_process_igor_model(marginals_file_name)

    generator = seq_gen.SequenceGenerationVDJ(marginals, data)

    # Index -> gene name lookups.
    v_names = dict(enumerate(record[0] for record in vars(data)['genV']))
    j_names = dict(enumerate(record[0] for record in vars(data)['genJ']))

    # Elements 1..3 of each draw are kept; presumably (CDR3 aa sequence,
    # V index, J index) -- confirm against OLGA's gen_rnd_prod_CDR3.
    draws = [generator.gen_rnd_prod_CDR3()[1:4] for _ in range(n)]

    rows = []
    for cdr3, v_index, j_index in draws:
        rows.append((cdr3, v_names[v_index], j_names[j_index]))

    return pd.DataFrame(rows, columns=output_cols)
def compute_pgen(index, seq):
    """Compute the normalised generation probability of one sequence.

    Parameters
    ----------
    index : int or str
        Index of the chain type in ``options_of`` / ``norms``. It is cast
        to ``int`` once and that value is used for every lookup.
    seq : sequence
        ``seq[0]``, ``seq[1]`` and ``seq[2]`` are passed, in that order, to
        ``compute_aa_CDR3_pgen`` (presumably CDR3, V mask, J mask --
        confirm against OLGA).

    Returns
    -------
    float
        ``compute_aa_CDR3_pgen(...)`` divided by ``norms[index][0]``.

    """
    index_ = int(index)
    chain_type = options_of[index_]

    main_folder = os.path.join(local_directory, 'default_models', chain_type)
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # The VJ test must use the same (cast) index as the folder lookup;
    # otherwise a string index selects the folder but fails here.
    if chain_type in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
        generative_model = olga_load_model.GenerativeModelVJ()
        pgen_class = pgen.GenerationProbabilityVJ
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        generative_model = olga_load_model.GenerativeModelVDJ()
        pgen_class = pgen.GenerationProbabilityVDJ

    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model.load_and_process_igor_model(marginals_file_name)
    pgen_model = pgen_class(generative_model, genomic_data)

    return pgen_model.compute_aa_CDR3_pgen(seq[0], seq[1], seq[2]) / norms[index_][0]
def __init__(self, sonia_model=None, include_genes=True, processes=None, custom_olga_model=None):
    """Store the Sonia model and set up the OLGA generation-probability model.

    Parameters
    ----------
    sonia_model : object
        Sonia model instance. A string (or any ``str`` subclass) or None is
        rejected: an error is printed and no attribute is set.
    include_genes : bool
        Stored on the instance as ``include_genes``.
    processes : int
        Number of processes stored as ``processes``. If None,
        ``mp.cpu_count()`` is used.
    custom_olga_model : object
        Ready-made OLGA Pgen model, stored as ``pgen_model``. If None, the
        model is loaded from 'default_models/<chain_type>' next to this
        file, as VJ or VDJ depending on ``sonia_model.vj``.

    """
    if isinstance(sonia_model, str) or sonia_model is None:
        print('ERROR: you need to pass a Sonia object')
        return

    self.sonia_model = sonia_model
    self.include_genes = include_genes
    self.processes = mp.cpu_count() if processes is None else processes

    # define olga model
    if custom_olga_model is not None:
        self.pgen_model = custom_olga_model
        return

    main_folder = os.path.join(os.path.dirname(__file__), 'default_models',
                               self.sonia_model.chain_type)
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if self.sonia_model.vj:
        genomic_data = olga_load_model.GenomicDataVJ()
        generative_model = olga_load_model.GenerativeModelVJ()
        pgen_class = pgen.GenerationProbabilityVJ
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        generative_model = olga_load_model.GenerativeModelVDJ()
        pgen_class = pgen.GenerationProbabilityVDJ

    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model.load_and_process_igor_model(marginals_file_name)
    self.pgen_model = pgen_class(generative_model, genomic_data)
def sample_olga(num_gen_seqs=1, chain_index=0, ppost=False, seed=None):
    """Generate productive sequences with OLGA, optionally selection-weighted.

    Parameters
    ----------
    num_gen_seqs : int
        Number of sequences requested; capped at 1000.
    chain_index : int
        Index of the chain type in ``options_of`` (and, for ``ppost``, in
        ``qfiles`` and ``norms``).
    ppost : bool
        If false, the raw OLGA draws are returned. If true, draws are
        thinned by rejection sampling with acceptance probability
        ``clip(Q, 0, 10) / 10``, where Q comes from
        ``MinimalSonia.compute_sel_factor``.
    seed : int
        Seed for numpy's global random generator; if None the generator is
        re-seeded without an explicit seed.

    Returns
    -------
    list or numpy.ndarray
        Rows of four entries: elements 0 and 1 of the OLGA draw followed by
        the V and J gene names with the allele suffix ('*...') removed.
        A list of lists without ``ppost``; a 2-D string array of at most
        ``num_gen_seqs`` rows with ``ppost``.

    """
    if seed is not None:
        np.random.seed(seed)
    else:
        np.random.seed()

    num_gen_seqs = min(int(num_gen_seqs), 1000)
    chain_type = options_of[chain_index]

    main_folder = os.path.join(local_directory, 'default_models', chain_type)
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if chain_type in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
        generative_model = olga_load_model.GenerativeModelVJ()
        seq_gen_class = seq_gen.SequenceGenerationVJ
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        generative_model = olga_load_model.GenerativeModelVDJ()
        seq_gen_class = seq_gen.SequenceGenerationVDJ

    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = seq_gen_class(generative_model, genomic_data)

    def draw(count):
        # Generate `count` sequences and replace gene indices by gene names.
        raw = [sg_model.gen_rnd_prod_CDR3() for _ in range(count)]
        return [[seq[0],
                 seq[1],
                 genomic_data.genV[seq[2]][0].split('*')[0],
                 genomic_data.genJ[seq[3]][0].split('*')[0]]
                for seq in raw]

    if not bool(ppost):
        return draw(num_gen_seqs)

    qm = MinimalSonia(qfiles[chain_index], norms[chain_index][1])

    # Start from a genuinely empty table. Counting a placeholder row in the
    # loop condition made the function stop one accepted sequence short
    # (and return nothing at all for num_gen_seqs == 1).
    seqs_post = np.empty((0, 4), dtype=str)
    while len(seqs_post) < num_gen_seqs:
        seqs = np.array(draw(11 * num_gen_seqs))
        Qs = qm.compute_sel_factor(list(seqs[:, 1:]))
        random_samples = np.random.uniform(size=len(Qs))  # uniform draws
        # do rejection
        rejection_selection = random_samples < np.clip(Qs, 0, 10) / 10.
        print(np.sum(rejection_selection) / float(len(rejection_selection)))
        seqs_post = np.concatenate([seqs_post, seqs[rejection_selection]])

    return seqs_post[:num_gen_seqs]
def __init__(self, sonia_model=None, custom_olga_model=None, custom_genomic_data=None):
    """Store the Sonia model and set up the OLGA sequence-generation model.

    Parameters
    ----------
    sonia_model : object
        Sonia model instance. A string or None is rejected: an error is
        printed and no attribute is set.
    custom_olga_model : object
        Ready-made OLGA sequence-generation model, stored as
        ``seq_gen_model``. Requires ``custom_genomic_data``.
    custom_genomic_data : object
        Genomic data belonging to ``custom_olga_model``, stored as
        ``genomic_data``. If it is missing while a custom model is given,
        an error is printed and initialisation stops.

    Attributes set
    --------------
    sonia_model, genomic_data, seq_gen_model
    energies_gen : result of ``sonia_model.compute_energy`` on
        ``sonia_model.gen_seq_features``.
    Z : mean of ``exp(-energies_gen)``.

    """
    if isinstance(sonia_model, str) or sonia_model is None:
        print('ERROR: you need to pass a Sonia object')
        return

    self.sonia_model = sonia_model  # sonia model passed as an argument

    # define olga sequence_generation model
    if custom_olga_model is not None:
        if custom_genomic_data is None:
            print('ERROR: you need to pass also the custom_genomic_data')
            return
        self.genomic_data = custom_genomic_data
        self.seq_gen_model = custom_olga_model
    else:
        chain_type = self.sonia_model.chain_type
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models', chain_type)

        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

        # Every default chain without a D segment is a VJ model, not only
        # the human alpha chain; a Sonia model flagged as ``vj`` counts too.
        vj_chain_types = ('human_T_alpha', 'human_B_kappa',
                          'human_B_lambda', 'mouse_T_alpha')
        is_vj = (chain_type in vj_chain_types
                 or bool(getattr(self.sonia_model, 'vj', False)))

        if is_vj:
            self.genomic_data = olga_load_model.GenomicDataVJ()
            generative_model = olga_load_model.GenerativeModelVJ()
            seq_gen_class = seq_gen.SequenceGenerationVJ
        else:
            self.genomic_data = olga_load_model.GenomicDataVDJ()
            generative_model = olga_load_model.GenerativeModelVDJ()
            seq_gen_class = seq_gen.SequenceGenerationVDJ

        self.genomic_data.load_igor_genomic_data(params_file_name,
                                                 V_anchor_pos_file,
                                                 J_anchor_pos_file)
        generative_model.load_and_process_igor_model(marginals_file_name)
        self.seq_gen_model = seq_gen_class(generative_model, self.genomic_data)

    # you need Z for rejection selection and generate sequences ppost
    # --> compute only once
    self.energies_gen = self.sonia_model.compute_energy(
        self.sonia_model.gen_seq_features)
    self.Z = np.sum(np.exp(-self.energies_gen)) / len(self.energies_gen)
def add_features(self, custom_pgen_model=None):
    """Generate the gene/length feature lists for an L/R position model.

    Exactly one feature family is generated, chosen by the first true flag
    among ``self.joint_vjl``, ``self.include_indep_genes`` and
    ``self.include_joint_genes``; if none is set the feature list is empty.
    The result is passed to ``self.update_model``.

    Parameters
    ----------
    custom_pgen_model : str
        Path to the folder of a custom OLGA model. If None, the folder
        'default_models/<chain_type>' next to this file is used.

    """
    import olga.load_model as olga_load_model

    features = []
    lengths = range(1, self.max_L + 1)
    L_features = [['l' + str(L)] for L in lengths]

    if custom_pgen_model is None:
        main_folder = os.path.join(os.path.dirname(__file__),
                                   'default_models', self.chain_type)
    else:
        main_folder = custom_pgen_model

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if self.vj:
        genomic_data = olga_load_model.GenomicDataVJ()
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)

    # In every branch the gene-name sets are built once, not once per
    # iteration of the enclosing loop.
    if self.joint_vjl:
        # Joint (V, J, length) features; gene names are shortened inline
        # here rather than through gene_to_num_str.
        v_genes = {'v' + genV[0].split('*')[0].split('V')[-1]
                   for genV in genomic_data.genV}
        j_genes = {'j' + genJ[0].split('*')[0].split('J')[-1]
                   for genJ in genomic_data.genJ}
        features += [[v, j, 'l' + str(l)]
                     for v in v_genes for j in j_genes for l in lengths]
    elif self.include_indep_genes:
        v_genes = {gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV}
        j_genes = {gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ}
        features += L_features
        features += [[v] for v in v_genes]
        features += [[j] for j in j_genes]
    elif self.include_joint_genes:
        v_genes = {gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV}
        j_genes = {gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ}
        features += L_features
        features += [[v, j] for v in v_genes for j in j_genes]

    self.update_model(add_features=features)
def define_olga_models(self, olga_model=None):
    """Define the OLGA pgen and seqgen models and keep them as attributes.

    Parameters
    ----------
    olga_model : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. The folder must contain
        'model_params.txt', 'model_marginals.txt',
        'V_gene_CDR3_anchors.csv' and 'J_gene_CDR3_anchors.csv'. A relative
        path is resolved against the current working directory. If None,
        the 'default_models/<chain_type>' folder shipped with OLGA is used.

    Attributes set
    --------------
    genomic_data : object
        Genomic data associated with the OLGA model.
    pgen_model : object
        OLGA model for the evaluation of pgen; its ``V_mask_mapping`` is
        replaced by the result of ``self.complement_V_mask``.
    seq_gen_model : object
        OLGA model for the generation of sequences.

    """
    # Load generative model
    if olga_model is not None:
        try:
            # os.path.join returns olga_model unchanged when it is already
            # absolute, so this covers relative and absolute paths alike.
            main_folder = os.path.join(os.getcwd(), olga_model)
        except OSError:
            # The working directory is not accessible: use the path as given.
            main_folder = olga_model
    else:
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models', self.chain_type)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    genomic_data = olga_load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)
    self.genomic_data = genomic_data

    generative_model = olga_load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(marginals_file_name)

    self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)
    self.pgen_model.V_mask_mapping = self.complement_V_mask(self.pgen_model)

    self.seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data)
def add_features(self, include_indep_genes=False, include_joint_genes=True, custom_pgen_model=None):
    """Generate joint (V gene, J gene, CDR3 length) features.

    One feature ``[v, j, 'l<L>']`` is generated for every combination of V
    gene, J gene and length between ``self.min_L`` and ``self.max_L``
    (inclusive). The result is passed to ``self.update_model``.

    Parameters
    ----------
    include_indep_genes : bool
        Not used by this method.
    include_joint_genes : bool
        Not used by this method.
    custom_pgen_model : str
        Path to the folder of a custom OLGA model. If None, the
        'default_models/<chain_type>' folder shipped with OLGA is used.

    """
    import olga.load_model as olga_load_model

    if custom_pgen_model is None:
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models', self.chain_type)
    else:
        main_folder = custom_pgen_model

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    genomic_data = olga_load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)

    # Shortened gene names: allele suffix dropped, text up to the last
    # 'V' / 'J' replaced by a lower-case prefix. Built once, outside the
    # nested comprehension below.
    v_genes = {'v' + genV[0].split('*')[0].split('V')[-1]
               for genV in genomic_data.genV}
    j_genes = {'j' + genJ[0].split('*')[0].split('J')[-1]
               for genJ in genomic_data.genJ}
    lengths = range(self.min_L, self.max_L + 1)

    features = [[v, j, 'l' + str(l)]
                for v in v_genes for j in j_genes for l in lengths]

    self.update_model(add_features=features)
def define_olga_models(self, olga_model=None):
    """Define the OLGA pgen and seqgen models and keep them as attributes.

    Parameters
    ----------
    olga_model : str
        Path to a folder holding a custom IGoR formatted model
        ('model_params.txt', 'model_marginals.txt',
        'V_gene_CDR3_anchors.csv', 'J_gene_CDR3_anchors.csv'). A relative
        path is resolved against the current working directory. If None,
        the 'default_models/<chain_type>' folder shipped with OLGA is used.

    Attributes set
    --------------
    genomic_data : object
        Genomic data loaded from the model folder.
    pgen_model : object
        OLGA VDJ generation-probability model; its ``V_mask_mapping`` is
        replaced by the result of ``self.complement_V_mask``.
    seq_gen_model : object
        OLGA VDJ sequence-generation model.

    """
    import olga.load_model as load_model
    import olga.generation_probability as pgen
    import olga.sequence_generation as seq_gen

    # Load generative model
    if olga_model is not None:
        try:
            # os.path.join returns olga_model unchanged when it is already
            # absolute, so this covers relative and absolute paths alike.
            main_folder = os.path.join(os.getcwd(), olga_model)
        except OSError:
            # The working directory is not accessible: use the path as given.
            main_folder = olga_model
    else:
        main_folder = os.path.join(os.path.dirname(load_model.__file__),
                                   'default_models', self.chain_type)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    genomic_data = load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name,
                                        V_anchor_pos_file,
                                        J_anchor_pos_file)
    self.genomic_data = genomic_data

    generative_model = load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(marginals_file_name)

    self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)
    self.pgen_model.V_mask_mapping = self.complement_V_mask(self.pgen_model)

    self.seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data)
def _load_params(self, model_params):
    """Load the genomic parameter data of an IGoR model.

    Parameters
    ----------
    model_params : str
        A file path location for the IGoR parameters model file.

    Returns
    -------
    GenomicDataVJ or GenomicDataVDJ OLGA object
        The genomic data object for a VJ or VDJ model, with its gene
        parameters (D for VDJ only, then V and J) read from the file.

    Raises
    ------
    OSError
        Whenever loading fails. This includes the ``TypeError`` raised
        internally when ``self.type`` is neither 'VDJ' nor 'VJ': every
        exception is caught and re-raised as ``OSError``.

    """
    try:
        model_type = self.type
        if model_type not in ('VDJ', 'VJ'):
            raise TypeError(
                "Model genomic data could not be loaded as 'VDJ' or 'VJ' type"
            )

        if model_type == 'VDJ':
            # Only VDJ models carry D gene parameters.
            data = olga_load_model.GenomicDataVDJ()
            data.genD = olga_load_model.read_igor_D_gene_parameters(model_params)
        else:
            data = olga_load_model.GenomicDataVJ()

        # V and J gene parameters are common to both model types.
        data.genV = olga_load_model.read_igor_V_gene_parameters(model_params)
        data.genJ = olga_load_model.read_igor_J_gene_parameters(model_params)
        return data
    except Exception as err:
        raise OSError(err)
def return_genes(index):
    """Return the V and J gene names of a default model, minus excluded ones.

    Parameters
    ----------
    index : int
        Index of the chain type in ``options_of``.

    Returns
    -------
    tuple of (list, list)
        Sorted unique V gene names and J gene names with the allele suffix
        ('*...') removed. For 'human_T_alpha' and 'human_B_heavy' a fixed
        set of genes is removed first; removing a gene that is not present
        raises ``ValueError``.

    """
    chain_type = options_of[index]
    model_dir = os.path.join(local_directory, 'default_models', chain_type)
    params_path = os.path.join(model_dir, 'model_params.txt')
    v_anchor_path = os.path.join(model_dir, 'V_gene_CDR3_anchors.csv')
    j_anchor_path = os.path.join(model_dir, 'J_gene_CDR3_anchors.csv')

    if chain_type in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_path, v_anchor_path, j_anchor_path)

    # select out genes
    gene_v = list(np.unique([record[0].split('*')[0] for record in genomic_data.genV]))
    gene_j = list(np.unique([record[0].split('*')[0] for record in genomic_data.genJ]))

    # select out bad genes
    excluded = {
        'human_T_alpha': (['TRAV8-4', 'TRAV3', 'TRAV26-2'], ['TRAJ9', 'TRAJ58']),
        'human_B_heavy': (['IGHV1-8', 'IGHV3-9', 'IGHV4-31', 'IGHV4-30-4'], []),
    }
    bad_vs, bad_js = excluded.get(chain_type, ([], []))
    for name in bad_vs:
        gene_v.remove(name)
    for name in bad_js:
        gene_j.remove(name)

    return gene_v, gene_j
def compute_all_pgens(seqs, model=None, processes=None, include_genes=True):
    """Compute Pgen of sequences using OLGA, in parallel.

    Parameters
    ----------
    seqs : list
        Sequences to evaluate; each one is paired with the model and handed
        to the worker function.
    model : object
        OLGA generation-probability model. If None, the default VDJ model
        for ``chain_type`` is loaded from the folder shipped with OLGA.
    processes : int
        Number of worker processes (``None`` lets ``mp.Pool`` decide).
    include_genes : bool
        If True ``compute_pgen_expand`` is used as worker, otherwise
        ``compute_pgen_expand_novj``.

    Returns
    -------
    list
        One worker result per input sequence, in input order.

    """
    # Load OLGA for seq pgen estimation
    if model is None:
        import olga.load_model as load_model
        import olga.generation_probability as pgen

        # NOTE(review): chain_type is not defined in this function;
        # presumably a module-level name -- confirm.
        main_folder = os.path.join(os.path.dirname(load_model.__file__),
                                   'default_models', chain_type)
        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)

        # Bind the loaded model to the name handed to the workers; it used
        # to be stored under a different name, so workers received None.
        model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)

    # every process needs to access this vector, for sure there is a smarter
    # way to implement this.
    final_models = [model] * len(seqs)

    worker = compute_pgen_expand if include_genes else compute_pgen_expand_novj

    pool = mp.Pool(processes=processes)
    try:
        return pool.map(worker, zip(seqs, final_models))
    finally:
        # Release the worker processes even when a worker raises.
        pool.close()
        pool.join()
def main(): """ Evaluate sequences.""" parser = OptionParser(conflict_handler="resolve") #specify model parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)') parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)') parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)') parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') parser.add_option('--humanIGK', '--human_B_kappa', action='store_true', dest='humanIGK', default=False, help='use default human IGK model (B cell light kappa chain)') parser.add_option('--humanIGL', '--human_B_lambda', action='store_true', dest='humanIGL', default=False, help='use default human IGL model (B cell light lambda chain)') parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true', dest='mouseTRA', default=False, help='use default mouse TRA model (T cell alpha chain)') parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') parser.add_option('--sonia_model', type='string', default = 'leftright', dest='model_type' ,help=' specify model type: leftright or lengthpos, default is leftright') parser.add_option('--ppost', '--Ppost', action='store_true', dest='ppost', default=False, help='compute Ppost, also computes pgen and Q') parser.add_option('--pgen', '--Pgen', action='store_true', dest='pgen', default=False, help='compute pgen') parser.add_option('--Q', 
'--selection_factor', action='store_true', dest='Q', default=False, help='compute Q') parser.add_option('--recompute_productive_norm', '--compute_norm', action='store_true', dest='recompute_productive_norm', default=False, help='recompute productive normalization') parser.add_option('--skip_off','--skip_empty_off', action='store_true', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).') parser.add_option('-s','--chunk_size', type='int',metavar='N', dest='chunck_size', default = mp.cpu_count()*int(5e2), help='Number of sequences to evaluate at each iteration') #vj genes parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', default=None, help='specifies V_masks are found in column INDEX in the input file. Default is None (do not condition on J usage).') parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', default=None, help='specifies J_masks are found in column INDEX in the input file. Default is None (do not condition on J usage).') parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition as arguments.') parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition as arguments.') # input output parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE') parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE') parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. 
Default is index 0 (the first column).') parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='evaluate for at most N sequences.') parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.') #delimiters parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.") parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.") parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. 
Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.") parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.") (options, args) = parser.parse_args() #Check that the model is specified properly main_folder = os.path.dirname(__file__) default_models = {} default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ'] default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'), 'VJ'] default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ'] num_models_specified = sum([1 for x in list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) recompute_productive_norm=False if num_models_specified == 1: #exactly one model specified try: d_model = [x for x in default_models.keys() if getattr(options, x)][0] model_folder = default_models[d_model][0] recomb_type = default_models[d_model][1] except IndexError: if options.vdj_model_folder: #custom VDJ model specified recompute_productive_norm=True model_folder = options.vdj_model_folder recomb_type = 'VDJ' elif options.vj_model_folder: #custom VJ model specified recompute_productive_norm=True model_folder = options.vj_model_folder recomb_type = 'VJ' elif num_models_specified == 0: print('Need to indicate generative model.') print('Exiting...') return -1 elif num_models_specified > 
1: print('Only specify one model') print('Exiting...') return -1 #Generative model specification -- note we'll probably change this syntax to #allow for arbitrary model file specification params_file_name = os.path.join(model_folder,'model_params.txt') marginals_file_name = os.path.join(model_folder,'model_marginals.txt') V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: if not os.path.isfile(x): print('Cannot find: ' + x) print('Please check the files (and naming conventions) in the model folder ' + model_folder) print('Exiting...') return -1 #Load up model based on recomb_type #VDJ recomb case --- used for TCRB and IGH if recomb_type == 'VDJ': genomic_data = olga_load_model.GenomicDataVDJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = olga_load_model.GenerativeModelVDJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data) #VJ recomb case --- used for TCRA and light chain elif recomb_type == 'VJ': genomic_data = olga_load_model.GenomicDataVJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = olga_load_model.GenerativeModelVJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data) if options.infile_name is not None: infile_name = options.infile_name if not os.path.isfile(infile_name): print('Cannot find input file: ' + infile_name) print('Exiting...') return -1 if options.outfile_name is not None: outfile_name = options.outfile_name # if os.path.isfile(outfile_name): # if not input(outfile_name + ' already exists. Overwrite (y/n)? 
').strip().lower() in ['y', 'yes']: # print('Exiting...') # return -1 #Parse delimiter delimiter = options.delimiter if delimiter is None: #Default case if options.infile_name is None: delimiter = '\t' elif infile_name.endswith('.tsv'): #parse TAB separated value file delimiter = '\t' elif infile_name.endswith('.csv'): #parse COMMA separated value file delimiter = ',' else: try: delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] except KeyError: pass #Other string passed as the delimiter. #Parse delimiter_out delimiter_out = options.delimiter_out if delimiter_out is None: #Default case if delimiter is None: delimiter_out = '\t' else: delimiter_out = delimiter if options.outfile_name is None: pass elif outfile_name.endswith('.tsv'): #output TAB separated value file delimiter_out = '\t' elif outfile_name.endswith('.csv'): #output COMMA separated value file delimiter_out = ',' else: try: delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out] except KeyError: pass #Other string passed as the delimiter. #Parse gene_delimiter gene_mask_delimiter = options.gene_mask_delimiter if gene_mask_delimiter is None: #Default case gene_mask_delimiter = ',' if delimiter == ',': gene_mask_delimiter = ';' else: try: gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter] except KeyError: pass #Other string passed as the delimiter. 
#More options seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter) lines_to_skip = options.lines_to_skip #one method of skipping header comment_delimiter = options.comment_delimiter #another method of skipping header max_number_of_seqs = options.max_number_of_seqs V_mask_index = options.V_mask_index #Default is not conditioning on V identity J_mask_index = options.J_mask_index #Default is not conditioning on J identity skip_empty = options.skip_empty #print(V_mask_index,J_mask_index,seq_in_index,gene_mask_delimiter,delimiter) # choose sonia model type sonia_model=SoniaLeftposRightpos(feature_file=os.path.join(model_folder,'features.tsv'),log_file=os.path.join(model_folder,'log.txt'),vj=recomb_type == 'VJ',custom_pgen_model=model_folder) if options.recompute_productive_norm: print('Recompute productive normalization.') sonia_model.norm_productive=pgen_model.compute_regex_CDR3_template_pgen('CX{0,}') # load Evaluate model class ev=EvaluateModel(sonia_model, custom_olga_model=pgen_model, include_genes=False if ((V_mask_index is None) and (J_mask_index is None)) else True) if options.infile_name is None: #No infile specified -- args should be the input seq print_warnings = True if len(args)>1 : print('ERROR: can process only one sequence at the time. Submit thourgh file instead.') return -1 seq=args[0] #Format V and J masks -- uniform for all argument input sequences try: V_mask = options.V_mask.split(',') unrecognized_v_genes = [v for v in V_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()] V_mask = [v for v in V_mask if gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping.keys()] if len(unrecognized_v_genes) > 0: print('These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes)) if len(V_mask) == 0: print('No recognized V genes/alleles in the provided V_mask. 
Continuing without conditioning on V usage.') V_mask = None except AttributeError: V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity try: J_mask = options.J_mask.split(',') unrecognized_j_genes = [j for j in J_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()] J_mask = [j for j in J_mask if gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping.keys()] if len(unrecognized_j_genes) > 0: print('These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes)) if len(J_mask) == 0: print('No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.') J_mask = None except AttributeError: J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity print('') if options.ppost: if options.V_mask is None: V_mask=[''] if options.J_mask is None: J_mask=[''] v,j=V_mask[0],J_mask[0] Q,pgen,ppost=ev.evaluate_seqs([[seq,v,j]]) print('Ppost of ' + seq + ' '+v+ ' '+j+ ': ' + str(ppost[0])) print('Pgen of ' + seq + ' '+v+ ' '+j+ ': ' + str(pgen[0])) print('Q of ' + seq + ' '+v+ ' '+j+ ': ' + str(Q[0])) print('') elif options.Q: if options.V_mask is None: V_mask=[''] if options.J_mask is None: J_mask=[''] v,j=V_mask[0],J_mask[0] Q=ev.evaluate_selection_factors([[seq,v,j]]) print('Q of ' + seq + ' '+v+ ' '+j+ ': ' + str(Q[0])) elif options.pgen: pgen=pgen_model.compute_aa_CDR3_pgen(seq,V_mask,J_mask) if J_mask is None: J_mask= '' if V_mask is None: V_mask= '' print('Pgen of ' + seq + ' '+','.join(V_mask)+ ' '+','.join(J_mask)+ ': ' + str(pgen)) else: print('Specify and option: --ppost, --pgen or --Q') else: print('Load file') seqs = [] V_usage_masks = [] J_usage_masks = [] infile = open(infile_name, 'r') for i, line in enumerate(infile): if comment_delimiter is not None: #Default case -- no comments/header delimiter if line.startswith(comment_delimiter): #allow comments continue if i < lines_to_skip: continue if delimiter is None: #Default delimiter is any whitespace 
split_line = line.split('\n')[0].split() else: split_line = line.split('\n')[0].split(delimiter) #Find the seq try: seq = split_line[seq_in_index].strip() if len(seq.strip()) == 0: if skip_empty: continue else: seqs.append(seq) #keep the blank seq as a placeholder #seq_types.append('aaseq') else: seqs.append(seq) #seq_types.append(determine_seq_type(seq, aa_alphabet)) except IndexError: #no index match for seq if skip_empty and len(line.strip()) == 0: continue print('seq_in_index is out of range') print('Exiting...') infile.close() return -1 #Find and format V_usage_mask if V_mask_index is None: V_usage_masks.append(['']) #default mask else: try: V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]): V_usage_masks.append(V_usage_mask) else: print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names") print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()])) print('Exiting...') infile.close() return -1 except IndexError: #no index match for V_mask_index print('V_mask_index is out of range') print('Exiting...') infile.close() return -1 #Find and format J_usage_mask if J_mask_index is None: J_usage_masks.append(['']) #default mask else: try: J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]): J_usage_masks.append(J_usage_mask) else: print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names") print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()])) print('Exiting...') 
infile.close() return -1 except IndexError: #no index match for J_mask_index print('J_mask_index is out of range') print('Exiting...') infile.close() return -1 if max_number_of_seqs is not None: if len(seqs) >= max_number_of_seqs: break # combine sequences. zipped=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))] print('Evaluate') if options.outfile_name is not None: #OUTFILE SPECIFIED with open(options.outfile_name,'w') as file: if options.ppost:file.write('Q'+delimiter_out+'Pgen'+delimiter_out+'Ppost\n') elif options.Q:file.write('Q\n') elif options.pgen:file.write('Pgen\n') else: print('Specify one option: --ppost, --pgen or --Q') return -1 for t in tqdm(chunks(zipped,options.chunck_size)): if options.ppost: Q,pgen,ppost=ev.evaluate_seqs(t) for i in range(len(Q)):file.write(str(Q[i])+delimiter_out+str(pgen[i])+delimiter_out+str(ppost[i])+'\n') elif options.Q: Q=ev.evaluate_selection_factors(t) for i in range(len(Q)):file.write(str(Q[i])+'\n') elif options.pgen: pgens=ev.compute_all_pgens(t) for i in range(len(pgens)):file.write(str(pgens[i])+'\n') else: #print to stdout for t in chunks(zipped,options.chunck_size): if options.ppost: Q,pgen,ppost=ev.evaluate_seqs(t) print ('Q, Pgen, Ppost') for i in range(len(Q)):print(Q[i],pgen[i],ppost[i]) elif options.Q: Q=ev.evaluate_selection_factors(t) print ('Q') print(Q) elif options.pgen: pgens=ev.compute_all_pgens(t) print ('Pgen') print(pgens) else: print('Specify one option: --ppost, --pgen or --Q')
def __init__(self, sonia_model=None, custom_olga_model=None, custom_genomic_data=None):
    """Attach a Sonia model and an OLGA sequence-generation model.

    Parameters
    ----------
    sonia_model : Sonia object
        Model to sample with. Strings and None are rejected: an error is
        printed and the instance is left uninitialised.
    custom_olga_model : OLGA sequence generation object, optional
        Pre-built generation model. When given, custom_genomic_data must
        be given as well and nothing is read from disk.
    custom_genomic_data : OLGA genomic data object, optional
        Genomic data matching custom_olga_model.

    Attributes set
    --------------
    sonia_model, genomic_data, seq_gen_model
        generative_model is additionally set when the model is loaded
        from a model folder.
    """
    if isinstance(sonia_model, str) or sonia_model is None:
        print('ERROR: you need to pass a Sonia object')
        return
    self.sonia_model = sonia_model  # sonia model passed as an argument

    # Caller supplied ready-made OLGA objects: validate and store them.
    if custom_olga_model is not None:
        if isinstance(custom_olga_model, str):
            print('ERROR: you need to pass a olga object for the seq_gen model')
            return
        if custom_genomic_data is None:
            print('ERROR: you need to pass also the custom_genomic_data')
            return
        if isinstance(custom_genomic_data, str):
            print('ERROR: you need to pass a olga object for the genomic_data')
            return
        self.genomic_data = custom_genomic_data
        self.seq_gen_model = custom_olga_model
        return

    # Sonia models without a custom_pgen_model attribute fall back to the
    # bundled default model; only the missing-attribute case is tolerated.
    main_folder = getattr(self.sonia_model, 'custom_pgen_model', None)
    if main_folder is None:
        main_folder = os.path.join(os.path.dirname(__file__),
                                   'default_models',
                                   self.sonia_model.chain_type)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # Pick the OLGA classes matching the recombination type.
    if self.sonia_model.vj:
        genomic_cls = olga_load_model.GenomicDataVJ
        generative_cls = olga_load_model.GenerativeModelVJ
        seq_gen_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        generative_cls = olga_load_model.GenerativeModelVDJ
        seq_gen_cls = seq_gen.SequenceGenerationVDJ

    self.genomic_data = genomic_cls()
    self.genomic_data.load_igor_genomic_data(
        params_file_name, V_anchor_pos_file, J_anchor_pos_file)
    self.generative_model = generative_cls()
    self.generative_model.load_and_process_igor_model(marginals_file_name)
    self.seq_gen_model = seq_gen_cls(self.generative_model, self.genomic_data)
def main():
    """Generate sequences from the command line.

    Parses the options, checks that exactly one generative model is
    selected and that its files exist, loads the OLGA model, builds the
    Sonia model and samples sequences in chunks. Sequences are written to
    the outfile when one is given, otherwise printed to stdout.

    Returns
    -------
    int or None
        -1 when the options are invalid or model files are missing,
        None once generation has finished.
    """
    # (option dest, alternative flag, default_models subfolder,
    #  recombination type, help text)
    default_model_specs = [
        ('humanTRA', '--human_T_alpha', 'human_T_alpha', 'VJ',
         'use default human TRA model (T cell alpha chain)'),
        ('humanTRB', '--human_T_beta', 'human_T_beta', 'VDJ',
         'use default human TRB model (T cell beta chain)'),
        ('mouseTRB', '--mouse_T_beta', 'mouse_T_beta', 'VDJ',
         'use default mouse TRB model (T cell beta chain)'),
        ('humanIGH', '--human_B_heavy', 'human_B_heavy', 'VDJ',
         'use default human IGH model (B cell heavy chain)'),
        ('humanIGK', '--human_B_kappa', 'human_B_kappa', 'VJ',
         'use default human IGK model (B cell light kappa chain)'),
        ('humanIGL', '--human_B_lambda', 'human_B_lambda', 'VJ',
         'use default human IGL model (B cell light lambda chain)'),
        ('mouseTRA', '--mouse_T_alpha', 'mouse_T_alpha', 'VJ',
         'use default mouse TRA model (T cell alpha chain)'),
    ]

    parser = OptionParser(conflict_handler="resolve")

    # specify model
    for dest, alias, _folder, _recomb, help_text in default_model_specs:
        parser.add_option('--' + dest, alias, action='store_true', dest=dest,
                          default=False, help=help_text)
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('--sonia_model', type='string', default='leftright',
                      dest='model_type',
                      help=' specify model type: leftright or lengthpos, default is leftright')
    parser.add_option('--post', '--ppost', action='store_true', dest='ppost',
                      default=False, help='sample from post selected repertoire')
    parser.add_option('--pre', '--pgen', action='store_true', dest='pgen',
                      default=False, help='sample from pre selected repertoire ')
    parser.add_option('--delimiter_out', '-d', type='choice', dest='delimiter_out',
                      choices=['tab', 'space', ',', ';', ':'],
                      help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('-s', '--chunk_size', type='int', metavar='N',
                      dest='chunck_size', default=int(1e3),
                      help='Number of sequences to generate at each iteration')
    parser.add_option('-r', '--rejection_bound', type='int', metavar='N',
                      dest='rejection_bound', default=10,
                      help='limit above which sequences are always accepted.')

    # input output
    parser.add_option('-o', '--outfile', dest='outfile_name',
                      metavar='PATH/TO/FILE',
                      help='write CDR3 sequences to PATH/TO/FILE')
    parser.add_option('-n', '--N', type='int', metavar='N',
                      dest='num_seqs_to_generate', default=1,
                      help='Number of sequences to sample from.')

    (options, args) = parser.parse_args()

    # Check that the model is specified properly
    main_folder = os.path.dirname(__file__)
    default_models = {}
    for dest, _alias, folder, recomb, _help in default_model_specs:
        default_models[dest] = [os.path.join(main_folder, 'default_models', folder), recomb]

    selected_defaults = [name for name in default_models if getattr(options, name)]
    custom_folders = [options.vj_model_folder, options.vdj_model_folder]
    num_models_specified = len(selected_defaults) + sum(1 for folder in custom_folders if folder)

    if num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    if num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    # exactly one model specified
    if selected_defaults:
        model_folder, recomb_type = default_models[selected_defaults[0]]
    elif options.vdj_model_folder:  # custom VDJ model specified
        model_folder = options.vdj_model_folder
        recomb_type = 'VDJ'
    else:  # custom VJ model specified
        model_folder = options.vj_model_folder
        recomb_type = 'VJ'

    # Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None:  # Default case
        delimiter_out = '\t'
        if options.outfile_name is None:
            pass
        elif options.outfile_name.endswith('.tsv'):  # output TAB separated value file
            delimiter_out = '\t'
        elif options.outfile_name.endswith('.csv'):  # output COMMA separated value file
            delimiter_out = ','
    else:
        # Translate the named choices; any other string is used verbatim.
        named_delimiters = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}
        delimiter_out = named_delimiters.get(delimiter_out, delimiter_out)

    # Generative model specification -- note we'll probably change this syntax to
    # allow for arbitrary model file specification
    params_file_name = os.path.join(model_folder, 'model_params.txt')
    marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
        if not os.path.isfile(x):
            print('Cannot find: ' + x)
            print('Please check the files (and naming conventions) in the model folder ' + model_folder)
            print('Exiting...')
            return -1

    # Validate the sampling mode before loading any model or opening the
    # outfile, so a missing flag cannot leave a truncated outfile behind.
    if not (options.pgen or options.ppost):
        print ('ERROR: give option between --pre or --post')
        return -1

    # Load up model based on recomb_type
    if recomb_type == 'VDJ':  # VDJ recomb case --- used for TCRB and IGH
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVDJ(generative_model, genomic_data)
    elif recomb_type == 'VJ':  # VJ recomb case --- used for TCRA and light chain
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVJ(generative_model, genomic_data)

    # --pre uses a default-constructed Sonia model; otherwise the model is
    # read from the features/log files inside the model folder.
    if options.pgen:
        sonia_model = SoniaLeftposRightpos()
    else:
        sonia_model = SoniaLeftposRightpos(
            feature_file=os.path.join(model_folder, 'features.tsv'),
            log_file=os.path.join(model_folder, 'log.txt'),
            vj=recomb_type == 'VJ')

    # load sequence generation class
    generator = SequenceGeneration(sonia_model, custom_olga_model=seqgen_model,
                                   custom_genomic_data=genomic_data)

    def generate_chunk(num_seqs):
        # --pre takes precedence when both flags are given.
        if options.pgen:
            return generator.generate_sequences_pre(num_seqs=num_seqs, nucleotide=True)
        return generator.generate_sequences_post(num_seqs=num_seqs, nucleotide=True,
                                                 upper_bound=options.rejection_bound)

    to_generate = chuncks(options.num_seqs_to_generate, options.chunck_size)
    if options.outfile_name is not None:  # OUTFILE SPECIFIED
        with open(options.outfile_name, 'w') as outfile:
            for t in tqdm(to_generate):
                for seq in generate_chunk(t):
                    outfile.write(delimiter_out.join((seq[0], seq[1], seq[2], seq[3])) + '\n')
    else:  # print to stdout
        for t in to_generate:
            for seq in generate_chunk(t):
                print(seq[0], seq[1], seq[2], seq[3])
def add_generated_seqs(self, num_gen_seqs=0, reset_gen_seqs=True, custom_model_folder=None):
    """Generates MonteCarlo sequences for gen_seqs using OLGA.

    Only generates seqs from a V(D)J model. Requires the OLGA package
    (pip install olga).

    Parameters
    ----------
    num_gen_seqs : int or float
        Number of MonteCarlo sequences to generate and add to the specified
        sequence pool.
    reset_gen_seqs : bool
        If True, self.gen_seqs is emptied before the new sequences are added.
    custom_model_folder : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt'
        and 'model_marginals.txt'. Missing anchor files are taken from the
        OLGA default model for self.chain_type.

    Returns
    -------
    None
        Also returned early (after printing a message) when the params or
        marginals file cannot be found.

    Attributes set
    --------------
    gen_seqs : list
        MonteCarlo sequences drawn from a VDJ recomb model
    gen_seq_features : list
        Features gen_seqs have been projected onto.
    """
    # Load generative model
    if custom_model_folder is None:
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models', self.chain_type)
    else:
        main_folder = custom_model_folder

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if not (os.path.isfile(params_file_name) and os.path.isfile(marginals_file_name)):
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Cannot find specified custom generative model files: ' + '\n'
              + params_file_name + '\n' + marginals_file_name)
        print('Exiting sequence generation...')
        return None

    # Fall back to the OLGA default anchor files when the folder lacks them.
    if not os.path.isfile(V_anchor_pos_file):
        V_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__),
                                         'default_models', self.chain_type,
                                         'V_gene_CDR3_anchors.csv')
    if not os.path.isfile(J_anchor_pos_file):
        J_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__),
                                         'default_models', self.chain_type,
                                         'J_gene_CDR3_anchors.csv')

    # Chain types ending in 'TRA' are loaded as VJ models, all others as VDJ.
    if self.chain_type.endswith('TRA'):
        genomic_cls = olga_load_model.GenomicDataVJ
        generative_cls = olga_load_model.GenerativeModelVJ
        seq_gen_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        generative_cls = olga_load_model.GenerativeModelVDJ
        seq_gen_cls = seq_gen.SequenceGenerationVDJ

    genomic_data = genomic_cls()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model = generative_cls()
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = seq_gen_cls(generative_model, genomic_data)

    # Generate sequences: keep element 1 of each generated record and the
    # V/J gene names (looked up via elements 2 and 3) without allele suffix.
    generated = [sg_model.gen_rnd_prod_CDR3() for _ in range(int(num_gen_seqs))]
    seqs = [[seq[1],
             genomic_data.genV[seq[2]][0].split('*')[0],
             genomic_data.genJ[seq[3]][0].split('*')[0]]
            for seq in generated]

    if reset_gen_seqs:  # reset gen_seqs if needed
        self.gen_seqs = []

    # Add to specified pool(s)
    self.update_model(add_gen_seqs=seqs)
def add_generated_seqs(self, num_gen_seqs=0, reset_gen_seqs=True, custom_model_folder=None, add_error=False, custom_error=None):
    """Generates MonteCarlo sequences for gen_seqs using OLGA.

    Only generates seqs from a V(D)J model. Requires the OLGA package
    (pip install olga).

    Parameters
    ----------
    num_gen_seqs : int or float
        Number of MonteCarlo sequences to generate and add to the specified
        sequence pool.
    reset_gen_seqs : bool
        If True, self.gen_seqs is emptied before the new sequences are added.
    custom_model_folder : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt'
        and 'model_marginals.txt'. When None, self.custom_pgen_model is
        used if set, otherwise the bundled default model for
        self.chain_type.
    add_error : bool
        Simulate sequencing error. Default is False.
    custom_error : number
        Custom error rate for sequencing error. Default is the value on
        the last non-blank line of 'model_params.txt'.

    Returns
    -------
    None
        Also returned early (after printing a message) when the params or
        marginals file cannot be found.

    Raises
    ------
    ValueError
        If custom_error is None and no error rate can be read from the
        params file.

    Attributes set
    --------------
    error_rate : float
        Error rate read from the params file, or custom_error.
    gen_seqs : list
        MonteCarlo sequences drawn from a VDJ recomb model
    gen_seq_features : list
        Features gen_seqs have been projected onto.
    """
    # Load generative model
    if custom_model_folder is not None:
        main_folder = custom_model_folder
    else:
        # Objects without a custom_pgen_model attribute use the default
        # model; only the missing-attribute case is tolerated.
        main_folder = getattr(self, 'custom_pgen_model', None)
        if main_folder is None:
            main_folder = os.path.join(os.path.dirname(__file__),
                                       'default_models', self.chain_type)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if not (os.path.isfile(params_file_name) and os.path.isfile(marginals_file_name)):
        print('Cannot find specified custom generative model files: ' + '\n'
              + params_file_name + '\n' + marginals_file_name)
        print('Exiting sequence generation...')
        return None

    # Fall back to the OLGA default anchor files when the folder lacks them.
    if not os.path.isfile(V_anchor_pos_file):
        V_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__),
                                         'default_models', self.chain_type,
                                         'V_gene_CDR3_anchors.csv')
    if not os.path.isfile(J_anchor_pos_file):
        J_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__),
                                         'default_models', self.chain_type,
                                         'J_gene_CDR3_anchors.csv')

    if custom_error is None:
        # The error rate is taken from the last non-blank line of the file.
        with open(params_file_name, 'r') as params_file:
            lines = params_file.read().splitlines()
        last_entry = next((line for line in reversed(lines) if line.strip()), None)
        if last_entry is None:
            raise ValueError('Cannot read an error rate from empty file: '
                             + params_file_name)
        self.error_rate = float(last_entry)
    else:
        self.error_rate = custom_error

    if self.vj:
        genomic_cls = olga_load_model.GenomicDataVJ
        generative_cls = olga_load_model.GenerativeModelVJ
        seq_gen_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        generative_cls = olga_load_model.GenerativeModelVDJ
        seq_gen_cls = seq_gen.SequenceGenerationVDJ

    genomic_data = genomic_cls()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model = generative_cls()
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = seq_gen_cls(generative_model, genomic_data)

    # Generate sequences
    print('Generate sequences.')
    # All sequences are generated before any error is injected, so the two
    # steps never interleave.
    generated = [
        sg_model.gen_rnd_prod_CDR3(conserved_J_residues='ABCEDFGHIJKLMNOPQRSTUVWXYZ')
        for _ in tqdm(range(int(num_gen_seqs)))
    ]

    if add_error:
        # Only needed on this path, so imported here.
        from sonia.utils import add_random_error
        from olga.utils import nt2aa
        cdr3s = [nt2aa(add_random_error(seq[0], self.error_rate)) for seq in generated]
    else:
        cdr3s = [seq[1] for seq in generated]

    seqs = [[cdr3,
             genomic_data.genV[seq[2]][0].split('*')[0],
             genomic_data.genJ[seq[3]][0].split('*')[0]]
            for cdr3, seq in zip(cdr3s, generated)]

    if reset_gen_seqs:  # reset gen_seqs if needed
        self.gen_seqs = []

    # Add to specified pool(s)
    self.update_model(add_gen_seqs=seqs)
def __init__(self, sonia_model=None, include_genes=True, processes=None, custom_olga_model=None):
    """Prepare a Sonia model and an OLGA Pgen model for evaluation.

    Parameters
    ----------
    sonia_model : Sonia object
        Model to evaluate with. Strings and None are rejected: an error is
        printed and the instance is left uninitialised.
    include_genes : bool
        Stored as self.include_genes.
    processes : int, optional
        Stored as self.processes; defaults to mp.cpu_count().
    custom_olga_model : OLGA generation probability object, optional
        Pre-built Pgen model. When None, the OLGA default model for
        sonia_model.chain_type is loaded from disk.

    Attributes set
    --------------
    sonia_model, include_genes, processes, energies_gen, Z, pgen_model, norm
    """
    if isinstance(sonia_model, str) or sonia_model is None:
        print('ERROR: you need to pass a Sonia object')
        return

    self.sonia_model = sonia_model
    self.include_genes = include_genes
    self.processes = mp.cpu_count() if processes is None else processes

    # you need Z for everything, better to compute it once at the beginning
    # (uses at most the first 1e6 generated-sequence feature lists)
    self.energies_gen = self.sonia_model.compute_energy(
        self.sonia_model.gen_seq_features[:int(1e6)])
    self.Z = np.sum(np.exp(-self.energies_gen)) / len(self.energies_gen)

    # define olga model
    if custom_olga_model is not None:
        self.pgen_model = custom_olga_model
    else:
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models',
                                   self.sonia_model.chain_type)
        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

        # 'human_T_alpha' is the only chain type loaded as a VJ model here.
        if self.sonia_model.chain_type != 'human_T_alpha':
            genomic_cls = olga_load_model.GenomicDataVDJ
            generative_cls = olga_load_model.GenerativeModelVDJ
            pgen_cls = pgen.GenerationProbabilityVDJ
        else:
            genomic_cls = olga_load_model.GenomicDataVJ
            generative_cls = olga_load_model.GenerativeModelVJ
            # A VJ model must be paired with the VJ Pgen class; the VDJ
            # class was previously used for both recombination types.
            pgen_cls = pgen.GenerationProbabilityVJ

        genomic_data = genomic_cls()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = generative_cls()
        generative_model.load_and_process_igor_model(marginals_file_name)
        self.pgen_model = pgen_cls(generative_model, genomic_data)

    # Pgen of the 'X{0,}' regex template, stored as the normalisation.
    self.norm = self.pgen_model.compute_regex_CDR3_template_pgen('X{0,}')
def main(): """Compute Pgens from a file and output to another file.""" parser = OptionParser(conflict_handler="resolve") parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)') parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)') parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)') parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE') parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE') parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).') parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX in the input file. Default is no V mask.') parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX in the input file. 
Default is no J mask.') parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on for seqs read in as arguments.') parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on for seqs read in as arguments.') parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.') parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.') parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.") parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out', choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'") parser.add_option('--skip_off','--skip_empty_off', action='store_true', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).') parser.add_option('--display_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in write-to-file mode). Default is on.') parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. 
Also used to determine the number of sequences to average over for speed and time estimates.') parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).') parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 1e5.') parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.") parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.") parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. 
Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.") parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.") (options, args) = parser.parse_args() #Check that the model is specified properly main_folder = os.path.dirname(__file__) default_models = {} default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) if num_models_specified == 1: #exactly one model specified try: d_model = [x for x in default_models.keys() if getattr(options, x)][0] model_folder = default_models[d_model][0] recomb_type = default_models[d_model][1] except IndexError: if options.vdj_model_folder: #custom VDJ model specified model_folder = options.vdj_model_folder recomb_type = 'VDJ' elif options.vj_model_folder: #custom VJ model specified model_folder = options.vj_model_folder recomb_type = 'VJ' elif num_models_specified == 0: print 'Need to indicate generative model.' print 'Exiting...' return -1 elif num_models_specified > 1: print 'Only specify one model' print 'Exiting...' return -1 #Check that all model and genomic files exist in the indicated model folder if not os.path.isdir(model_folder): print 'Check pathing... cannot find the model folder: ' + model_folder print 'Exiting...' 
return -1 params_file_name = os.path.join(model_folder,'model_params.txt') marginals_file_name = os.path.join(model_folder,'model_marginals.txt') V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: if not os.path.isfile(x): print 'Cannot find: ' + x print 'Please check the files (and naming conventions) in the model folder ' + model_folder print 'Exiting...' return -1 alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified if alphabet_filename is not None: if not os.path.isfile(alphabet_filename): print 'Cannot find custom alphabet file: ' + infile_name print 'Exiting...' return -1 #Load up model based on recomb_type #VDJ recomb case --- used for TCRB and IGH if recomb_type == 'VDJ': genomic_data = load_model.GenomicDataVDJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = load_model.GenerativeModelVDJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename) #VJ recomb case --- used for TCRA and light chain elif recomb_type == 'VJ': genomic_data = load_model.GenomicDataVJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = load_model.GenerativeModelVJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename) aa_alphabet = ''.join(pgen_model.codons_dict.keys()) if options.infile_name is not None: infile_name = options.infile_name if not os.path.isfile(infile_name): print 'Cannot find input file: ' + infile_name print 'Exiting...' 
return -1 if options.outfile_name is not None: outfile_name = options.outfile_name if os.path.isfile(outfile_name): if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: print 'Exiting...' return -1 #Parse delimiter delimiter = options.delimiter if delimiter is None: #Default case if options.infile_name is None: delimiter = '\t' elif infile_name.endswith('.tsv'): #parse TAB separated value file delimiter = '\t' elif infile_name.endswith('.csv'): #parse COMMA separated value file delimiter = ',' else: try: delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] except KeyError: pass #Other string passed as the delimiter. #Parse delimiter_out delimiter_out = options.delimiter_out if delimiter_out is None: #Default case if delimiter is None: delimiter_out = '\t' else: delimiter_out = delimiter if options.outfile_name is None: pass elif outfile_name.endswith('.tsv'): #output TAB separated value file delimiter_out = '\t' elif outfile_name.endswith('.csv'): #output COMMA separated value file delimiter_out = ',' else: try: delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out] except KeyError: pass #Other string passed as the delimiter. #Parse gene_delimiter gene_mask_delimiter = options.gene_mask_delimiter if gene_mask_delimiter is None: #Default case gene_mask_delimiter = ',' if delimiter == ',': gene_mask_delimiter = ';' else: try: gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter] except KeyError: pass #Other string passed as the delimiter. 
#More options time_updates = options.time_updates display_seqs = options.display_seqs num_lines_for_display = options.num_lines_for_display seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter) lines_to_skip = options.lines_to_skip #one method of skipping header comment_delimiter = options.comment_delimiter #another method of skipping header seqs_per_time_update = options.seqs_per_time_update max_number_of_seqs = options.max_number_of_seqs V_mask_index = options.V_mask_index #Default is not conditioning on V identity J_mask_index = options.J_mask_index #Default is not conditioning on J identity skip_empty = options.skip_empty seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both if seq_type_out is not None: seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out] if options.infile_name is None: #No infile specified -- args should be the input seqs print_warnings = True seqs = args seq_types = [determine_seq_type(seq, aa_alphabet) for seq in seqs] unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None] if len(unrecognized_seqs) > 0 and print_warnings: print 'The following sequences/arguments were not recognized: ' + ', '.join(unrecognized_seqs) seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None] seq_types = [seq_type for seq_type in seq_types if seq_type is not None] #Format V and J masks -- uniform for all argument input sequences try: V_mask = options.V_mask.split(',') unrecognized_v_genes = [v for v in V_mask if v not in pgen_model.V_mask_mapping.keys()] V_mask = [v for v in V_mask if v in pgen_model.V_mask_mapping.keys()] if len(unrecognized_v_genes) > 0: print 'These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes) if len(V_mask) == 0: print 'No recognized V genes/alleles in the provided V_mask. Continuing without conditioning on V usage.' 
V_mask = None except AttributeError: V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity try: J_mask = options.J_mask.split(',') unrecognized_j_genes = [j for j in J_mask if j not in pgen_model.J_mask_mapping.keys()] J_mask = [j for j in J_mask if j in pgen_model.J_mask_mapping.keys()] if len(unrecognized_j_genes) > 0: print 'These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes) if len(J_mask) == 0: print 'No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.' J_mask = None except AttributeError: J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity print '' start_time = time.time() for seq, seq_type in zip(seqs, seq_types): if seq_type == 'aaseq': c_pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings) print 'Pgen of the amino acid sequence ' + seq + ': ' + str(c_pgen) print '' elif seq_type == 'regex': c_pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings) print 'Pgen of the regular expression sequence ' + seq + ': ' + str(c_pgen) print '' elif seq_type == 'ntseq': if seq_type_out is None or seq_type_out == 'ntseq': c_pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings) print 'Pgen of the nucleotide sequence ' + seq + ': ' + str(c_pgen_nt) if seq_type_out is None or seq_type_out == 'aaseq': c_pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings) print 'Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(c_pgen_aa) print '' c_time = time.time() - start_time if c_time > 86400: #more than a day c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 3600: #more than an hr c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 60: #more than a 
min c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) else: c_time_str = '%.2f seconds.'%(c_time) print 'Completed pgen computation in: ' + c_time_str else: #Read sequences in from file print_warnings = False #Most cases of reading in from file should have warnings disabled seqs = [] seq_types = [] V_usage_masks = [] J_usage_masks = [] infile = open(infile_name, 'r') for i, line in enumerate(infile): if comment_delimiter is not None: #Default case -- no comments/header delimiter if line.startswith(comment_delimiter): #allow comments continue if i < lines_to_skip: continue if delimiter is None: #Default delimiter is any whitespace split_line = line.split() else: split_line = line.split(delimiter) #Find the seq try: seq = split_line[seq_in_index].strip() if len(seq.strip()) == 0: if skip_empty: continue else: seqs.append(seq) #keep the blank seq as a placeholder seq_types.append('aaseq') else: seqs.append(seq) seq_types.append(determine_seq_type(seq, aa_alphabet)) except IndexError: #no index match for seq if skip_empty and len(line.strip()) == 0: continue print 'seq_in_index is out of range' print 'Exiting...' infile.close() return -1 #Find and format V_usage_mask if V_mask_index is None: V_usage_masks.append(None) #default mask else: try: V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([v in pgen_model.V_mask_mapping for v in V_usage_mask]): V_usage_masks.append(V_usage_mask) else: print str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names" print 'Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if not v in pgen_model.V_mask_mapping.keys()]) print 'Exiting...' infile.close() return -1 except IndexError: #no index match for V_mask_index print 'V_mask_index is out of range' print 'Exiting...' 
infile.close() return -1 #Find and format J_usage_mask if J_mask_index is None: J_usage_masks.append(None) #default mask else: try: J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([j in pgen_model.J_mask_mapping for j in J_usage_mask]): J_usage_masks.append(J_usage_mask) else: print str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names" print 'Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if not j in pgen_model.J_mask_mapping.keys()]) print 'Exiting...' infile.close() return -1 except IndexError: #no index match for J_mask_index print 'J_mask_index is out of range' print 'Exiting...' infile.close() return -1 if max_number_of_seqs is not None: if len(seqs) >= max_number_of_seqs: break unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None] if len(unrecognized_seqs) > 0 and len(unrecognized_seqs) < len(seqs): if print_warnings or options.outfile_name is not None: print 'Some strings read in were not parsed as sequences -- they will be omitted.' print 'Examples of improperly read strings: ' for unrecognized_seq in unrecognized_seqs[:10]: print unrecognized_seq seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None] V_usage_masks = [V_usage_mask for i, V_usage_mask in enumerate(V_usage_masks) if seq_types[i] is not None] seq_types = [seq_type for seq_type in seq_types if seq_type is not None] elif len(unrecognized_seqs) > 0 and len(unrecognized_seqs) == len(seqs): print 'None of the read in strings were parsed as sequences. Check input file.' print 'Examples of improperly read strings:' for unrecognized_seq in unrecognized_seqs[:10]: print unrecognized_seq print 'Exiting...' 
return -1 infile.close() if options.outfile_name is not None: #OUTFILE SPECIFIED, allow printed info/display print 'Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.' if display_seqs: sys.stdout.write('\r'+'Continuing to Pgen computation in 3... ') sys.stdout.flush() time.sleep(0.4) sys.stdout.write('\r'+'Continuing to Pgen computation in 2... ') sys.stdout.flush() time.sleep(0.4) sys.stdout.write('\r'+'Continuing to Pgen computation in 1... ') sys.stdout.flush() time.sleep(0.4) else: print 'Continuing to Pgen computation.' print_warnings = True #Display is off, can print warnings if display_seqs: lines_for_display = [] times_for_speed_calc = [time.time()] outfile = open(outfile_name, 'w') start_time = time.time() for i, seq in enumerate(seqs): if seq_types[i] == 'aaseq': #Compute Pgen and print out c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) if seq_types[i] == 'regex': #Compute Pgen and print out c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_types[i] == 'ntseq': ntseq = seq if len(ntseq) % 3 == 0: #inframe sequence aaseq = nt2aa(ntseq) #Compute Pgen and print out based on recomb_type and seq_type_out if seq_type_out is None: c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_type_out == 'ntseq': c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_type_out == 'aaseq': c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) else: #out of frame 
sequence -- Pgens are 0 and use 'out_of_frame' for aaseq if seq_type_out is None: c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0' elif seq_type_out == 'ntseq': c_pgen_line = ntseq + delimiter_out + '0' elif seq_type_out == 'aaseq': c_pgen_line = 'out_of_frame' + delimiter_out + '0' outfile.write(c_pgen_line + '\n') #Print time update if display_seqs: cc_time = time.time() c_time = cc_time - start_time times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display] c_avg_speed = (len(times_for_speed_calc)-1)/float(times_for_speed_calc[0] - times_for_speed_calc[-1]) #eta = ((len(seqs) - (i+1))/float(i+1))*c_time eta = (len(seqs) - (i+1))/c_avg_speed lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display] c_time_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(c_time)/3600).rjust(3), repr((int(c_time)/60)%60).rjust(2), repr(int(c_time)%60).rjust(2)) eta_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(eta)/3600).rjust(3), repr((int(eta)/60)%60).rjust(2), repr(int(eta)%60).rjust(2)) time_str = 'Time to compute Pgen on %s seqs: %s \nEst. 
time for remaining %s seqs: %s'%(repr(i+1).rjust(9), c_time_str, repr(len(seqs) - (i + 1)).rjust(9), eta_str) speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(times_for_speed_calc[0] - times_for_speed_calc[-1]), 2)).rjust(8)) display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80 print '\033[2J' + display_str elif (i+1)%seqs_per_time_update == 0 and time_updates: c_time = time.time() - start_time eta = ((len(seqs) - (i+1))/float(i+1))*c_time if c_time > 86400: #more than a day c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 3600: #more than an hr c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 60: #more than a min c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60) else: c_time_str = '%.2f seconds.'%(c_time) if eta > 86400: #more than a day eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60) elif eta > 3600: #more than an hr eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60) elif eta > 60: #more than a min eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60) else: eta_str = '%.2f seconds.'%(eta) print 'Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str) c_time = time.time() - start_time if c_time > 86400: #more than a day c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 3600: #more than an hr c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60) elif c_time > 60: #more than a min c_time_str = '%d minutes and 
%.2f seconds.'%((int(c_time)/60)%60, c_time%60) else: c_time_str = '%.2f seconds.'%(c_time) print 'Completed Pgen computation for %d sequences: in %s'%(len(seqs), c_time_str) outfile.close() else: #NO OUTFILE -- print directly to stdout start_time = time.time() for i, seq in enumerate(seqs): if seq_types[i] == 'aaseq': #Compute Pgen and print out c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) if seq_types[i] == 'regex': #Compute Pgen and print out c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_types[i] == 'ntseq': ntseq = seq if len(ntseq) % 3 == 0: #inframe sequence aaseq = nt2aa(ntseq) #Compute Pgen and print out based on recomb_type and seq_type_out if seq_type_out is None: c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_type_out == 'ntseq': c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) elif seq_type_out == 'aaseq': c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq if seq_type_out is None: c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0' elif seq_type_out == 'ntseq': c_pgen_line = ntseq + delimiter_out + '0' elif seq_type_out == 'aaseq': c_pgen_line = 'out_of_frame' + delimiter_out + '0' print c_pgen_line
# Paths to the remaining default OLGA model files. Every path is built by plain
# string concatenation onto util.path_to_olga, so that prefix presumably ends
# with a path separator -- TODO confirm in util.
# NOTE(review): alpha_params_file_name and the beta_* paths used further down
# are not assigned in this stretch; presumably they are defined just above.

# human_T_alpha model files
alpha_marginals_file_name = util.path_to_olga + 'default_models/human_T_alpha/model_marginals.txt'
alpha_V_anchor_pos_file = util.path_to_olga + 'default_models/human_T_alpha/V_gene_CDR3_anchors.csv'
alpha_J_anchor_pos_file = util.path_to_olga + 'default_models/human_T_alpha/J_gene_CDR3_anchors.csv'

# mouse_T_beta model files
mus_beta_params_file_name = util.path_to_olga + 'default_models/mouse_T_beta/model_params.txt'
mus_beta_marginals_file_name = util.path_to_olga + 'default_models/mouse_T_beta/model_marginals.txt'
mus_beta_V_anchor_pos_file = util.path_to_olga + 'default_models/mouse_T_beta/V_gene_CDR3_anchors.csv'
mus_beta_J_anchor_pos_file = util.path_to_olga + 'default_models/mouse_T_beta/J_gene_CDR3_anchors.csv'

# human_B_heavy model files (not read within this stretch of the file;
# presumably consumed further down -- verify)
humanIg_params_file_name = util.path_to_olga + 'default_models/human_B_heavy/model_params.txt'
humanIg_marginals_file_name = util.path_to_olga + 'default_models/human_B_heavy/model_marginals.txt'
humanIg_V_anchor_pos_file = util.path_to_olga + 'default_models/human_B_heavy/V_gene_CDR3_anchors.csv'
humanIg_J_anchor_pos_file = util.path_to_olga + 'default_models/human_B_heavy/J_gene_CDR3_anchors.csv'

# Load models.
# For each chain: the genomic data object is filled from the params file plus
# the V and J anchor files, and the generative model from the marginals file.
beta_genomic_data = load_model.GenomicDataVDJ()
beta_genomic_data.load_igor_genomic_data(beta_params_file_name, beta_V_anchor_pos_file, beta_J_anchor_pos_file)
beta_generative_model = load_model.GenerativeModelVDJ()
beta_generative_model.load_and_process_igor_model(beta_marginals_file_name)

# Alpha-chain loading is currently disabled.
# NOTE(review): the disabled code uses the VDJ loader classes; if the alpha
# model is a VJ recombination model it would need GenomicDataVJ /
# GenerativeModelVJ instead -- confirm before re-enabling.
#alpha_genomic_data = load_model.GenomicDataVDJ()
#alpha_genomic_data.load_igor_genomic_data(alpha_params_file_name, alpha_V_anchor_pos_file, alpha_J_anchor_pos_file)
#alpha_generative_model = load_model.GenerativeModelVDJ()
#alpha_generative_model.load_and_process_igor_model(alpha_marginals_file_name)

# Mouse beta chain genomic data (the matching generative model is not loaded
# within this stretch of the file).
mus_beta_genomic_data = load_model.GenomicDataVDJ()
mus_beta_genomic_data.load_igor_genomic_data(mus_beta_params_file_name, mus_beta_V_anchor_pos_file, mus_beta_J_anchor_pos_file)
                     custom_pgen_model='universal_model')
# NOTE(review): the line above closes a constructor call that opens before this
# stretch of the file; presumably it creates `qm`, which is used below -- confirm.

# Two further selection models, each restored from its own load_dir and each
# pointed at the same 'universal_model' folder for its generation model.
qm0 = SoniaLeftposRightpos(
    load_dir='selection_models/emerson_frequency_leftright_1M',
    custom_pgen_model='universal_model')
qm1 = SoniaVJL(load_dir='selection_models/emerson_frequency_vjl_1M',
               custom_pgen_model='universal_model')

# load Evaluate model
# Build one OLGA VDJ generation-probability model from the same
# 'universal_model' folder: params + V/J anchor files feed the genomic data,
# the marginals file feeds the generative model.
main_folder = 'universal_model'
params_file_name = os.path.join(main_folder, 'model_params.txt')
marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')
genomic_data = olga_load_model.GenomicDataVDJ()
genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                    J_anchor_pos_file)
generative_model = olga_load_model.GenerativeModelVDJ()
generative_model.load_and_process_igor_model(marginals_file_name)
pgen_model = generation_probability.GenerationProbabilityVDJ(
    generative_model, genomic_data)

# One evaluator per selection model; all three are handed the same pgen_model
# object through custom_olga_model (presumably so each evaluator reuses it
# instead of loading its own copy -- verify against EvaluateModel.__init__).
ev = EvaluateModel(sonia_model=qm, custom_olga_model=pgen_model)
ev0 = EvaluateModel(sonia_model=qm0, custom_olga_model=pgen_model)
ev1 = EvaluateModel(sonia_model=qm1, custom_olga_model=pgen_model)

# evaluate ppost/pgen
# evaluate_seqs is unpacked into three values; only the third is kept for the
# ev0 and ev1 evaluators. `to_evalutate` (sic) is defined outside this stretch.
energy, pgen, ppost = ev.evaluate_seqs(to_evalutate)
_, _, ppost_left = ev0.evaluate_seqs(to_evalutate)
_, _, ppost_vjl = ev1.evaluate_seqs(to_evalutate)
def main(): """ Evaluate sequences.""" parser = OptionParser(conflict_handler="resolve") #specify model parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)') parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)') parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)') parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)') parser.add_option('--humanIGK', '--human_B_kappa', action='store_true', dest='humanIGK', default=False, help='use default human IGK model (B cell light kappa chain)') parser.add_option('--humanIGL', '--human_B_lambda', action='store_true', dest='humanIGL', default=False, help='use default human IGL model (B cell light lambda chain)') parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true', dest='mouseTRA', default=False, help='use default mouse TRA model (T cell alpha chain)') parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model') parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model') parser.add_option('--sonia_model', type='string', default = 'leftright', dest='model_type' ,help='specify model type: leftright or lengthpos, default is leftright') parser.add_option('--epochs', type='int', default = 30, dest='epochs' ,help='number of epochs for inference, default is 30') parser.add_option('--batch_size', type='int', default = 5000, dest='batch_size' ,help='size of batch for the stochastic gradient descent') 
parser.add_option('--validation_split', type='float', default = 0.2, dest='validation_split' ,help='fraction of sequences used for validation.') parser.add_option('--independent_genes', '--include_indep_genes', action='store_true', dest='independent_genes', default=False, help='Independent gene selection factors q_v*q_j. Deafult is joint q_vj') parser.add_option('--min_energy_clip', type='float', default=-5, dest='min_energy_clip', help='Set numerical lower bound to the values of -logQ, default is -5.') parser.add_option('--max_energy_clip', type='float', default=10, dest='max_energy_clip', help='Set numerical upper bound to the values of -logQ, default is 10.') #location of seqs parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).') parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', default=1, help='specifies V_masks are found in column INDEX in the input file. Default is 1.') parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', default=2, help='specifies J_masks are found in column INDEX in the input file. 
Default is 2.') # input output parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE') parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE') parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='evaluate for at most N sequences.') parser.add_option('-n', '--n_gen_seqs', type='int',metavar='N', dest='n_gen_seqs',default=0, help='sample n sequences from gen distribution.') parser.add_option('-g', '--infile_gen', dest = 'infile_gen',metavar='PATH/TO/FILE', help='read generated CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE') parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.') parser.add_option('--no_report', '--no_plot_report', action='store_false', dest='plot_report', default=True, help='Do not produce report plots of the inferred model.') #delimeters parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.") parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. 
Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.") parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'") parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.") parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.") parser.add_option('--seed', type='int',metavar='N', dest='seed', default = None, help='set seed for inference') (options, args) = parser.parse_args() #set seed if options.seed is not None: import tensorflow as tf np.random.seed(options.seed) tf.random.set_seed(options.seed) #Check that the model is specified properly main_folder = os.path.dirname(__file__) default_models = {} default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'] default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'] default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'] default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'] default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ'] default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'), 'VJ'] default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ'] if options.independent_genes: independent_genes=True joint_genes=False else: independent_genes=False joint_genes=True num_models_specified = sum([1 for x in 
list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)]) recompute_productive_norm=False if num_models_specified == 1: #exactly one model specified try: d_model = [x for x in default_models.keys() if getattr(options, x)][0] model_folder = default_models[d_model][0] recomb_type = default_models[d_model][1] except IndexError: if options.vdj_model_folder: #custom VDJ model specified recompute_productive_norm=True model_folder = options.vdj_model_folder recomb_type = 'VDJ' elif options.vj_model_folder: #custom VJ model specified recompute_productive_norm=True model_folder = options.vj_model_folder recomb_type = 'VJ' elif num_models_specified == 0: print('Need to indicate generative model.') print('Exiting...') return -1 elif num_models_specified > 1: print('Only specify one model') print('Exiting...') return -1 if options.max_energy_clip <= options.min_energy_clip : print('The clip for the higher energy must be strictly greater than the clip for the lower energy. 
') print('Exiting...') return -1 else : max_energy_clip = options.max_energy_clip min_energy_clip = options.min_energy_clip #Generative model specification -- note we'll probably change this syntax to #allow for arbitrary model file specification params_file_name = os.path.join(model_folder,'model_params.txt') marginals_file_name = os.path.join(model_folder,'model_marginals.txt') V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv') J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv') for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]: if not os.path.isfile(x): print('Cannot find: ' + x) print('Please check the files (and naming conventions) in the model folder ' + model_folder) print('Exiting...') return -1 #Load up model based on recomb_type #VDJ recomb case --- used for TCRB and IGH if recomb_type == 'VDJ': genomic_data = olga_load_model.GenomicDataVDJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = olga_load_model.GenerativeModelVDJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data) #VJ recomb case --- used for TCRA and light chain elif recomb_type == 'VJ': genomic_data = olga_load_model.GenomicDataVJ() genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file) generative_model = olga_load_model.GenerativeModelVJ() generative_model.load_and_process_igor_model(marginals_file_name) pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data) if options.infile_name is not None: infile_name = options.infile_name if not os.path.isfile(infile_name): print('Cannot find input file: ' + infile_name) print('Exiting...') return -1 if options.outfile_name is not None: outfile_name = options.outfile_name if os.path.isfile(outfile_name): if not input(outfile_name + ' 
already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']: print('Exiting...') return -1 #Parse delimiter delimiter = options.delimiter if delimiter is None: #Default case if options.infile_name is None: delimiter = '\t' elif infile_name.endswith('.tsv'): #parse TAB separated value file delimiter = '\t' elif infile_name.endswith('.csv'): #parse COMMA separated value file delimiter = ',' else: try: delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter] except KeyError: pass #Other string passed as the delimiter. #Parse delimiter_out delimiter_out = options.delimiter_out if delimiter_out is None: #Default case if delimiter is None: delimiter_out = '\t' else: delimiter_out = delimiter if options.outfile_name is None: pass elif outfile_name.endswith('.tsv'): #output TAB separated value file delimiter_out = '\t' elif outfile_name.endswith('.csv'): #output COMMA separated value file delimiter_out = ',' else: try: delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out] except KeyError: pass #Other string passed as the delimiter. #Parse gene_delimiter gene_mask_delimiter = options.gene_mask_delimiter if gene_mask_delimiter is None: #Default case gene_mask_delimiter = ',' if delimiter == ',': gene_mask_delimiter = ';' else: try: gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter] except KeyError: pass #Other string passed as the delimiter. 
#More options seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter) lines_to_skip = options.lines_to_skip #one method of skipping header comment_delimiter = options.comment_delimiter #another method of skipping header max_number_of_seqs = options.max_number_of_seqs V_mask_index = options.V_mask_index #Default is not conditioning on V identity J_mask_index = options.J_mask_index #Default is not conditioning on J identity skip_empty=True # skip empty lines if options.infile_name is None: #No infile specified -- args should be the input seqs print('ERROR: specify input file.') return -1 else: seqs = [] V_usage_masks = [] J_usage_masks = [] print('Read input file.') infile = open(infile_name, 'r') for i, line in enumerate(tqdm(infile)): if comment_delimiter is not None: #Default case -- no comments/header delimiter if line.startswith(comment_delimiter): #allow comments continue if i < lines_to_skip: continue if delimiter is None: #Default delimiter is any whitespace split_line = line.split('\n')[0].split() else: split_line = line.split('\n')[0].split(delimiter) #Find the seq try: seq = split_line[seq_in_index].strip() if len(seq.strip()) == 0: if skip_empty: continue else: seqs.append(seq) #keep the blank seq as a placeholder #seq_types.append('aaseq') else: seqs.append(seq) #seq_types.append(determine_seq_type(seq, aa_alphabet)) except IndexError: #no index match for seq if skip_empty and len(line.strip()) == 0: continue print('seq_in_index is out of range') print('Exiting...') infile.close() return -1 #Find and format V_usage_mask if V_mask_index is None: V_usage_masks.append(None) #default mask else: try: V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]): V_usage_masks.append(V_usage_mask) else: print(str(V_usage_mask) + " is not a usable V_usage_mask composed 
exclusively of recognized V gene/allele names") print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()])) print('Continuing but inference might be biased...') V_usage_masks.append(V_usage_mask) #infile.close() #return -1 except IndexError: #no index match for V_mask_index print('V_mask_index is out of range, check the delimeter.') print('Exiting...') infile.close() return -1 #Find and format J_usage_mask if J_mask_index is None: J_usage_masks.append(None) #default mask else: try: J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]): J_usage_masks.append(J_usage_mask) else: print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names") print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()])) print('Continuing but inference might be biased...') J_usage_masks.append(J_usage_mask) #infile.close() #return -1 except IndexError: #no index match for J_mask_index print('J_mask_index is out of range, check the delimeter.') print('Exiting...') infile.close() return -1 if max_number_of_seqs is not None: if len(seqs) >= max_number_of_seqs: break data_seqs=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))] #define number of gen_seqs: gen_seqs=[] n_gen_seqs=options.n_gen_seqs generate_sequences=False if options.infile_gen is None: generate_sequences=True if n_gen_seqs is 0: n_gen_seqs=np.max([int(3e5),3*len(data_seqs)]) else: seqs = [] V_usage_masks = [] J_usage_masks = [] print('Read file of generated seqs.') infile = open(options.infile_gen, 'r') for i, line in enumerate(tqdm(infile)): if comment_delimiter is not None: #Default case -- no comments/header delimiter 
if line.startswith(comment_delimiter): #allow comments continue if i < lines_to_skip: continue if delimiter is None: #Default delimiter is any whitespace split_line = line.split('\n')[0].split() else: split_line = line.split('\n')[0].split(delimiter) #Find the seq try: seq = split_line[seq_in_index].strip() if len(seq.strip()) == 0: if skip_empty: continue else: seqs.append(seq) #keep the blank seq as a placeholder #seq_types.append('aaseq') else: seqs.append(seq) #seq_types.append(determine_seq_type(seq, aa_alphabet)) except IndexError: #no index match for seq if skip_empty and len(line.strip()) == 0: continue print('seq_in_index is out of range') print('Exiting...') infile.close() return -1 #Find and format V_usage_mask if V_mask_index is None: V_usage_masks.append(None) #default mask else: try: V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]): V_usage_masks.append(V_usage_mask) else: print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names") print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()])) print('Continuing but inference might be biased...') V_usage_masks.append(V_usage_mask) #infile.close() #return -1 except IndexError: #no index match for V_mask_index print('V_mask_index is out of range, check the delimeter.') print('Exiting...') infile.close() return -1 #Find and format J_usage_mask if J_mask_index is None: J_usage_masks.append(None) #default mask else: try: J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter) #check that all V gene/allele names are recognized if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]): J_usage_masks.append(J_usage_mask) else: print(str(J_usage_mask) + " is not a 
usable J_usage_mask composed exclusively of recognized J gene/allele names") print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()])) print('Continuing but inference might be biased...') J_usage_masks.append(J_usage_mask) #infile.close() #return -1 except IndexError: #no index match for J_mask_index print('J_mask_index is out of range, check the delimeter.') print('Exiting...') infile.close() return -1 gen_seqs=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))] # combine sequences. print('Initialise Model.') # choose sonia model type if options.model_type=='leftright': sonia_model=SoniaLeftposRightpos(data_seqs=data_seqs, gen_seqs=gen_seqs, custom_pgen_model=model_folder, vj=recomb_type == 'VJ', include_joint_genes=joint_genes, include_indep_genes=independent_genes, min_energy_clip=min_energy_clip, max_energy_clip=max_energy_clip ) elif options.model_type=='lengthpos': sonia_model=SoniaLengthPos(data_seqs=data_seqs, gen_seqs=gen_seqs, custom_pgen_model=model_folder, vj=recomb_type == 'VJ', include_joint_genes=joint_genes, include_indep_genes=independent_genes, min_energy_clip=min_energy_clip, max_energy_clip=max_energy_clip ) else: print('ERROR: choose a model between leftright or lengthpos') if generate_sequences: sonia_model.add_generated_seqs(n_gen_seqs,custom_model_folder=model_folder) if recompute_productive_norm: sonia_model.norm_productive=pgen_model.compute_regex_CDR3_template_pgen('CX{0,}') print('Model initialised. 
Start inference') sonia_model.infer_selection(epochs=options.epochs,verbose=1,batch_size=options.batch_size,validation_split=options.validation_split) print('Save Model') if options.outfile_name is not None: #OUTFILE SPECIFIED sonia_model.save_model(options.outfile_name) if options.plot_report: from sonia.plotting import Plotter pl=Plotter(sonia_model) pl.plot_model_learning(os.path.join(options.outfile_name, 'model_learning.png')) pl.plot_vjl(os.path.join(options.outfile_name, 'marginals.png')) pl.plot_logQ(os.path.join(options.outfile_name, 'log_Q.png')) pl.plot_ratioQ(os.path.join(options.outfile_name, 'Q_ratio.png')) else: #print to stdout sonia_model.save_model('sonia_model') if options.plot_report: from sonia.plotting import Plotter pl=Plotter(sonia_model) pl.plot_model_learning(os.path.join('sonia_model', 'model_learning.png')) pl.plot_vjl(os.path.join('sonia_model', 'marginals.png')) pl.plot_logQ(os.path.join('sonia_model', 'log_Q.png')) pl.plot_ratioQ(os.path.join('sonia_model', 'Q_ratio.png'))
def _build_option_parser():
    """Return the OptionParser describing the command-line interface of main()."""
    parser = OptionParser(conflict_handler="resolve")

    # Built-in default models (exactly one model may be selected overall).
    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true',
                      dest='humanTRA', default=False,
                      help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true',
                      dest='humanTRB', default=False,
                      help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true',
                      dest='mouseTRB', default=False,
                      help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true',
                      dest='humanIGH', default=False,
                      help='use default human IGH model (B cell heavy chain)')

    # Custom model folders.
    parser.add_option('--VDJ_model_folder', dest='vdj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--VJ_model_folder', dest='vj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VJ generative model')

    # Output and generation settings.
    parser.add_option('-o', '--outfile', dest='outfile_name',
                      metavar='PATH/TO/FILE',
                      help='write CDR3 sequences to PATH/TO/FILE')
    parser.add_option('-n', '--num_seqs', type='float', metavar='N', default=0,
                      dest='num_seqs_to_generate',
                      help='specify the number of sequences to generate.')
    parser.add_option('--seed', type='int', dest='seed',
                      help='set seed for pseudorandom number generator. Default is to not set a seed.')
    parser.add_option('--seqs_per_time_update', type='float', default=100000,
                      dest='seqs_per_time_update',
                      help='specify the number of sequences between time updates. Default is 1e5')
    parser.add_option('--conserved_J_residues', type='string', default='FVW',
                      dest='conserved_J_residues',
                      help="specify conserved J residues. Default is 'FVW'.")
    parser.add_option('--time_updates_off', action='store_false',
                      dest='time_updates', default=True,
                      help='turn time updates off.')
    parser.add_option('--seq_type', type='choice', default='all', dest='seq_type',
                      choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'],
                      help="declare sequence type for output sequences. Choices: 'all' [default], 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'")
    parser.add_option('--record_genes_off', action='store_false',
                      dest="record_genes", default=True,
                      help='turn off recording V and J gene info.')
    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter',
                      choices=['tab', 'space', ',', ';', ':'],
                      help="declare delimiter choice. Default is tab for .tsv output files, comma for .csv files, and tab for all others. Choices: 'tab', 'space', ',', ';', ':'")
    # Shares dest with -d so an arbitrary string can be used as the delimiter.
    parser.add_option('--raw_delimiter', type='str', dest='delimiter',
                      help="declare delimiter choice as a raw string.")
    return parser


def _format_duration(seconds):
    """Render a duration in seconds as the human-readable text used in progress messages."""
    whole = int(seconds)
    days = whole // 86400
    hours = (whole // 3600) % 24
    minutes = (whole // 60) % 60
    remainder = seconds % 60
    if seconds > 86400:  #more than a day
        return '%d days, %d hours, %d minutes, and %.2f seconds.' % (
            days, hours, minutes, remainder)
    if seconds > 3600:  #more than an hr
        return '%d hours, %d minutes, and %.2f seconds.' % (
            hours, minutes, remainder)
    if seconds > 60:  #more than a min
        return '%d minutes and %.2f seconds.' % (minutes, remainder)
    return '%.2f seconds.' % (seconds)


def _format_output_line(generated, seq_type, record_genes, delimiter,
                        V_gene_names, J_gene_names):
    """Build one delimited output line from a (ntseq, aaseq, V_in, J_in) tuple."""
    ntseq, aaseq, V_in, J_in = generated
    if seq_type == 'all':  #default, include both ntseq and aaseq
        fields = [ntseq, aaseq]
    elif seq_type == 'ntseq':  #only record ntseq
        fields = [ntseq]
    else:  #only record aaseq
        fields = [aaseq]
    if record_genes:
        fields.extend((V_gene_names[V_in], J_gene_names[J_in]))
    return delimiter.join(fields)


def main(argv=None):
    """Generate CDR3 sequences from a generative model (command-line entry point).

    Parses the command-line options, checks that the selected model folder
    contains the four expected model files, loads the VDJ or VJ model and
    generates the requested number of sequences. Lines are written to the
    file given by -o/--outfile (with progress messages printed to stdout),
    or printed to stdout when no output file is given.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name. When None
        (the default) the arguments are taken from sys.argv, so existing
        callers of ``main()`` are unaffected.

    Returns
    -------
    int or None
        -1 when the options are invalid (no model or several models chosen,
        missing model folder/files, overwrite declined, non-positive or
        non-finite number of sequences); None after a successful run.
    """
    parser = _build_option_parser()
    options, _ = parser.parse_args(argv)

    main_folder = os.path.dirname(__file__)
    # Each entry is [model folder, recombination type].
    default_models = {
        'humanTRA': [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'],
        'humanTRB': [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'],
        'mouseTRB': [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'],
        'humanIGH': [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'],
    }

    chosen_defaults = [name for name in default_models if getattr(options, name)]
    chosen_custom = [flag for flag in ('vj_model_folder', 'vdj_model_folder')
                     if getattr(options, flag)]
    num_models_specified = len(chosen_defaults) + len(chosen_custom)

    if num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    if num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    #exactly one model specified
    if chosen_defaults:
        model_folder, recomb_type = default_models[chosen_defaults[0]]
    elif options.vdj_model_folder:  #custom VDJ model specified
        model_folder, recomb_type = options.vdj_model_folder, 'VDJ'
    else:  #custom VJ model specified
        model_folder, recomb_type = options.vj_model_folder, 'VJ'

    #Check that all model and genomic files exist in the indicated model folder
    if not os.path.isdir(model_folder):
        print('Check pathing... cannot find the model folder: ' + model_folder)
        print('Exiting...')
        return -1

    params_file_name = os.path.join(model_folder, 'model_params.txt')
    marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

    for required_file in (params_file_name, marginals_file_name,
                          V_anchor_pos_file, J_anchor_pos_file):
        if not os.path.isfile(required_file):
            print('Cannot find: ' + required_file)
            print('Please check the files (and naming conventions) in the model folder '
                  + model_folder)
            print('Exiting...')
            return -1

    outfile_name = options.outfile_name
    if outfile_name is not None and os.path.isfile(outfile_name):
        answer = input(outfile_name + ' already exists. Overwrite (y/n)? ')
        if answer.strip().lower() not in ['y', 'yes']:
            print('Exiting...')
            return -1

    #Parse arguments
    try:
        num_seqs_to_generate = int(options.num_seqs_to_generate)
    except (ValueError, OverflowError):
        # -n nan / -n inf parse as floats but cannot be converted to int;
        # treat them like a missing count instead of crashing.
        num_seqs_to_generate = 0
    if num_seqs_to_generate <= 0:
        print('Need to specify num_seqs (number of sequences to generate).')
        print('Exiting...')
        return -1

    #Parse default delimiter
    delimiter = options.delimiter
    if delimiter is None:
        delimiter = '\t'
        if outfile_name is not None and outfile_name.endswith('.csv'):
            delimiter = ','
    else:
        # Named choices are translated; any other value is used as a raw string.
        delimiter = {'tab': '\t', 'space': ' '}.get(delimiter, delimiter)

    #Optional flags
    seq_type = {
        'all': 'all',
        'ntseq': 'ntseq',
        'nucleotide': 'ntseq',
        'aaseq': 'aaseq',
        'amino_acid': 'aaseq'
    }[options.seq_type]
    record_genes = options.record_genes
    try:
        seqs_per_time_update = int(options.seqs_per_time_update)
    except (ValueError, OverflowError):
        seqs_per_time_update = 0
    # An interval of 0 would make the modulo below raise ZeroDivisionError,
    # so it simply disables the progress messages.
    report_progress = options.time_updates and seqs_per_time_update != 0
    conserved_J_residues = options.conserved_J_residues

    if options.seed is not None:
        np.random.seed(options.seed)

    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVDJ(
            generative_model, genomic_data)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVJ(
            generative_model, genomic_data)

    # Gene names are reported without the allele suffix (text after '*').
    V_gene_names = [V[0].split('*')[0] for V in genomic_data.genV]
    J_gene_names = [J[0].split('*')[0] for J in genomic_data.genJ]

    if outfile_name is None:  #print to stdout
        for _ in range(num_seqs_to_generate):
            print(_format_output_line(
                seq_gen.gen_rnd_prod_CDR3(conserved_J_residues), seq_type,
                record_genes, delimiter, V_gene_names, J_gene_names))
        return

    # The context manager closes the file even if generation fails midway.
    with open(outfile_name, 'w') as outfile:
        print('Starting sequence generation... ')
        start_time = time.time()
        for i in range(num_seqs_to_generate):
            outfile.write(_format_output_line(
                seq_gen.gen_rnd_prod_CDR3(conserved_J_residues), seq_type,
                record_genes, delimiter, V_gene_names, J_gene_names) + '\n')

            if report_progress and (i + 1) % seqs_per_time_update == 0:
                c_time = time.time() - start_time
                # Linear extrapolation from the average time per sequence so far.
                eta = ((num_seqs_to_generate - (i + 1)) / float(i + 1)) * c_time
                print('%d sequences generated in %s Estimated time remaining: %s'
                      % (i + 1, _format_duration(c_time), _format_duration(eta)))

    c_time = time.time() - start_time
    print('Completed generating all %d sequences in %s' %
          (num_seqs_to_generate, _format_duration(c_time)))
import os

import olga.load_model as load_model
import olga.generation_probability as pgen
import olga.sequence_generation as seq_gen  # NOTE(review): not used in this example

#%%
# Example: load the default human TRB (VDJ) model and compute generation
# probabilities (pgen) for an amino acid CDR3 sequence.

# Folder of the installed olga package. It is derived from the imported module
# so the example does not depend on one machine's site-packages location.
# Joining with '' keeps the trailing path separator.
path = os.path.join(os.path.dirname(os.path.abspath(load_model.__file__)), '')

#Define the files for loading in generative model/data
model_folder = os.path.join(path, 'default_models', 'human_T_beta')
params_file_name = os.path.join(model_folder, 'model_params.txt')
marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

#Load data
genomic_data = load_model.GenomicDataVDJ()
genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)

#Load model
generative_model = load_model.GenerativeModelVDJ()
generative_model.load_and_process_igor_model(marginals_file_name)

#Process model/data for pgen computation by instantiating GenerationProbabilityVDJ
pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)

#example
# NOTE(review): the two results below are not stored or printed; they are only
# visible if the interactive front end echoes expression values.
#calculating pgen with restriction to V, J gene usage
pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF', 'TRBV30*01', 'TRBJ1-2*01')

#calculating pgen without restriction to V, J gene usage
pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF')