def gen_rnd_prod_CDR3(self, conserved_J_residues='FVW'):
    """Generate a productive CDR3 seq from a Monte Carlo draw of the model.

    Recombination events are redrawn until the assembled sequence is in
    frame, has no stop codon, starts with 'C' and ends with one of
    conserved_J_residues. There is no cap on the number of redraws, so the
    call only returns once a productive sequence has been found.

    Parameters
    ----------
    conserved_J_residues : str, optional
        Conserved amino acid residues defining the CDR3 on the J side (normally
        F, V, and/or W)

    Returns
    -------
    ntseq : str
        Productive CDR3 nucleotide sequence
    aaseq : str
        CDR3 amino acid sequence (aaseq = nt2aa(ntseq))
    V_choice : int
        Index of V allele chosen to generate the CDR3 seq
    J_choice : int
        Index of J allele chosen to generate the CDR3 seq

    """
    # NOTE: the original loop condition was `while ~coding_pass`. Bitwise
    # NOT of a bool is always truthy (~False == -1, ~True == -2), so the
    # flag never controlled the loop; the only exit is the return below.
    while True:
        recomb_events = self.choose_random_recomb_events()
        V_seq = self.cutV_genomic_CDR3_segs[recomb_events['V']]

        #This both checks that the position of the conserved C is
        #identified and that the V isn't fully deleted out of the CDR3
        #region
        if len(V_seq) <= max(recomb_events['delV'], 0):
            continue

        J_seq = self.cutJ_genomic_CDR3_segs[recomb_events['J']]

        #We check that J isn't deleted more than allowed. Note the
        #generative model really should reflect this structure already
        if len(J_seq) < recomb_events['delJ']:
            continue

        #Apply the deletions: trim the end of V and the start of J
        V_seq = V_seq[:len(V_seq) - recomb_events['delV']]
        J_seq = J_seq[recomb_events['delJ']:]

        #Reject out-of-frame draws before generating the insertion
        if (len(V_seq) + len(J_seq) + recomb_events['insVJ']) % 3 != 0:
            continue

        insVJ_seq = rnd_ins_seq(recomb_events['insVJ'], self.C_Rvj,
                                self.C_first_nt_bias_insVJ)

        #Translate to amino acid sequence, see if productive
        ntseq = V_seq + insVJ_seq + J_seq
        aaseq = nt2aa(ntseq)

        if ('*' not in aaseq and aaseq[0] == 'C'
                and aaseq[-1] in conserved_J_residues):
            return ntseq, aaseq, recomb_events['V'], recomb_events['J']
def add_generated_seqs(self, num_gen_seqs = 0, reset_gen_seqs = True, custom_model_folder = None, add_error=False,custom_error=None):
    """Generates MonteCarlo sequences for gen_seqs using OLGA.

    Only generates seqs from a V(D)J model. Requires the OLGA package
    (pip install olga).

    Parameters
    ----------
    num_gen_seqs : int or float
        Number of MonteCarlo sequences to generate and add to the specified
        sequence pool.
    reset_gen_seqs : bool
        If True, self.gen_seqs is emptied before the new sequences are
        added. Default is True.
    custom_model_folder : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt'
        and 'model_marginals.txt'
    add_error : bool
        Simulate sequencing error. Default is False.
    custom_error : float
        Custom error rate for sequencing error. Default (None) uses the
        value on the last non-empty line of 'model_params.txt'.

    Returns
    -------
    None
        Also returned early, after printing a message, when the model
        files cannot be found.

    Attributes set
    --------------
    error_rate : float
        Error rate used when add_error is True.
    gen_seqs : list
        MonteCarlo sequences drawn from a VDJ recomb model
    gen_seq_features : list
        Features gen_seqs have been projected onto.

    """
    #Load generative model
    if custom_model_folder is not None:
        main_folder = custom_model_folder
    else:
        # custom_pgen_model may not be set on every instance; treat a
        # missing attribute like None instead of swallowing all errors.
        main_folder = getattr(self, 'custom_pgen_model', None)
        if main_folder is None:
            main_folder = os.path.join(os.path.dirname(__file__), 'default_models', self.chain_type)

    params_file_name = os.path.join(main_folder,'model_params.txt')
    marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

    if not os.path.isfile(params_file_name) or not os.path.isfile(marginals_file_name):
        print('Cannot find specified custom generative model files: ' + '\n' + params_file_name + '\n' + marginals_file_name)
        print('Exiting sequence generation...')
        return None

    #Fall back to the anchor files shipped next to olga_load_model
    olga_models_folder = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type)
    if not os.path.isfile(V_anchor_pos_file):
        V_anchor_pos_file = os.path.join(olga_models_folder, 'V_gene_CDR3_anchors.csv')
    if not os.path.isfile(J_anchor_pos_file):
        J_anchor_pos_file = os.path.join(olga_models_folder, 'J_gene_CDR3_anchors.csv')

    if custom_error is not None:
        self.error_rate = custom_error
    else:
        #The error rate is taken from the last non-empty line of the params file
        with open(params_file_name, 'r') as params_file:
            params_lines = params_file.read().splitlines()
        last_entry = next((line for line in reversed(params_lines) if line), None)
        if last_entry is None:
            raise ValueError('No error rate found in ' + params_file_name)
        self.error_rate = float(last_entry)

    if self.vj:
        genomic_data = olga_load_model.GenomicDataVJ()
        generative_model = olga_load_model.GenerativeModelVJ()
        seq_gen_class = seq_gen.SequenceGenerationVJ
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        generative_model = olga_load_model.GenerativeModelVDJ()
        seq_gen_class = seq_gen.SequenceGenerationVDJ
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = seq_gen_class(generative_model, genomic_data)

    #Generate sequences
    print('Generate sequences.')
    if add_error:
        #Only needed on the error-simulation path
        from sonia.utils import add_random_error
        from olga.utils import nt2aa

    # All sequences are drawn first and errors are applied afterwards, so
    # the order in which random numbers are consumed is unchanged.
    draws = [sg_model.gen_rnd_prod_CDR3(conserved_J_residues='ABCEDFGHIJKLMNOPQRSTUVWXYZ')
             for _ in tqdm(range(int(num_gen_seqs)))]

    seqs = []
    for draw in draws:
        if add_error:
            cdr3 = nt2aa(add_random_error(draw[0], self.error_rate))
        else:
            cdr3 = draw[1]
        #Gene names are reported without the allele suffix after '*'
        seqs.append([cdr3,
                     genomic_data.genV[draw[2]][0].split('*')[0],
                     genomic_data.genJ[draw[3]][0].split('*')[0]])

    if reset_gen_seqs: #reset gen_seqs if needed
        self.gen_seqs = []

    #Add to specified pool(s)
    self.update_model(add_gen_seqs = seqs)
class _InputFormatError(Exception):
    """Raised when a line of the input file cannot be parsed as requested."""


def _named_delimiter(value):
    """Map a delimiter keyword ('tab', 'space', ...) to its character; other strings pass through."""
    return {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}.get(value, value)


def _format_elapsed(seconds):
    """Format a duration in seconds using the largest applicable unit (days/hours/minutes/seconds)."""
    whole = int(seconds)
    if seconds > 86400: #more than a day
        return '%d days, %d hours, %d minutes, and %.2f seconds.'%(whole//86400, (whole//3600)%24, (whole//60)%60, seconds%60)
    if seconds > 3600: #more than an hr
        return '%d hours, %d minutes, and %.2f seconds.'%((whole//3600)%24, (whole//60)%60, seconds%60)
    if seconds > 60: #more than a min
        return '%d minutes and %.2f seconds.'%((whole//60)%60, seconds%60)
    return '%.2f seconds.'%(seconds)


def _format_hms_padded(seconds):
    """Format a duration as right-justified whole hours, minutes and seconds for the live display."""
    whole = int(seconds)
    return '%s hours, %s minutes, and %s seconds.'%(repr(whole//3600).rjust(3), repr((whole//60)%60).rjust(2), repr(whole%60).rjust(2))


def _filter_argument_gene_mask(mask_option, mask_mapping, gene_label):
    """Split a comma separated --v_mask/--j_mask value and keep only genes present in mask_mapping."""
    if mask_option is None:
        return None #Default is None, i.e. not conditioning on gene identity
    genes = mask_option.split(',')
    unrecognized = [gene for gene in genes if gene not in mask_mapping]
    recognized = [gene for gene in genes if gene in mask_mapping]
    if len(unrecognized) > 0:
        print('These ' + gene_label + ' genes/alleles are not recognized: ' + ', '.join(unrecognized))
    if len(recognized) == 0:
        print('No recognized ' + gene_label + ' genes/alleles in the provided ' + gene_label + '_mask. Continuing without conditioning on ' + gene_label + ' usage.')
        return None
    return recognized


def _usage_mask_from_columns(split_line, mask_index, gene_mask_delimiter, mask_mapping, gene_label):
    """Read one gene usage mask from a split input line, raising _InputFormatError if unusable."""
    if mask_index is None:
        return None #default mask
    try:
        usage_mask = split_line[mask_index].strip().split(gene_mask_delimiter)
    except IndexError: #no index match for the mask column
        raise _InputFormatError(gene_label + '_mask_index is out of range')
    #check that all gene/allele names are recognized
    unrecognized = [gene for gene in usage_mask if gene not in mask_mapping]
    if len(unrecognized) > 0:
        raise _InputFormatError(
            str(usage_mask) + " is not a usable " + gene_label + "_usage_mask composed exclusively of recognized " + gene_label + " gene/allele names"
            + '\n' + 'Unrecognized ' + gene_label + ' gene/allele names: ' + ', '.join(unrecognized))
    return usage_mask


def _read_input_rows(infile, pgen_model, aa_alphabet, delimiter, gene_mask_delimiter,
                     seq_in_index, V_mask_index, J_mask_index, lines_to_skip,
                     comment_delimiter, skip_empty, max_number_of_seqs):
    """Parse sequences, their types and V/J usage masks from an open input file."""
    seqs = []
    seq_types = []
    V_usage_masks = []
    J_usage_masks = []
    for i, line in enumerate(infile):
        if comment_delimiter is not None and line.startswith(comment_delimiter):
            continue #allow comments
        if i < lines_to_skip:
            continue
        if delimiter is None: #Default delimiter is any whitespace
            split_line = line.split()
        else:
            split_line = line.split(delimiter)

        #Find the seq
        try:
            seq = split_line[seq_in_index].strip()
        except IndexError: #no index match for seq
            if skip_empty and len(line.strip()) == 0:
                continue
            raise _InputFormatError('seq_in_index is out of range')
        if len(seq) == 0:
            if skip_empty:
                continue
            seq_type = 'aaseq' #keep the blank seq as a placeholder
        else:
            seq_type = determine_seq_type(seq, aa_alphabet)

        #Find and format the usage masks
        V_usage_mask = _usage_mask_from_columns(split_line, V_mask_index, gene_mask_delimiter, pgen_model.V_mask_mapping, 'V')
        J_usage_mask = _usage_mask_from_columns(split_line, J_mask_index, gene_mask_delimiter, pgen_model.J_mask_mapping, 'J')

        seqs.append(seq)
        seq_types.append(seq_type)
        V_usage_masks.append(V_usage_mask)
        J_usage_masks.append(J_usage_mask)

        if max_number_of_seqs is not None and len(seqs) >= max_number_of_seqs:
            break
    return seqs, seq_types, V_usage_masks, J_usage_masks


def _pgen_output_line(pgen_model, seq, seq_type, V_usage_mask, J_usage_mask,
                      seq_type_out, delimiter_out, print_warnings):
    """Build one delimited output line (sequence(s) and Pgen(s)) for a single input sequence."""
    if seq_type == 'aaseq':
        return seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_mask, J_usage_mask, print_warnings))
    if seq_type == 'regex':
        return seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_mask, J_usage_mask, print_warnings))
    if seq_type != 'ntseq':
        raise ValueError('Unrecognized sequence type: ' + repr(seq_type))

    ntseq = seq
    if len(ntseq) % 3 == 0: #inframe sequence
        aaseq = nt2aa(ntseq)
        if seq_type_out is None:
            return (ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_mask, J_usage_mask, print_warnings))
                    + delimiter_out + aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_mask, J_usage_mask, print_warnings)))
        if seq_type_out == 'ntseq':
            return ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_mask, J_usage_mask, print_warnings))
        return aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_mask, J_usage_mask, print_warnings))

    #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq
    if seq_type_out is None:
        return ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0'
    if seq_type_out == 'ntseq':
        return ntseq + delimiter_out + '0'
    return 'out_of_frame' + delimiter_out + '0'


def main():
    """Compute Pgens from a file and output to another file.

    Command line entry point. Options are read from sys.argv with optparse.
    Exactly one generative model (a default model flag or a custom VJ/VDJ
    model folder) must be selected. Sequences are taken from the positional
    arguments, or from the file given with -i/--infile, and results are
    printed to stdout or written to the file given with -o/--outfile.

    Returns
    -------
    -1 when the options or input are unusable (a message is printed first),
    otherwise None.

    """
    parser = OptionParser(conflict_handler="resolve")

    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')

    parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE')
    parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).')

    parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX in the input file. Default is no V mask.')
    parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX in the input file. Default is no J mask.')

    parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on for seqs read in as arguments.')
    parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on for seqs read in as arguments.')

    parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.')
    parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.')
    parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.")
    parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out', choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'")
    # store_false: skip_empty defaults to True, so the flag must turn it off
    # (it was store_true, which made the flag a no-op).
    parser.add_option('--skip_off','--skip_empty_off', action='store_false', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).')

    parser.add_option('--display_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in write-to-file mode). Default is on.')
    parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. Also used to determine the number of sequences to average over for speed and time estimates.')
    parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).')
    parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 100.')

    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.")
    parser.add_option('--delimiter_out', type='choice', dest='delimiter_out', choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.")
    parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter', choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.")
    parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.")

    (options, args) = parser.parse_args()

    #Check that the model is specified properly
    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']

    chosen_defaults = [name for name in default_models if getattr(options, name)]
    chosen_customs = [dest for dest in ('vdj_model_folder', 'vj_model_folder') if getattr(options, dest)]
    num_models_specified = len(chosen_defaults) + len(chosen_customs)

    if num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    if num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    #exactly one model specified
    if chosen_defaults:
        model_folder, recomb_type = default_models[chosen_defaults[0]]
    elif options.vdj_model_folder: #custom VDJ model specified
        model_folder = options.vdj_model_folder
        recomb_type = 'VDJ'
    else: #custom VJ model specified
        model_folder = options.vj_model_folder
        recomb_type = 'VJ'

    #Check that all model and genomic files exist in the indicated model folder
    if not os.path.isdir(model_folder):
        print('Check pathing... cannot find the model folder: ' + model_folder)
        print('Exiting...')
        return -1

    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
        if not os.path.isfile(x):
            print('Cannot find: ' + x)
            print('Please check the files (and naming conventions) in the model folder ' + model_folder)
            print('Exiting...')
            return -1

    alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified
    if alphabet_filename is not None and not os.path.isfile(alphabet_filename):
        # Report the alphabet file itself (the message used to reference
        # infile_name, which is not assigned yet at this point).
        print('Cannot find custom alphabet file: ' + alphabet_filename)
        print('Exiting...')
        return -1

    #Load up model based on recomb_type
    if recomb_type == 'VDJ': #VDJ recomb case --- used for TCRB and IGH
        genomic_data = load_model.GenomicDataVDJ()
        generative_model = load_model.GenerativeModelVDJ()
        pgen_class = generation_probability.GenerationProbabilityVDJ
    else: #VJ recomb case --- used for TCRA and light chain
        genomic_data = load_model.GenomicDataVJ()
        generative_model = load_model.GenerativeModelVJ()
        pgen_class = generation_probability.GenerationProbabilityVJ
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
    generative_model.load_and_process_igor_model(marginals_file_name)
    pgen_model = pgen_class(generative_model, genomic_data, alphabet_filename)

    aa_alphabet = ''.join(pgen_model.codons_dict.keys())

    infile_name = options.infile_name
    if infile_name is not None and not os.path.isfile(infile_name):
        print('Cannot find input file: ' + infile_name)
        print('Exiting...')
        return -1

    outfile_name = options.outfile_name
    if outfile_name is not None and os.path.isfile(outfile_name):
        try:
            read_answer = raw_input #Python 2
        except NameError:
            read_answer = input #Python 3
        if read_answer(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() not in ['y', 'yes']:
            print('Exiting...')
            return -1

    #Parse delimiter
    delimiter = options.delimiter
    if delimiter is None: #Default case
        if infile_name is None:
            delimiter = '\t'
        elif infile_name.endswith('.tsv'): #parse TAB separated value file
            delimiter = '\t'
        elif infile_name.endswith('.csv'): #parse COMMA separated value file
            delimiter = ','
        #otherwise stays None: split on any whitespace
    else:
        delimiter = _named_delimiter(delimiter)

    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        delimiter_out = '\t' if delimiter is None else delimiter
        if outfile_name is None:
            pass
        elif outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        delimiter_out = _named_delimiter(delimiter_out)

    #Parse gene_delimiter
    gene_mask_delimiter = options.gene_mask_delimiter
    if gene_mask_delimiter is None: #Default case
        gene_mask_delimiter = ';' if delimiter == ',' else ','
    else:
        gene_mask_delimiter = _named_delimiter(gene_mask_delimiter)

    #More options
    time_updates = options.time_updates
    display_seqs = options.display_seqs
    num_lines_for_display = options.num_lines_for_display
    seqs_per_time_update = options.seqs_per_time_update
    seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both
    if seq_type_out is not None:
        seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out]

    if infile_name is None: #No infile specified -- args should be the input seqs
        print_warnings = True
        seq_types = [determine_seq_type(seq, aa_alphabet) for seq in args]
        unrecognized_seqs = [seq for seq, seq_type in zip(args, seq_types) if seq_type is None]
        if len(unrecognized_seqs) > 0:
            print('The following sequences/arguments were not recognized: ' + ', '.join(unrecognized_seqs))
        recognized = [(seq, seq_type) for seq, seq_type in zip(args, seq_types) if seq_type is not None]

        #Format V and J masks -- uniform for all argument input sequences
        V_mask = _filter_argument_gene_mask(options.V_mask, pgen_model.V_mask_mapping, 'V')
        J_mask = _filter_argument_gene_mask(options.J_mask, pgen_model.J_mask_mapping, 'J')

        print('')
        start_time = time.time()
        for seq, seq_type in recognized:
            if seq_type == 'aaseq':
                c_pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                print('Pgen of the amino acid sequence ' + seq + ': ' + str(c_pgen))
                print('')
            elif seq_type == 'regex':
                c_pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings)
                print('Pgen of the regular expression sequence ' + seq + ': ' + str(c_pgen))
                print('')
            elif seq_type == 'ntseq':
                if seq_type_out is None or seq_type_out == 'ntseq':
                    c_pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                    print('Pgen of the nucleotide sequence ' + seq + ': ' + str(c_pgen_nt))
                if seq_type_out is None or seq_type_out == 'aaseq':
                    c_pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings)
                    print('Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(c_pgen_aa))
                print('')

        print('Completed pgen computation in: ' + _format_elapsed(time.time() - start_time))
        return None

    #Read sequences in from file
    print_warnings = False #Most cases of reading in from file should have warnings disabled
    try:
        with open(infile_name, 'r') as infile:
            seqs, seq_types, V_usage_masks, J_usage_masks = _read_input_rows(
                infile, pgen_model, aa_alphabet, delimiter, gene_mask_delimiter,
                options.seq_in_index, options.V_mask_index, options.J_mask_index,
                options.lines_to_skip, options.comment_delimiter,
                options.skip_empty, options.max_number_of_seqs)
    except _InputFormatError as err:
        print(err)
        print('Exiting...')
        return -1

    unrecognized_seqs = [seq for seq, seq_type in zip(seqs, seq_types) if seq_type is None]
    if len(unrecognized_seqs) > 0 and len(unrecognized_seqs) < len(seqs):
        if print_warnings or outfile_name is not None:
            print('Some strings read in were not parsed as sequences -- they will be omitted.')
            print('Examples of improperly read strings: ')
            for unrecognized_seq in unrecognized_seqs[:10]:
                print(unrecognized_seq)
        # Drop the unparsed rows from every parallel list so sequences and
        # masks stay aligned (J_usage_masks used to be left unfiltered).
        keep = [seq_type is not None for seq_type in seq_types]
        seqs = [seq for seq, kept in zip(seqs, keep) if kept]
        V_usage_masks = [mask for mask, kept in zip(V_usage_masks, keep) if kept]
        J_usage_masks = [mask for mask, kept in zip(J_usage_masks, keep) if kept]
        seq_types = [seq_type for seq_type in seq_types if seq_type is not None]
    elif len(unrecognized_seqs) > 0 and len(unrecognized_seqs) == len(seqs):
        print('None of the read in strings were parsed as sequences. Check input file.')
        print('Examples of improperly read strings:')
        for unrecognized_seq in unrecognized_seqs[:10]:
            print(unrecognized_seq)
        print('Exiting...')
        return -1

    if outfile_name is None: #NO OUTFILE -- print directly to stdout
        for i, seq in enumerate(seqs):
            print(_pgen_output_line(pgen_model, seq, seq_types[i], V_usage_masks[i], J_usage_masks[i],
                                    seq_type_out, delimiter_out, print_warnings))
        return None

    #OUTFILE SPECIFIED, allow printed info/display
    print('Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.')
    if display_seqs:
        for countdown in ('3', '2', '1'):
            sys.stdout.write('\r'+'Continuing to Pgen computation in ' + countdown + '... ')
            sys.stdout.flush()
            time.sleep(0.4)
        lines_for_display = []
        times_for_speed_calc = [time.time()]
    else:
        print('Continuing to Pgen computation.')
        print_warnings = True #Display is off, can print warnings

    with open(outfile_name, 'w') as outfile:
        start_time = time.time()
        for i, seq in enumerate(seqs):
            c_pgen_line = _pgen_output_line(pgen_model, seq, seq_types[i], V_usage_masks[i], J_usage_masks[i],
                                            seq_type_out, delimiter_out, print_warnings)
            outfile.write(c_pgen_line + '\n')

            #Print time update
            if display_seqs:
                cc_time = time.time()
                c_time = cc_time - start_time
                times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display]
                # Clamp the window so identical timestamps (coarse clocks)
                # cannot cause a division by zero mid-run.
                window = max(times_for_speed_calc[0] - times_for_speed_calc[-1], 1e-9)
                c_avg_speed = (len(times_for_speed_calc)-1)/float(window)
                eta = (len(seqs) - (i+1))/c_avg_speed

                lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display]

                time_str = 'Time to compute Pgen on %s seqs: %s \nEst. time for remaining %s seqs: %s'%(repr(i+1).rjust(9), _format_hms_padded(c_time), repr(len(seqs) - (i + 1)).rjust(9), _format_hms_padded(eta))
                speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(window), 2)).rjust(8))
                display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80
                print('\033[2J' + display_str)
            elif (i+1)%seqs_per_time_update == 0 and time_updates:
                c_time = time.time() - start_time
                eta = ((len(seqs) - (i+1))/float(i+1))*c_time
                print('Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, _format_elapsed(c_time), _format_elapsed(eta)))

        c_time = time.time() - start_time
        print('Completed Pgen computation for %d sequences: in %s'%(len(seqs), _format_elapsed(c_time)))
    return None
def gen_rnd_prod_CDR3(
    self,
    V=None,
    J=None,
    conserved_J_residues='FVW',
    max_attempts=20,
):
    """Generate a productive CDR3 seq from a Monte Carlo draw of the model.

    Recombination events are redrawn until the assembled sequence is in
    frame, has no stop codon, starts with 'C' and ends with one of
    conserved_J_residues, or until max_attempts draws have been made.

    Parameters
    ----------
    V : optional
        When not None, recombination events are drawn with
        self.choose_directed_recomb_events(V=V, J=J) instead of
        self.choose_random_recomb_events(). The expected type is whatever
        choose_directed_recomb_events accepts (not visible here).
    J : optional
        Forwarded to choose_directed_recomb_events together with V. It is
        ignored when V is None.
    conserved_J_residues : str, optional
        Conserved amino acid residues defining the CDR3 on the J side (normally
        F, V, and/or W)
    max_attempts : int, optional
        Maximum number of recombination draws before giving up. Default is 20.

    Returns
    -------
    None if no productive sequence was found within max_attempts draws,
    otherwise a tuple of:

    ntseq : str
        Productive CDR3 nucleotide sequence
    aaseq : str
        CDR3 amino acid sequence (aaseq = nt2aa(ntseq))
    V_choice : int
        Index of V allele chosen to generate the CDR3 seq
    J_choice : int
        Index of J allele chosen to generate the CDR3 seq
    recomb_events : dict
        The full set of recombination events of the accepted draw

    """
    # NOTE: the original condition was `~coding_pass and counter < 20`.
    # Bitwise NOT of a bool is always truthy, so only the counter ever
    # bounded the loop; a plain bounded loop states that directly.
    for _ in range(max_attempts):
        if V is not None:
            recomb_events = self.choose_directed_recomb_events(V=V, J=J)
        else:
            recomb_events = self.choose_random_recomb_events()

        V_seq = self.cutV_genomic_CDR3_segs[recomb_events['V']]

        #This both checks that the position of the conserved C is
        #identified and that the V isn't fully deleted out of the CDR3
        #region
        if len(V_seq) <= max(recomb_events['delV'], 0):
            continue

        D_seq = self.cutD_genomic_CDR3_segs[recomb_events['D']]
        J_seq = self.cutJ_genomic_CDR3_segs[recomb_events['J']]

        #We check that the D and J aren't deleted more than allowed. Note
        #the generative model really should reflect this structure already
        if (len(D_seq) < (recomb_events['delDl'] + recomb_events['delDr'])
                or len(J_seq) < recomb_events['delJ']):
            continue

        #Apply the deletions to each gene segment
        V_seq = V_seq[:len(V_seq) - recomb_events['delV']]
        D_seq = D_seq[recomb_events['delDl']:len(D_seq) - recomb_events['delDr']]
        J_seq = J_seq[recomb_events['delJ']:]

        #Reject out-of-frame draws before generating the insertions
        if (len(V_seq) + len(D_seq) + len(J_seq) + recomb_events['insVD']
                + recomb_events['insDJ']) % 3 != 0:
            continue

        insVD_seq = rnd_ins_seq(recomb_events['insVD'], self.C_Rvd,
                                self.C_first_nt_bias_insVD)
        insDJ_seq = rnd_ins_seq(recomb_events['insDJ'], self.C_Rdj,
                                self.C_first_nt_bias_insDJ)[::-1] #have to reverse the DJ seq

        #Translate to amino acid sequence, see if productive
        ntseq = V_seq + insVD_seq + D_seq + insDJ_seq + J_seq
        aaseq = nt2aa(ntseq)

        if ('*' not in aaseq and aaseq[0] == 'C'
                and aaseq[-1] in conserved_J_residues):
            return (ntseq, aaseq, recomb_events['V'], recomb_events['J'],
                    recomb_events)

    #No productive CDR3 found within max_attempts draws
    return None