def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in ``seq_path`` against the template alignment.

    The template alignment is read from ``self.Params['template_filepath']``
    (with '.' characters replaced by '-' and sequences upper-cased), the
    candidates are aligned with ``pynast_seqs`` and the results are
    converted to ``DNASequence`` objects.

    Parameters
    ----------
    seq_path : str
        Path of the FASTA file holding the candidate sequences.
    result_path : str, optional
        When given, the aligned sequences are written there as FASTA and
        ``None`` is returned.
    log_path : str, optional
        Passed to ``NastLogger``.
    failure_path : str, optional
        When given, the sequences reported as failed by ``pynast_seqs`` are
        written there as FASTA.

    Returns
    -------
    Alignment or None
        The aligned sequences when ``result_path`` is ``None``; otherwise
        ``None`` (the alignment has been written to ``result_path``).
    """
    # The candidate file must stay open until pynast_seqs() has consumed
    # the records -- parse_fasta may well read lazily (assumption; keeping
    # the handle open for the whole call is correct either way).
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # Re-wrap the PyNAST results as DNASequence objects
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None
    else:
        return pynast_aligned
def make_single_mutant(sequence, wt_res, res_num, mut_res, first_res=1):
    """
    Design forward and reverse primers for an amino acid point mutant.

    The DNA sequence is translated with ``genetic_code(11)``, the wildtype
    residue is verified, and a mutant codon is chosen from the synonyms of
    ``mut_res``. A codon one base away from the wildtype codon is preferred;
    if none exists a warning is printed and ``make_mutant`` picks the codon.
    The primer starts as the mutant codon plus up to 11 flanking bases on
    each side and is adjusted by ``check_melting_temp`` until that helper
    accepts it or the primer grows past 45 bases (a message is printed and
    the last primer built is returned).

    DNA sequence needs to start with the first residue of the protein
    (no promoter, etc).

    Arguments:
    ----------
    sequence : str
        DNA sequence
    wt_res : char
        single letter amino acid code of wildtype residue to be mutated
    res_num : int
        residue id number of residue to be mutated
    mut_res : char
        single letter amino acid code of mutant residue
    first_res : int
        residue id number of first residue in sequence (default = 1)

    Returns:
    --------
    forward_primer : str
        lower-case nucleotide sequence
    reverse_primer : str
        sequence of ``DNASequence(forward_primer).rc()``

    Raises:
    -------
    IOError
        if ``res_num`` lies outside the translated sequence or the residue
        found there is not ``wt_res``
    """
    orig_code = genetic_code(11)
    sequence = sequence.upper()
    aa_sequence = orig_code.translate(sequence).sequence

    res_ix = res_num - first_res
    # Reject out-of-range positions explicitly: a negative index would
    # otherwise silently wrap around to the end of the protein.
    in_range = 0 <= res_ix < len(aa_sequence)
    if not in_range or not str(wt_res) == aa_sequence[res_ix]:
        raise IOError("Desired residue not found -- check wildtype residue name and id, and first residue id")

    # start of codon of residue of interest is at (res_num - first_res)*3
    codon_start = res_ix * 3
    codon_end = codon_start + 3
    wt_codon = DNASequence(sequence[codon_start:codon_end])
    mut_codons = orig_code.synonyms[mut_res]

    mut_codon = None
    for codon in mut_codons:
        # distance * 3 == 1 is the "single base change" test; when several
        # codons qualify the last one wins (no break, as before)
        if wt_codon.distance(DNASequence(codon))*3 == 1:
            mut_codon = codon

    if not mut_codon:
        print("Cannot make desired mutant with a single base change")
        mut_codon = make_mutant(wt_codon, mut_codons)

    good_melting_temp = False
    start_ix = max(0, codon_start - 11)
    end_ix = min(len(sequence), codon_end + 11)

    while not good_melting_temp:
        if end_ix - start_ix > 45:
            print("Acceptable melting temp was not found")
            break
        forward_primer = (sequence[start_ix:codon_start] + mut_codon +
                          sequence[codon_end:end_ix])
        forward_primer = forward_primer.lower()
        # the helper reports success and the next window to try
        good_melting_temp, start_ix, end_ix = check_melting_temp(
            forward_primer, start_ix, end_ix, len(sequence))

    forward_sequence = DNASequence(forward_primer)
    reverse_sequence = forward_sequence.rc()
    reverse_primer = reverse_sequence.sequence
    return forward_primer, reverse_primer
def dna_to_aa(sequence, try_frames=False):
    """
    Translate a DNA nucleotide sequence into an amino acid sequence.

    Arguments:
    ----------
    sequence : str
        DNA nucleotide sequence

    Optional:
    ---------
    try_frames : Bool
        if True, all six reading frames are translated and the translation
        containing the fewest '*' characters is returned (the first such
        frame on ties)
        default = False

    Returns:
    --------
    aa_sequence : str
        sequence of one-letter amino acid codes
    """
    code = genetic_code(11)

    if try_frames:
        frames = code.translate_six_frames(DNASequence(sequence))
        stop_counts = [frame.sequence.count('*') for frame in frames]
        fewest_stops = min(stop_counts)
        return frames[stop_counts.index(fewest_stops)].sequence

    return code.translate(sequence).sequence
def check_dna_chars_primers(header, mapping_data, errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Empty primer cells are reported as errors as well. The "ReversePrimer"
    column is checked whenever it is present in the header; the
    "LinkerPrimerSequence" column is checked unless disable_primer_check
    is True.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; messages are appended to it and it is returned
    disable_primer_check:  If True, disables tests for valid primer sequences.
    """
    # IUPAC characters plus ',' (the comma is accepted inside a primer cell)
    allowed_chars = DNASequence.iupac_characters()
    allowed_chars.add(',')

    # Columns are located by name, in case the user does not have the
    # fields in the proper order (that generates a separate error)
    primer_fields = ["ReversePrimer"]
    if not disable_primer_check:
        primer_fields.append("LinkerPrimerSequence")
    columns = [col for col, name in enumerate(header)
               if name in primer_fields]

    # Correction factor for header being the first line
    header_offset = 1

    # First pass: missing data
    for row_ix, row in enumerate(mapping_data):
        for col in columns:
            if len(row[col]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + header_offset, col))

    # Second pass: non-DNA characters. Every offending character adds its
    # own (identical) message for the cell.
    for row_ix, row in enumerate(mapping_data):
        for col in columns:
            for nucleotide in row[col]:
                if nucleotide not in allowed_chars:
                    errors.append(
                        "Invalid DNA sequence detected: %s\t%d,%d" %
                        (row[col], row_ix + header_offset, col))

    return errors
def check_dna_chars_primers(header, mapping_data, errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Also flags empty primer cells as errors. "ReversePrimer" is always
    inspected when the header contains it; "LinkerPrimerSequence" is only
    inspected when disable_primer_check is False.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors (extended in place and returned)
    disable_primer_check:  If True, disables tests for valid primer sequences.
    """
    valid_chars = DNASequence.iupac_characters()
    valid_chars.add(',')

    # Detect fields directly by name; a wrong column order is reported
    # elsewhere
    fields = ["ReversePrimer"]
    if not disable_primer_check:
        fields.append("LinkerPrimerSequence")

    target_columns = []
    for position, field_name in enumerate(header):
        if field_name in fields:
            target_columns.append(position)

    # Reported row numbers are shifted by one because the header is the
    # first line
    row_shift = 1

    # Check for missing data
    for line_no, fields_in_row in enumerate(mapping_data):
        for column in target_columns:
            if len(fields_in_row[column]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (line_no + row_shift, column))

    # Check for non-DNA characters (one message per invalid character)
    for line_no, fields_in_row in enumerate(mapping_data):
        for column in target_columns:
            primer = fields_in_row[column]
            for char in primer:
                if char in valid_chars:
                    continue
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (primer, line_no + row_shift, column))

    return errors
def check_dna_chars_bcs(header, mapping_data, errors, has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    Empty barcode cells are reported as missing; non-empty cells are
    checked character by character against
    DNASequence.iupac_standard_characters(). Nothing is checked when
    has_barcodes is False.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors (extended in place and returned)
    has_barcodes: If True, the "BarcodeSequence" column is checked.
    """
    allowed_chars = DNASequence.iupac_standard_characters()

    # Locate the column by name, in case the user does not have the fields
    # in the proper order (that generates a separate error)
    wanted_fields = ["BarcodeSequence"] if has_barcodes else []
    columns = [col for col, name in enumerate(header)
               if name in wanted_fields]

    # Correction factor for header being the first line
    header_offset = 1

    for row_ix, row in enumerate(mapping_data):
        for col in columns:
            barcode = row[col]

            if len(barcode) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + header_offset, col))
                continue

            # each invalid character adds its own message for the cell
            for nucleotide in barcode:
                if nucleotide not in allowed_chars:
                    errors.append(
                        "Invalid DNA sequence detected: %s\t%d,%d" %
                        (barcode, row_ix + header_offset, col))

    return errors
def _make_mutant(wt_codon, mut_codons):
    """
    Pick the mutant codon closest to the wild type codon.

    Used when the mutation requires more than 1 nucleotide change. Prints
    how many base changes the chosen codon needs.

    Arguments:
    ----------
    wt_codon
        codon from the wild type sequence for the residue to be mutated;
        must provide ``distance()`` (callers in this file pass a
        DNASequence)
    mut_codons : list(str)
        all codons that translate to desired mutant residue

    Returns:
    --------
    mut_codon : str
        codon selected from mut_codons which requires the fewest changes
        from the wild type codon (the first such codon on ties)
    """
    candidates = [DNASequence(codon) for codon in mut_codons]
    distances = [wt_codon.distance(candidate) for candidate in candidates]

    smallest = min(distances)
    # distance is scaled by the codon length (3) to report changed bases
    changed_bp = int(smallest*3)
    print("This mutant required "+str(changed_bp)+"bp modifications\n")

    # choose the codon that requires fewest changes
    best_ix = distances.index(smallest)
    return candidates[best_ix].sequence
def get_consensus(fasta_tempfile, min_consensus):
    """
    Returns consensus sequence from a set of sequences

    fasta_tempfile should be in the following format:

    >random_bc|number
    seq
    >random_bc|number
    seq
    ....

    number = number of times this seq has appeared with this random_barcode

    At every position the base with the highest summed count is chosen.
    When several bases share that highest summed count, the base carried
    by the single record with the largest count at that position wins.

    Parameters
    ----------
    fasta_tempfile:
        fasta input, handed to parse_fasta
    min_consensus: float
        minimum per-position score; the score is computed as
        10.0 * (summed count of the chosen base) / (number of records)

    Returns
    ----------
    consensus_seq: string
        consensus sequence for the given sequences

    Raises
    ----------
    SeqLengthMismatchError
        if the sequences do not all have the same length
    LowConsensusScoreError
        if the score of any position is below min_consensus
    """
    seqs = list()
    counts = list()

    for label, seq in parse_fasta(fasta_tempfile):
        RE_output = search(r'\w+\|(\d+)', label)
        counts.append(int(RE_output.group(1)))
        seqs.append(seq)

    length = len(seqs[0])
    number_of_seqs = len(seqs)

    for seq in seqs:
        if len(seq) != length:
            raise SeqLengthMismatchError()

    iupac_chars = list(DNASequence.iupac_characters())

    # position -> base -> summed count of the records with that base
    freq_this_pos_this_base = dict()
    # position -> base -> largest count of a single record with that base
    count_of_seq_with_max_count = dict()

    for x in range(length):
        freqs = dict.fromkeys(iupac_chars, 0)
        max_counts = dict.fromkeys(iupac_chars, 0)
        for seq_count, seq in zip(counts, seqs):
            base = seq[x]
            freqs[base] += seq_count
            if seq_count > max_counts[base]:
                max_counts[base] = seq_count
        freq_this_pos_this_base[x] = freqs
        count_of_seq_with_max_count[x] = max_counts

    consensus = list()
    for index in range(length):
        sorted_bases = sorted(
            freq_this_pos_this_base[index].items(),
            key=lambda item: item[1])
        max_base, max_freq = sorted_bases[-1]

        # Tie-break among bases sharing the top frequency. The lookup must
        # use the alignment position (index); indexing by the rank of the
        # base in sorted_bases compared counts from unrelated positions.
        max_counts = count_of_seq_with_max_count[index]
        for b, n in sorted_bases:
            if n == max_freq and max_counts[b] > max_counts[max_base]:
                max_base = b

        # NOTE(review): the score divides a summed count by the number of
        # records and scales by 10.0; kept as is -- confirm intent.
        score = 10.0 * max_freq / number_of_seqs
        if score < min_consensus:
            raise LowConsensusScoreError()
        consensus.append(max_base)

    consensus_seq = ''.join(map(str, consensus))
    return consensus_seq
def run_ampliconnoise(
    mapping_fp,
    output_dir,
    command_handler,
    params,
    qiime_config,
    logger=None,
    status_update_callback=print_to_stdout,
    chimera_alpha=-3.8228,
    chimera_beta=0.6200,
    sff_txt_fp=None,
    numnodes=2,
    suppress_perseus=True,
    output_filepath=None,
    platform="flx",
    seqnoise_resolution=None,
    truncate_len=None,
):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:

    1. Split input sff.txt file into one file per sample

    2. Run scripts required for PyroNoise

    3. Run scripts required for SeqNoise

    4. Run scripts required for Perseus (chimera removal)

    5. Merge output files into one file similar to the output of
       split_libraries.py

    The function only builds the list of shell commands; they are executed
    by command_handler(commands, status_update_callback, logger=...,
    close_logger_on_success=...).

    mapping_fp: mapping file path; must provide the SampleID,
     BarcodeSequence and LinkerPrimerSequence columns, and all samples must
     share one primer (RuntimeError otherwise)
    output_dir: directory the commands run in; it is created, made
     absolute, and the current working directory is changed to it (and not
     changed back)
    params, qiime_config: only used to create a WorkflowLogger when logger
     is None
    chimera_alpha, chimera_beta: passed to Class.pl (Perseus step)
    sff_txt_fp: sff.txt input; a relative path is resolved against the
     directory the function was called from
    numnodes: value for "mpirun -np"
    suppress_perseus: if True the Perseus/Class.pl/FilterGoodClass.pl
     commands are skipped
    output_filepath should be absolute
    platform: "flx" or "titanium" (RuntimeError otherwise)
    seqnoise_resolution should be string; defaults to "30.0" (flx) or
     "25.0" (titanium)
    truncate_len should be string; defaults to "220" (flx) or "400"
     (titanium)

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting
    """
    # map_data is indexed and measured below, so it is fully read by the
    # time parse_mapping_file returns and the handle can be closed here
    with open(mapping_fp, "U") as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == "flx":
            seqnoise_resolution = "30.0"
        elif platform == "titanium":
            seqnoise_resolution = "25.0"
        else:
            raise RuntimeError("seqnoise_resolution not set, and no" +
                               " default for platform " + platform)

    if truncate_len is None:
        if platform == "flx":
            truncate_len = "220"
        elif platform == "titanium":
            truncate_len = "400"
        else:
            raise RuntimeError("truncate_len not set, and no" +
                               " default for platform " + platform)

    # degenerate IUPAC characters are expanded to "[...]" character classes
    degeneracies = list(DNASequence.iupac_degeneracies().items())

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[headers.index("SampleID")])
        bc_seqs.append(row[headers.index("BarcodeSequence")])
        primer = row[headers.index("LinkerPrimerSequence")]
        for char, bases in degeneracies:
            primer = primer.replace(char, "[" + "".join(bases) + "]")
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError("Error: only one primer per mapping file supported.")
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    # Make the directory absolute BEFORE changing into it; a relative
    # output_dir would otherwise be resolved a second time (inside itself)
    # when map.csv is opened and by the "cd" command below.
    output_dir = os.path.abspath(output_dir)
    os.chdir(output_dir)
    with open(os.path.join(output_dir, "map.csv"), "w") as fh:
        for name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(name + "," + bc_seq + "\n")

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = "_" + truncate_len
    if suppress_perseus:
        fasta_result_names = [name + post_pyro_tail + "_seqnoise_cd.fa"
                              for name in sample_names]
    else:
        fasta_result_names = [name + "_Good.fa" for name in sample_names]

    cmd = "cd " + output_dir  # see also os.chdir above
    commands.append([("change to output dir", cmd)])

    cmd = "echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt"
    commands.append([("confirm pyro lookup filepath environment variable",
                      cmd)])

    cmd = ("SplitKeys.pl " + one_primer + " map.csv < " +
           os.path.join(called_dir, sff_txt_fp) +
           " > splitkeys_log.txt 2> unassigned.fna")
    commands.append([("split sff.txt via barcodes (keys)", cmd)])

    mpirun = "mpirun -np " + str(numnodes)

    for i, sample_name in enumerate(sample_names):
        # file name stem used by every step after truncation
        truncated = sample_name + post_pyro_tail

        if platform == "flx":
            clean_script = "Clean360.pl "
        elif platform == "titanium":
            clean_script = "CleanMinMax.pl "
        else:
            raise RuntimeError("platform " + platform + " not supported")
        cmd = (clean_script + one_primer + " " + sample_name + " < " +
               sample_name + ".raw")
        commands.append([("clean flows " + sample_name, cmd)])

        cmd = (mpirun + " PyroDist -in " + sample_name + ".dat -out " +
               sample_name + " > " + sample_name + ".pdout")
        commands.append([("pyrodist " + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + ".fdist -out " + sample_name +
               " > " + sample_name + ".fcout")
        commands.append([("fcluster pyrodist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = (mpirun + " PyroNoise -din " + sample_name + ".dat -out " +
               sample_name + "_pyronoise " + "-lin " +
               sample_name + ".list -s 60.0 -c 0.01 > " +
               sample_name + "_pyronoise.pnout")
        commands.append([("pyronoise " + sample_name, cmd)])

        cmd = ("Parse.pl " + bc_seqs[i] + one_primer + " " + truncate_len +
               " < " + sample_name + "_pyronoise_cd.fa" + " > " +
               truncated + ".fa")
        commands.append([("truncate " + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = (mpirun + " SeqDist -in " + truncated + ".fa > " +
               truncated + ".seqdist")
        commands.append([("seqdist " + sample_name, cmd)])

        cmd = ("FCluster -in " + truncated + ".seqdist -out " +
               truncated + "fcl > " + truncated + ".fcout")
        commands.append([("fcluster seqdist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0
        # -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = (mpirun + " SeqNoise -in " + truncated + ".fa -din " +
               truncated + ".seqdist -out " + truncated +
               "_seqnoise -lin " + truncated + "fcl.list -min " +
               sample_name + "_pyronoise" + ".mapping -s " +
               seqnoise_resolution + " -c 0.08 > " + truncated + ".snout")
        commands.append([("seqnoise " + sample_name, cmd)])

        if not suppress_perseus:
            cmd = ("Perseus -sin " + truncated + "_seqnoise_cd.fa > " +
                   sample_name + ".per")
            commands.append([("Perseus " + sample_name, cmd)])

            cmd = ("Class.pl " + sample_name + ".per " +
                   str(chimera_alpha) + " " + str(chimera_beta) +
                   " > " + sample_name + ".class")
            commands.append([("Class.pl " + sample_name, cmd)])

            cmd = ("FilterGoodClass.pl " + truncated + "_seqnoise_cd.fa " +
                   sample_name + ".class 0.5 > " + sample_name +
                   "_Chi.fa 2> " + sample_name + "_Good.fa")
            commands.append([("FilterGoodClass " + sample_name, cmd)])

        cmd = "unweight_fasta.py -i %s -o %s -l %s" % (
            fasta_result_names[i], sample_name + "_unw.fna", sample_name)
        commands.append([("unweight fasta " + sample_name, cmd)])

    cmd = ("cat " +
           " ".join([name + "_unw.fna" for name in sample_names]) +
           " > " + output_filepath)  # this should be an abs filepath
    commands.append([("cat into one fasta file", cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200,
                      sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None,
                      platform='flx', seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:

    1. Split input sff.txt file into one file per sample

    2. Run scripts required for PyroNoise

    3. Run scripts required for SeqNoise

    4. Run scripts required for Perseus (chimera removal)

    5. Merge output files into one file similar to the output of
       split_libraries.py

    The shell commands are only assembled here; command_handler is called
    once at the end with the full list, status_update_callback, the logger
    and close_logger_on_success.

    mapping_fp: mapping file path; the SampleID, BarcodeSequence and
     LinkerPrimerSequence columns are read and exactly one distinct primer
     is allowed (RuntimeError otherwise)
    output_dir: created, converted to an absolute path and made the
     current working directory (which is not restored afterwards)
    params, qiime_config: handed to WorkflowLogger when no logger is given
    chimera_alpha, chimera_beta: arguments of the Class.pl command
    sff_txt_fp: sff.txt input, joined onto the directory the function was
     called from
    numnodes: process count for the mpirun commands
    suppress_perseus: if True no Perseus commands are generated
    output_filepath should be absolute
    platform: 'flx' or 'titanium'; anything else raises RuntimeError
    seqnoise_resolution should be string; default '30.0' for flx and
     '25.0' for titanium
    truncate_len should be string; default '220' for flx and '400' for
     titanium

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting
    """
    # The parsed data is used with len() and indexing below, so nothing is
    # read from the handle after parse_mapping_file returns.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # each degenerate IUPAC character becomes a '[...]' character class
    degeneracies = list(DNASequence.iupac_degeneracies().items())

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[headers.index('SampleID')])
        bc_seqs.append(row[headers.index('BarcodeSequence')])
        primer = row[headers.index('LinkerPrimerSequence')]
        for char, bases in degeneracies:
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    # Resolve output_dir before os.chdir: once inside the directory, a
    # relative output_dir would point at a nested path that does not exist
    # (map.csv below, and the 'cd' command).
    output_dir = os.path.abspath(output_dir)
    os.chdir(output_dir)
    with open(os.path.join(output_dir, 'map.csv'), 'w') as fh:
        for name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [name + post_pyro_tail + '_seqnoise_cd.fa'
                              for name in sample_names]
    else:
        fasta_result_names = [name + '_Good.fa' for name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    mpirun = 'mpirun -np ' + str(numnodes)

    for i, sample_name in enumerate(sample_names):
        # stem shared by all files produced after truncation
        truncated = sample_name + post_pyro_tail

        if platform == 'flx':
            clean_script = 'Clean360.pl '
        elif platform == 'titanium':
            clean_script = 'CleanMinMax.pl '
        else:
            raise RuntimeError('platform ' + platform + ' not supported')
        cmd = clean_script + one_primer + ' ' + sample_name + ' < ' +\
            sample_name + '.raw'
        commands.append([('clean flows ' + sample_name, cmd)])

        cmd = mpirun + ' PyroDist -in ' +\
            sample_name + '.dat -out ' +\
            sample_name + ' > ' + sample_name + '.pdout'
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = 'FCluster -in ' + sample_name + '.fdist -out ' + sample_name +\
            ' > ' + sample_name + '.fcout'
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = mpirun + ' PyroNoise -din ' +\
            sample_name + '.dat -out ' +\
            sample_name + '_pyronoise ' + '-lin ' +\
            sample_name + '.list -s 60.0 -c 0.01 > ' +\
            sample_name + '_pyronoise.pnout'
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seqs[i] + one_primer + ' ' + truncate_len +\
            ' < ' + sample_name + '_pyronoise_cd.fa' + ' > ' +\
            truncated + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = mpirun + ' SeqDist -in ' + truncated +\
            '.fa > ' + truncated + '.seqdist'
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = 'FCluster -in ' + truncated + '.seqdist -out ' +\
            truncated + 'fcl > ' +\
            truncated + '.fcout'
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0
        # -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = mpirun + ' SeqNoise -in ' + truncated +\
            '.fa -din ' + truncated + '.seqdist -out ' +\
            truncated +\
            '_seqnoise -lin ' + truncated + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            truncated + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:
            cmd = 'Perseus -sin ' + truncated +\
                '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + truncated +\
                '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' %\
            (fasta_result_names[i], sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([name + '_unw.fna' for name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def make_single_mutant(self, wt_res, res_num, mut_res):
    """
    Build forward and reverse primers for an amino acid point mutant.

    Looks for a mutant codon that differs from the wildtype codon by a
    single nucleotide; if none exists a warning is printed and
    ``self._make_mutant`` selects the codon. The primer starts as the
    mutant codon plus up to 11 flanking nucleotides on each side and is
    adjusted by ``self._check_melting_temp`` until that helper accepts it
    or the primer grows past 45 nucleotides (a message is printed and the
    last primer built is returned).

    The DNA sequence (``self.sequence``), its translation
    (``self.aa_sequence``), the id of the first residue
    (``self.first_res``) and the genetic code (``self.orig_code``) are
    read from the instance. The DNA sequence needs to start with the first
    residue of the protein (no promoter, etc).

    Arguments:
    ----------
    wt_res : char
        single letter amino acid code of wildtype residue to be mutated
    res_num : int
        residue id number of residue to be mutated
    mut_res : char
        single letter amino acid code of mutant residue

    Returns:
    --------
    forward_primer : str
        lower-case nucleotide sequence
    reverse_primer : str
        sequence of ``DNASequence(forward_primer).rc()``

    Raises:
    -------
    IOError
        if ``res_num`` lies outside the translated sequence or the residue
        found there is not ``wt_res``
    """
    aa_sequence = self.aa_sequence
    sequence = self.sequence
    first_res = self.first_res
    orig_code = self.orig_code

    res_ix = res_num - first_res
    # Reject out-of-range positions explicitly: a negative index would
    # otherwise silently wrap around to the end of the protein.
    in_range = 0 <= res_ix < len(aa_sequence)
    if not in_range or not str(wt_res) == aa_sequence[res_ix]:
        raise IOError("Desired residue not found -- check wildtype residue name and id, and first residue id")

    # start of codon of residue of interest is at (res_num - first_res)*3
    codon_start = res_ix * 3
    codon_end = codon_start + 3
    wt_codon = DNASequence(sequence[codon_start:codon_end])
    mut_codons = orig_code.synonyms[mut_res]

    mut_codon = None
    for codon in mut_codons:
        # distance * 3 == 1 is the "single base change" test; when several
        # codons qualify the last one wins (no break, as before)
        if wt_codon.distance(DNASequence(codon))*3 == 1:
            mut_codon = codon

    if not mut_codon:
        print("Cannot make desired mutant with a single base change")
        mut_codon = self._make_mutant(wt_codon, mut_codons)

    good_melting_temp = False
    start_ix = max(0, codon_start - 11)
    end_ix = min(len(sequence), codon_end + 11)

    while not good_melting_temp:
        if end_ix - start_ix > 45:
            print("Acceptable melting temp was not found")
            break
        forward_primer = (sequence[start_ix:codon_start] + mut_codon +
                          sequence[codon_end:end_ix])
        forward_primer = forward_primer.lower()
        # the helper reports success and the next window to try
        good_melting_temp, start_ix, end_ix = self._check_melting_temp(
            forward_primer, start_ix, end_ix, len(sequence))

    forward_sequence = DNASequence(forward_primer)
    reverse_sequence = forward_sequence.rc()
    reverse_primer = reverse_sequence.sequence
    return forward_primer, reverse_primer