def WriteOverviewWeights(fields, table, options):
    """Report codons whose weight is 1.0 in exactly one of two columns.

    Every pair of data columns ``x < y`` (column 0 of each ``table`` row
    holds the codon) is compared.  A codon is recorded when its weight
    equals 1.0 in one column of the pair but not in the other.  Recorded
    codons are grouped by the amino acid returned by
    ``Genomics.MapCodon2AA`` and handed to ``WriteChanged``; whatever that
    returns is accumulated and finally passed to ``WriteOutput``.

    Arguments
    ---------
    fields : sequence
        Column labels; ``fields[i]`` labels column ``i`` of ``table``.
    table : iterable of sequences
        Rows of the form ``(codon, weight_1, weight_2, ...)``.
    options : object
        Passed through to the header/changes/output writers.
    """
    output = []
    WriteHeader(options)

    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):
            changed = {}
            for row in table:
                codon = row[0]
                w1 = row[x]
                w2 = row[y]
                only_first = w1 == 1.0 and w2 != 1.0
                only_second = w1 != 1.0 and w2 == 1.0
                if not (only_first or only_second):
                    continue

                aa = Genomics.MapCodon2AA(codon)
                # store the weight from the column in which the codon
                # does NOT have weight 1.0
                other_weight = w2 if only_first else w1
                changed.setdefault(aa, []).append(
                    (only_first, other_weight, codon))

            # The original passed the undefined name ``changes`` here, which
            # raised NameError; the dictionary built above is ``changed``.
            # NOTE(review): the sibling WriteOverviewFrequencies calls
            # ``WriteChanges`` with the same argument list - confirm that
            # ``WriteChanged`` is the intended callee.
            output += WriteChanged(fields[x], fields[y], changed, options)

    WriteOutput(output, options)
def translate(s):
    """Replace ``s.mString`` by its codon-wise translation.

    The string is cut into consecutive chunks of three characters (the
    last chunk may be shorter), each chunk is mapped through
    ``Genomics.MapCodon2AA`` and the results are concatenated and written
    back to ``s.mString``.  Nothing is returned.
    """
    nucleotides = s.mString
    s.mString = "".join(
        Genomics.MapCodon2AA(nucleotides[start:start + 3])
        for start in range(0, len(nucleotides), 3))
def loadSequence(self, sequence):
    """Collect amino acid counts for a nucleotide sequence.

    Delegates to ``SequenceProperties.loadSequence`` first, then resets
    ``self.mCountsAA`` to a zero count for every letter of
    ``Bio.Alphabet.IUPAC.extended_protein`` and increments the count of
    the residue that ``Genomics.MapCodon2AA`` returns for each
    consecutive 3-character chunk of ``sequence``.
    """
    SequenceProperties.loadSequence(self, sequence)

    # start every known residue at zero
    self.mCountsAA = dict.fromkeys(
        Bio.Alphabet.IUPAC.extended_protein.letters, 0)

    for start in range(0, len(sequence), 3):
        residue = Genomics.MapCodon2AA(sequence[start:start + 3])
        self.mCountsAA[residue] += 1
def loadSequence(self, sequence, seqtype="na"):
    """Collect amino acid counts for a coding nucleotide sequence.

    Delegates to ``SequenceProperties.loadSequence`` first.  The sequence
    length must be a multiple of three, otherwise ValueError is raised.
    ``self.mCountsAA`` is then reset to a zero count for every letter of
    ``Bio.Alphabet.IUPAC.extended_protein`` and the residue returned by
    ``Genomics.MapCodon2AA`` is counted for each codon.
    """
    SequenceProperties.loadSequence(self, sequence, seqtype)

    length = len(sequence)
    if length % 3 != 0:
        raise ValueError(
            '''sequence length is not a multiple of 3 (length=%i)''' %
            (length))

    # start every known residue at zero
    self.mCountsAA = dict.fromkeys(
        Bio.Alphabet.IUPAC.extended_protein.letters, 0)

    for start in range(0, length, 3):
        residue = Genomics.MapCodon2AA(sequence[start:start + 3])
        self.mCountsAA[residue] += 1
def filterMali(mali, method="3rd"):
    """Restrict a codon multiple alignment to selected columns, in place.

    Valid methods are

    3rd
        keep every third column, starting at column index 2.
    4d
        translate every sequence codon by codon, find the codon columns
        in which (ignoring gap characters) all sequences carry the same
        amino acid and that amino acid has a degeneracy of at least four
        in ``Genomics.DegeneracyAA``, and keep one column per such codon.

    The selected columns are passed to ``mali.takeColumns``; nothing is
    returned.

    Raises
    ------
    ValueError
        If ``method`` is not one of the valid methods.  (The original
        raised a plain string, which is itself a TypeError on Python 3.)
    """
    if method not in ("3rd", "4d"):
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        columns = list(range(2, mali.getWidth(), 3))
    else:
        # translate each sequence into a parallel amino acid alignment
        trans_mali = Mali.Mali()
        for identifier, seq in mali.items():
            sequence = seq.mString
            length = len(sequence)
            peptide = "".join(
                Genomics.MapCodon2AA(sequence[start:start + 3])
                for start in range(0, length, 3))
            trans_mali.addSequence(identifier, 0, length, peptide)

        # get four-fold (or higher) degenerate amino acids
        gap_chars = set(mali.mGapChars)
        columns = []
        for c, aa_column in enumerate(trans_mali.getColumns()):
            chars = set(aa_column) - gap_chars
            if len(chars) != 1:
                continue
            char = chars.pop().upper()
            try:
                deg = Genomics.DegeneracyAA[char]
            except KeyError:
                # residue without degeneracy information - skip column
                continue
            if deg >= 4:
                # NOTE(review): this selects the FIRST nucleotide column of
                # the codon (c * 3); four-fold degenerate sites are normally
                # the third position (c * 3 + 2).  Behaviour kept as in the
                # original - confirm which one is intended.
                columns.append(c * 3)

    mali.takeColumns(columns)
def WriteOverviewFrequencies(fields, table, options):
    """Report amino acids whose preferred codon differs between two columns.

    Every pair of data columns ``x < y`` (column 0 of each ``table`` row
    holds the codon) is compared.  Codons are grouped by the amino acid
    returned by ``Genomics.MapCodon2AA``; within each group the codon with
    the highest frequency is determined separately for both columns.  If
    the two preferred entries differ, both are recorded and handed to
    ``WriteChanges``; whatever that returns is accumulated and finally
    passed to ``WriteOutput``.

    Arguments
    ---------
    fields : sequence
        Column labels; ``fields[i]`` labels column ``i`` of ``table``.
    table : iterable of sequences
        Rows of the form ``(codon, frequency_1, frequency_2, ...)``.
    options : object
        Passed through to the header/changes/output writers.
    """
    WriteHeader(options)
    output = []

    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):

            # collect frequencies per amino acid
            frequencies = {}
            for row in table:
                codon = row[0]
                aa = Genomics.MapCodon2AA(codon)
                frequencies.setdefault(aa, []).append(
                    (codon, row[x], row[y]))

            # sort for both genomes, and check if preference has changed
            changed = {}
            for aa, codons in frequencies.items():
                # ``sort(cmp_function)`` only exists on Python 2; a key
                # sort is stable as well and therefore picks the same
                # last element, including for ties.  The second sort
                # deliberately runs on the result of the first.
                codons.sort(key=lambda entry: entry[1])
                pref_codon1 = codons[-1]
                codons.sort(key=lambda entry: entry[2])
                pref_codon2 = codons[-1]

                if pref_codon1 == pref_codon2:
                    continue

                changed[aa] = [(True, pref_codon1[2], pref_codon1[0]),
                               (False, pref_codon2[1], pref_codon2[0])]

            output += WriteChanges(fields[x], fields[y], changed, options)

    WriteOutput(output, options)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads FASTA records from stdin and writes one shuffled record per
    input record to ``options.stdout``:

    * default: the characters of the sequence are shuffled.
    * ``--codons``: the shuffle is repeated until no in-frame codon of the
      shuffled sequence is one of ``options.stop_codons``.
    * ``--conserve-aminos``: every codon is replaced by a randomly chosen
      codon for the same amino acid; codons whose amino acid is not in
      the amino-acid-to-codon map are dropped.  With
      ``--biased-codon-usage`` the choice is weighted by the reference
      frequencies, blended with the bulk (or uniform) usage according to
      ``--bias``.

    Raises
    ------
    ValueError
        If ``--conserve-aminos`` and ``--bias`` are given without
        ``--biased-codon-usage`` (there is nothing to bias towards; the
        original failed with a NameError in this situation).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-c", "--codons", dest="codons", action="store_true",
        help="make sure that shuffled sequences only contain valid codons.")

    parser.add_option(
        "-a", "--conserve-aminos", dest="conserve_aminos",
        action="store_true",
        help="conserve amino acids.")

    parser.add_option(
        "-b", "--bias", dest="bias", type="float",
        help="introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0.")

    parser.add_option(
        "-i", "--biased-codon-usage", dest="filename_biased_codon_usage",
        type="string",
        help="Filename with reference codon usage table for biased codon usage.")

    parser.add_option(
        "-u", "--bulk-codon-usage", dest="filename_bulk_codon_usage",
        type="string",
        help="Filename with reference codon usage table for unbiased codon usage.")

    parser.set_defaults(
        codons=False,
        conserve_aminos=False,
        bias=0.0,
        filename_biased_codon_usage=None,
        filename_bulk_codon_usage=None,
        stop_codons=("TAG", "TAA", "TGA"),
        precision=10000,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    # get map of amino acids to codons
    map_aa2codons = Genomics.GetMapAA2Codons()

    # for codon based shuffling: build ranges based on strength of bias and
    # on reference codon usage. Bias switches from completely biased to
    # unbiased. Unbiased is uniform usage.
    codon_ranges = None
    if options.filename_biased_codon_usage:
        with open(options.filename_biased_codon_usage, "r") as infile:
            map_codon2frequency = IOTools.ReadMap(
                infile, map_functions=(str, float), has_header=True)

        map_codon2frequency_bulk = None
        if options.filename_bulk_codon_usage:
            with open(options.filename_bulk_codon_usage, "r") as infile:
                map_codon2frequency_bulk = IOTools.ReadMap(
                    infile, map_functions=(str, float), has_header=True)

        codon_ranges = {}
        for aa, synonymous in map_aa2codons.items():
            # cumulative upper bounds, scaled to 0..precision
            bounds = []
            upper = 0
            for codon in synonymous:
                if map_codon2frequency_bulk is not None:
                    u = map_codon2frequency_bulk[codon]
                else:
                    # uniform usage
                    u = 1.0 / len(synonymous)
                g = map_codon2frequency[codon]
                f = g + (u - g) * (1.0 - options.bias)
                upper += f * options.precision
                bounds.append(upper)
            codon_ranges[aa] = bounds

    use_codon_ranges = bool(
        options.bias or options.filename_biased_codon_usage)
    if options.conserve_aminos and use_codon_ranges and codon_ranges is None:
        raise ValueError(
            "--bias requires a reference table given with "
            "--biased-codon-usage")

    while 1:
        # the iterator is expected to signal the end of input by
        # returning None; StopIteration is accepted as well.
        try:
            cur_record = iterator.next()
        except StopIteration:
            break
        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if options.conserve_aminos:
            n = []
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                if aa not in map_aa2codons:
                    continue

                synonymous = map_aa2codons[aa]
                if use_codon_ranges:
                    # get random number from 0 to precision
                    v = random.randint(0, options.precision)
                    # find the corresponding interval; the last codon
                    # catches everything above the final bound
                    bounds = codon_ranges[aa]
                    choice = 0
                    while choice < len(synonymous) - 1:
                        if v < bounds[choice]:
                            break
                        choice += 1
                else:
                    choice = random.randint(0, len(synonymous) - 1)

                n.append(synonymous[choice])

            sequence = "".join(n)

        else:
            sequence = list(sequence)
            if options.codons:
                while 1:
                    random.shuffle(sequence)
                    # ``sequence`` is a list of characters here, so each
                    # codon has to be joined before it can be compared with
                    # the stop codon strings (the original compared a list
                    # slice and therefore never detected a stop codon).
                    has_stop = any(
                        "".join(sequence[x:x + 3]).upper()
                        in options.stop_codons
                        for x in range(0, l, 3))
                    if not has_stop:
                        break
            else:
                random.shuffle(sequence)
            sequence = "".join(sequence)

        options.stdout.write(
            ">%s\n%s\n" % (cur_record.title, "".join(sequence)))

    E.Stop()
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2, max_advance=2):
    """Align a wobble sequence position by position to a cds sequence.

    Advance in codons in seq_wobble and match to nucleotides in seq_cds.
    Takes care of frameshifts: whenever the residue pair at the current
    positions does not score positively, a window around the current
    position is re-aligned with ``AlignExhaustive`` and the result is
    spliced into ``map_p2c``.

    The original author notes that, due to alignlib, everything is in
    one-based coordinates.  NOTE(review): the loop below starts at index 0
    for both sequences - confirm the coordinate convention against
    alignlib.

    Arguments
    ---------
    seq_wobble, seq_cds, seq_peptide :
        alignlib sequence objects (the methods ``asString``, ``asChar``,
        ``asResidue`` and ``getLength`` are used).
    map_p2c :
        alignlib alignment; cleared on entry and filled with the aligned
        (wobble position, cds position) pairs.
    options :
        provides ``loglevel`` and ``stdlog`` for diagnostic output.
    diag_width, max_advance :
        not used in this function body.

    Raises
    ------
    ValueError
        If the same re-alignment window is requested twice in a row, if
        aligned residues mismatch, or if the re-alignment does not cover
        the designated window.
    """
    map_p2c.clear()

    # gop/gep are assigned but not used below
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window; used to
    # detect that re-alignment is not making progress
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)

        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            if (x % 3 == 0):
                # at the start of a wobble codon: log the cds codon at y
                # together with its translation and the target residue
                c = seq_cds.asChar(y) + seq_cds.asChar(y + 1) + seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c,
                            options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end, str(
                        alignlib_lite.py_AlignmentFormatExplicit(
                            tmp_map_p2c, wobble_fragment, cds_fragment))))
                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)

            # number of consecutive fragment rows without an aligned column
            ngap = 0
            last_x, last_y = None, None

            # copy the fragment alignment back, shifting the fragment
            # coordinates by the window start; x and y end up at the last
            # aligned pair
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                             xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): last_x is None if three unaligned rows occur
                # before any aligned pair - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (last_x, seq_wobble.asChar(last_x), last_y,
                             seq_cds.asChar(last_y), xr,
                             seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # this value is overwritten by the re-scoring right below
            s = 0

        # (re-)score the pair at the current positions; after a
        # re-alignment x, y and xr refer to the last pair that was copied
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Write a report for one site-selection result to ``options.stdout``.

    The report depends on ``options.method``:

    summary-slr
        one tab-separated line with the model parameters, site counts and
        the number of positive/negative sites at the four thresholds
        ``95%``, ``99%``, ``95% corrected`` and ``99% corrected``.
    summary-filtered
        one tab-separated line with site counts, ignoring all sites whose
        first codon column is not fully populated in ``mali``.
    positive-/negative-/neutral-site-table
        one line per sequence listing all selected sites.
    positive-/negative-/neutral-site-list
        one line per sequence and selected site.

    Arguments
    ---------
    result :
        result object; ``mSites`` holds one entry per codon.
    options :
        provides ``method``, ``stdout`` and the method specific settings
        (``significance_threshold``, ``use_adjusted``,
        ``truncate_sites_list``, ``context_size``).
    mali :
        codon multiple alignment; required for all methods except
        ``summary-slr``.
    prefix : string, optional
        written as first column of every output line.
    p_value : optional
        reported in the site tables; ``"na"`` if None.

    Returns
    -------
    A ``Result`` object with the counts for ``summary-filtered``, None for
    all other methods.

    Raises
    ------
    ValueError
        If the alignment width is not three times the number of sites.
        (The original raised a plain string, which is a TypeError on
        Python 3.)
    """
    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            "%i" % result.mNPositiveSites[t][0] for t in thresholds))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            "%i" % result.mNNegativeSites[t] for t in thresholds))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered",
                            "positive-site-table",
                            "negative-site-table",
                            "neutral-site-table",
                            "positive-site-list",
                            "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." % (
                    mali_width, len(result.mSites)))

        # a real list is required because the columns are indexed below
        # (``map`` returns a one-shot iterator on Python 3)
        column_data = [
            Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
            for column in mali.getColumns()]

        if options.method == "summary-filtered":

            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for x, site in enumerate(result.mSites):
                column = column_data[x * 3]

                # skip sites where not every sequence has a character
                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength,
                 result.mOmega,
                 result.mKappa,
                 result.mLogLikelihood,
                 len(result.mSites),
                 nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))

            counts = Result(nfiltered, ntotal, nsynonymous,
                            nnegative, npositive)

        else:
            # one of the site-table / site-list methods
            select_positive_sites = options.method in (
                "positive-site-table", "positive-site-list")
            select_negative_sites = options.method in (
                "negative-site-table", "negative-site-list")

            # iterate over sites and output those under xxx selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for _ in range(mali_length)]

            sites = []
            for col, site in enumerate(result.mSites):
                column = column_data[col * 3]

                if column.mNChars != mali_length:
                    continue

                keep = False
                if select_positive_sites and site.isPositive(
                        options.significance_threshold,
                        options.use_adjusted):
                    keep = True
                elif select_negative_sites and site.isNegative(
                        options.significance_threshold,
                        options.use_adjusted):
                    keep = True

                if not keep:
                    continue

                sites.append((col, site))

            # number of selected sites before truncation
            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                # (``sort(cmp_function)`` only exists on Python 2; the key
                # sort yields the same stable order)
                sites.sort(key=lambda entry: entry[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:
                xcol = col * 3

                for row in range(mali_length):
                    identifier = identifiers[row]
                    start = max(xcol - options.context_size * 3, 0)
                    end = min(xcol + 3 + options.context_size * 3,
                              mali_width)
                    segment = mali[identifier][start:end]
                    codon = mali[identifier][xcol:xcol + 3]
                    # integer division: nucleotide -> codon number
                    pos = mali.getResidueNumber(identifier, xcol) // 3

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon),
                            pos + 1,
                            xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table",
                                  "negative-site-table",
                                  "neutral-site-table"):

                if options.context_size:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value,
                             ";".join([
                                 "%s%i in %s" % (info.mAA,
                                                 info.mSequencePosition,
                                                 info.mContext)
                                 for info in chars_per_row[row]])))
                else:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value,
                             ";".join([
                                 "%s%i" % (info.mAA,
                                           info.mSequencePosition)
                                 for info in chars_per_row[row]])))

            else:
                # site-list methods: one line per sequence and site
                for row in range(mali_length):
                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    for number, chars in enumerate(chars_per_row[row], 1):
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, number, chars.mAA,
                             chars.mSequencePosition,
                             chars.mMaliPosition,
                             chars.mContext))

    options.stdout.flush()

    return counts
def main(argv=None):
    """Apply a chain of methods to every FASTA record read from stdin.

    Records are read from ``options.stdin``.  Each record can be skipped
    by the include/exclude patterns, random sampling or an id list.  The
    methods given with ``--method`` are then applied in order, the
    min/max length filters are applied to the resulting sequence and the
    record is written to ``options.stdout``.

    Methods that need additional input take it from the comma separated
    ``--parameters`` stack, consumed in this order: ``apply-map`` (map
    file), ``map-codons`` (codon map file), ``mask-soft`` (hard masked
    FASTA file), ``mask-codons``/``back-translate`` (second FASTA file)
    and finally ``build-map`` (output file for the map; stdout if empty).

    Raises
    ------
    ValueError
        For inconsistent input: sequence lengths not divisible by three,
        non-matching titles or lengths between paired files, a paired
        file running out of records, duplicate identifiers in
        ``build-map`` or a missing sample proportion.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("translate",
                 "translate-to-stop",
                 "truncate-at-stop",
                 "back-translate",
                 "mark-codons",
                 "apply-map",
                 "build-map",
                 "pseudo-codons",
                 "filter",
                 "interleaved-codons",
                 "map-codons",
                 "remove-gaps",
                 "mask-seg",
                 "mask-bias",
                 "mask-codons",
                 "mask-incomplete-codons",
                 "mask-stops",
                 "mask-soft",
                 "remove-stops",
                 "upper",
                 "lower",
                 "reverse-complement",
                 "sample",
                 "shuffle"),
        help="method to apply to sequences.")

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="parameter stack for methods that require one "
        "[default=%default].")

    parser.add_option(
        "-x", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default = %default].")

    parser.add_option(
        "--sample-proportion", dest="sample_proportion", type="float",
        help="sample proportion [default = %default].")

    parser.add_option(
        "--exclude-pattern", dest="exclude_pattern", type="string",
        help="exclude all sequences with ids matching pattern "
        "[default = %default].")

    parser.add_option(
        "--include-pattern", dest="include_pattern", type="string",
        help="include only sequences with ids matching pattern "
        "[default = %default].")

    parser.add_option(
        "--filter-method", dest="filter_methods", type="string",
        action="append",
        help="filtering methods to apply "
        "[default = %default].")

    parser.add_option(
        "-t", "--sequence-type", dest="type", type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l", "--template-identifier", dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        with open(options.parameters[0], "r") as infile:
            map_seq2nid = IOTools.ReadMap(infile)
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        with open(options.parameters[0], "r") as infile:
            map_codon2code = IOTools.ReadMap(infile)
        del options.parameters[0]

    if "mask-soft" in options.methods:
        filename = options.parameters[0]
        del options.parameters[0]
        # stays open while records are processed
        hard_masked_iterator = FastaIterator.FastaIterator(
            open(filename, "r"))

    if "mask-codons" in options.methods or \
            "back-translate" in options.methods:
        # open a second stream to read sequences from
        filename = options.parameters[0]
        del options.parameters[0]
        other_iterator = FastaIterator.FastaIterator(open(filename, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for filter_method in options.filter_methods:
        if filter_method.startswith("min-length"):
            filter_min_sequence_length = int(filter_method.split("=")[1])
        elif filter_method.startswith("max-length"):
            filter_max_sequence_length = int(filter_method.split("=")[1])
        elif filter_method.startswith("id-file"):
            infile = IOTools.openFile(filter_method.split("=")[1])
            try:
                # a set gives constant time membership tests per record
                filter_id_list = set(
                    line.rstrip("\n") for line in infile)
            finally:
                infile.close()

    # translation table for reverse complementing
    complement = str.maketrans("ACGTacgt", "TGCAtgca")

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError(
                "length of sequence %s not divisible by 3" % (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion and random.random() > sample_proportion:
            continue

        if filter_id_list is not None and \
                cur_record.title not in filter_id_list:
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                # (re.sub takes pattern, replacement, string - the original
                # had the last two swapped, so the check never fired)
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        # skip this method, keep the sequence as it is
                        continue
                    else:
                        raise ValueError(msg)

                seq = []
                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" % (
                            cur_record.title, other_record.title))

                other_sequence = re.sub(
                    "[ %s]" % options.gap_chars, "", other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(complement)[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                # collect characters until three non-gap characters have
                # been seen, then decide for the whole stretch
                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for residue in c:
                                if residue in options.gap_chars:
                                    new_sequence.append(residue)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # trailing incomplete codon
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and
                # length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the
                # same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" %
                        (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so
                # replace N with lowercase sequence from unmasked version.
                # Identical sequences are left untouched (joining the empty
                # list unconditionally would blank the sequence).
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence.append(x.lower())
                        else:
                            new_sequence.append(x.upper())
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                sequence = "".join(
                    s for s in sequence if s not in options.gap_chars)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                identifier = re.match(
                    r"^(\S+)", cur_record.title).groups()[0]
                if identifier in map_seq2nid:
                    rest = cur_record.title[len(identifier):]
                    cur_record.title = map_seq2nid[identifier] + rest

            elif method == "build-map":
                # build a map of identifiers
                identifier = re.match(
                    r"^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if identifier in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        identifier)
                map_seq2nid[identifier] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([c for c in seq[x:x + 3] if c in mask_chars])
                    # partially masked codon: mask it completely
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    other_record = None

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" %
                        (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i" %
                        (cur_record.title, len(other_sequence) * 3,
                         len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

            # keep the length in sync for the next method and the filters
            l = len(sequence)

        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            # the original counted the record as skipped but still wrote it
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:

        p = options.parameters[0]
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()