def generatePSP(positives, negatives, outfile):
    '''generate a discriminative PSP file from the positives and
    negatives that can be used to do discriminative MEME.
    '''

    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        E.warn("%s: input files do not have sufficient sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
    -neg %(negatives)s
    %(psp_options)s
    > %(outfile)s
    '''
    P.run(statement)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py
                   -s cpg -s length
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
def maskSequences(self, sequences):
    '''mask a collection of sequences.'''

    outfile, infile = tempfile.mkstemp()

    for x, s in enumerate(sequences):
        os.write(outfile, (">%i\n%s\n" % (x, s)).encode())

    os.close(outfile)

    statement = self.mCommand % locals()

    E.debug("statement: %s" % statement)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise RuntimeError(
            "Error in running %s \n%s\nTemporary directory" %
            (statement, err))

    result = [
        x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]

    os.remove(infile)

    return result
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",
                f.title.replace(" ", "_").replace("/", "_")
                + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write("%s=%s\n" %
                               (key.replace("constraints_", "").upper(),
                                value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub(r"\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
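# Illustrative usage sketch (not part of the original pipelines): a
# self-contained re-statement of the CpG scan above, showing the half-open
# (contig, start, end, score) tuples it emits. The toy title and sequence
# are made up.
def find_cpg_segments(title, sequence):
    segments = []
    last = None
    for pos, this in enumerate(sequence.upper()):
        if last == "C" and this == "G":
            # a CpG at positions (pos - 1, pos) gives a half-open interval
            segments.append((title, pos - 1, pos + 1, 1.0))
        last = this
    return segments

# CpGs start at offsets 1 and 5
assert find_cpg_segments("toy", "ACGTTCGA") == [
    ("toy", 1, 3, 1.0), ("toy", 5, 7, 1.0)]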
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = next(it)
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:
            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
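# A small self-contained check (illustrative only, not from the original
# code) of the minus-strand coordinate mapping used in countMotifs() above:
# a match at [s, e) on the reverse complement corresponds to
# [L - e, L - s) on the forward strand. The explicit revcomp() helper here
# is a stand-in for the library call.
import re


def revcomp(seq):
    comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(comp[c] for c in reversed(seq))


seq = "TTGCATAA"
L = len(seq)
for m in re.finditer("ATGC", revcomp(seq)):
    start, end = L - m.end(), L - m.start()
    # mapping back recovers the motif on the minus strand
    assert revcomp(seq[start:end]) == "ATGC"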
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.items():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts.values()) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.items():
            if species_counts[species] == max(species_counts.values()):
                top_dog = species
                c += 1
                break

        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''
    P.run()
def maskSequences(self, sequences):
    '''mask a collection of sequences.'''

    with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
        for x, s in enumerate(sequences):
            outf.write(">%i\n%s\n" % (x, s))
        infile = outf.name

    statement = self.mCommand % locals()

    E.debug("statement: %s" % statement)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise RuntimeError(
            "Error in running %s \n%s\nTemporary directory" %
            (statement, err))

    result = [
        x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))
    ]

    os.remove(infile)

    return result
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed as complete
    based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        if fasta.sequence.startswith(start):
            nstart += 1
        if fasta.sequence[-3:] in stop:
            nstop += 1
        if fasta.sequence.startswith(start) and fasta.sequence[-3:] in stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(map(str, [
        ntotal,
        float(nstart) / ntotal,
        float(nstop) / ntotal,
        float(nstart_nstop) / ntotal])) + "\n")
    outf.close()
def removeContaminants(infiles, outfile):
    '''remove adaptor contamination from fastq files.

    This method uses cutadapt.
    '''

    infile, contaminant_file = infiles

    adaptors = []
    for entry in FastaIterator.FastaIterator(
            IOTools.openFile(contaminant_file)):
        adaptors.append("-a %s" % entry.sequence)
    adaptors = " ".join(adaptors)

    to_cluster = True
    statement = '''
    cutadapt
    %(adaptors)s
    --overlap=%(contamination_min_overlap_length)i
    --format=fastq
    %(contamination_options)s
    <( zcat < %(infile)s )
    2> %(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, \
            "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def createOTUs(infiles, outfile):
    '''Make OTUs from the template sequences of all samples that
    passed filtering'''

    fasta_files = []
    for infile in infiles:
        fasta_file = P.snip(infile, '_true_snps.tsv', strip_path=True) \
            + '_template.fasta'
        fasta_file = os.path.join('05_mapped_fastas.dir', fasta_file)
        assert os.path.exists(fasta_file)
        fasta_files.append(fasta_file)

    tmpf = P.getTempFilename('.')
    with IOTools.openFile(tmpf, 'w') as outf:
        for fasta_file in fasta_files:
            for fasta in FI.FastaIterator(IOTools.openFile(fasta_file)):
                outf.write('>' + fasta.title + ';size=1\n' +
                           fasta.sequence + '\n')

    out_up = P.snip(outfile, '.fasta') + '_up.txt'
    statement = ("usearch"
                 " -cluster_otus %(tmpf)s"
                 " -otus %(outfile)s"
                 " -uparseout %(out_up)s"
                 " -otu_radius_pct 1.0"
                 " -uparse_break -999"
                 " -relabel OTU_"
                 " &> %(outfile)s.log")
    to_cluster = False
    P.run()
def segmentUngapped(infile, gap_char, min_gap_size=0):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)
        size = len(cur_record.sequence)

        last_end = 0
        for start, end in gapped_regions(cur_record.sequence, gap_char):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                yield (contig, last_end, start, 0)
            last_end = end

        if last_end < size:
            yield (contig, last_end, size, 0)
def listAdaptors(infile):
    adaptors = []
    for entry in FastaIterator.FastaIterator(IOTools.openFile(infile)):
        adaptors.append(
            "%s %s" % (PARAMS["contamination_trim_type"], entry.sequence))
    adaptors = " ".join(adaptors)

    return adaptors
def iterate_double_fasta(fn1, fn2):
    iterator = FastaIterator.iterate_together(fn1, fn2)
    for seq1, seq2 in iterator:
        yield AlignedPairs.UnalignedPair(
            token1=seq1.title,
            sequence1=seq1.sequence,
            token2=seq2.title,
            sequence2=seq2.sequence)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are "
                      "equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length",
                      type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N=50, filter_length=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    f = options.filter_length

    # iterate over the contigs/scaffolds and return stats
    number_of_contigs = 0
    N = options.N
    contig_lengths = []

    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(record.sequence)
        if contig_length >= f:
            number_of_contigs += 1
            contig_lengths.append(contig_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    index = 0
    cum_length = 0
    total_length = sum(contig_lengths)
    lengths = sorted(contig_lengths, reverse=True)
    for index, length in enumerate(lengths):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break

    # output the results
    options.stdout.write(
        "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
        number_of_contigs, total_length, lengths[index],
        str(median_length), str(mean_length), str(max_length)))

    # write footer and output benchmark information.
    E.Stop()
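# A self-contained restatement (illustrative, not part of the original
# script) of the Nx statistic computed above: sort lengths descending,
# accumulate until the running sum reaches N% of the total, and report the
# contig length at that point.
def compute_nx(lengths, N=50):
    lengths = sorted(lengths, reverse=True)
    threshold = sum(lengths) * (float(N) / 100)
    cum = 0
    for length in lengths:
        cum += length
        if cum >= threshold:
            return length

# 80 + 70 = 150 >= 145, half of the 290 total, so the N50 is 70
assert compute_nx([80, 70, 50, 40, 30, 20]) == 70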
def iterate_single_fasta(fn1):
    iterator = FastaIterator.FastaIterator(fn1)
    while 1:
        try:
            seq1, seq2 = next(iterator), next(iterator)
        except StopIteration:
            break
        yield AlignedPairs.UnalignedPair(
            token1=seq1.title,
            sequence1=seq1.sequence,
            token2=seq2.title,
            sequence2=seq2.sequence)
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length:
            continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
def extract16SSequences(infile, outfile):
    '''Extract any RNA fasta entry with the term 16S in the title'''

    outf = IOTools.openFile(outfile, 'w')

    for fasta in FastaIterator.FastaIterator(IOTools.openFile(infile)):
        if re.search('16S', fasta.title, re.IGNORECASE):
            outf.write('\n'.join(['>' + fasta.title, fasta.sequence]) + '\n')

    outf.close()
def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set of contigs / scaffolds
    """
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    index = 0
    cum_length = 0
    total_length = sum(scaffold_lengths)
    lengths = sorted(scaffold_lengths, reverse=True)
    for index, length in enumerate(lengths):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break

    # output the results
    outf = open(stats_file, "w")
    outf.write(
        "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
        number_of_scaffolds,
        total_length,
        lengths[index],
        str(median_length),
        str(mean_length),
        str(max_length),
    ))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\t"
        "query_starts\tsbjct_starts\tblock_sizes\n")

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()

        alignlib_lite.py_AlignmentFormatExplicit(
            0, sequence, 0, "X" * l).copy(map_sequence2mali)

        options.stdout.write("\t".join((
            cur_record.title,
            "ref",
            str(alignlib_lite.py_AlignmentFormatBlocks(
                map_sequence2mali)))) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    (options, args) = E.Start(parser)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    iterators = []
    for a in args:
        iterators.append(
            FastaIterator.FastaIterator(IOTools.openFile(a, "r")))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:

        sequences = []
        ids = []

        for iterator in iterators:
            try:
                cur_record = next(iterator)
            except StopIteration:
                break

            sequences.append(re.sub(" ", "", cur_record.sequence))
            ids.append(cur_record.title)

        if not sequences:
            break
        ninput += 1

        if len(sequences) != len(iterators):
            raise ValueError("unequal number of sequences in files.")

        noutput += 1

        options.stdout.write(">%s\n%s\n" % (ids[0],
                                            "".join(sequences)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.Stop()
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
def runDREME(infile, outfile, neg_file="", options=""):
    '''Run DREME on fasta file.

    If a neg_file is passed then DREME will use this as the negative
    set, otherwise the default is to shuffle the input.
    '''

    nseqs_pos = int(FastaIterator.count(infile))
    if nseqs_pos < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        nseqs_neg = int(FastaIterator.count(neg_file))
        if nseqs_neg < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped"
                % outfile)
            P.touch(outfile)
            return
        else:
            neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png -oc %(tmpdir)s
          %(dreme_options)s
          %(options)s
    > %(logfile)s
    '''
    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def segmentGaps(infile, gap_char):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub(r"\s.*", "", cur_record.title)

        for start, end in gapped_regions(cur_record.sequence, gap_char):
            yield (contig, start, end, 0)
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_sequences.py 2865 2010-03-03 10:18:28Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")

    parser.set_defaults(filename_map=None,)

    (options, args) = E.Start(parser, add_mysql_options=True)

    iterator = FastaIterator.FastaIterator(options.stdin)

    print("id\t" + SequenceProperties().GetHeader())

    while 1:
        cur_record = next(iterator, None)

        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)

        if len(sequence) % 3:
            raise ValueError(
                "sequence %s is not a multiple of 3: length=%i!" %
                (cur_record.title, len(sequence)))

        s = SequenceProperties()
        s.Load(sequence)

        print(cur_record.title + "\t" + str(s))

    E.Stop()
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.openFile(fasta)

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))

    outf.close()
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed-width windows."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    # windows are forced to be non-overlapping
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    for cur_record in iterator:
        ninput += 1
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
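# An illustrative, self-contained rework of the windowed-GC logic above
# (assumptions: non-overlapping windows, anything outside ACGT counts as
# gap), checked on a toy sequence.
def gc_windows(title, seq, window_size):
    out = []
    for x in range(0, len(seq), window_size):
        s = seq[x:x + window_size].upper()
        gc = sum(1 for c in s if c in "GC")
        at = sum(1 for c in s if c in "AT")
        # skip windows that are mostly gaps
        if window_size - (gc + at) > window_size // 2:
            continue
        out.append((title, x, x + window_size, float(gc) / (gc + at)))
    return out

assert gc_windows("toy", "GGGGAAAACCNNNN", 4) == [
    ("toy", 0, 4, 1.0), ("toy", 4, 8, 0.0), ("toy", 8, 12, 1.0)]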
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--profilematrix", dest="matrixfile",
                      type="string",
                      help="name of profile file you want to convert")

    parser.add_option("-f", "--input-fasta", dest="fastafile",
                      type="string",
                      help="fasta file to convert to fastq")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # outf = IOTools.openFile("my_output", "w")

    for line in IOTools.openFile(options.matrixfile):
        line = line.strip()
        fields = line.split()
        total = sum([float(col) for col in fields[1:]])
        if total == 0:
            continue
        for i, col in enumerate(fields):
            if i == 0:
                continue
            fields[i] = float(col) / total
        options.stdout.write("\t".join(map(str, fields)) + "\n")

    for fasta_read in FastaIterator.iterate(
            IOTools.openFile(options.fastafile)):
        read_sequence = fasta_read.sequence
        read_name = fasta_read.title
        quals = '.' * len(read_sequence)

        new_fastq = Fastq.Record(identifier=read_name, seq=read_sequence,
                                 quals=quals)
        new_fastq.fromPhred([30] * len(read_sequence),
                            format='illumina-1.8')
        options.stdout.write(str(new_fastq) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(record.sequence)
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" %
                       (record.title.replace(" ", "_"), scaffold_length))
    outf.close()
def filter16SGenes(infiles, outfile):
    '''Drop anything with gaps and drop anything with sequence length
    below 1400bp'''

    fasta_file, stat_file = infiles

    # fetch a list of those entries to be filtered
    to_drop = set()
    df = pd.read_table(stat_file, index_col=0)
    to_drop.update(df[df['ngap_regions'] > 0].index.tolist())
    to_drop.update(df[df['length'] < 1400].index.tolist())

    # drop from fasta file
    with IOTools.openFile(outfile, 'w') as outf:
        for fasta in FastaIterator.FastaIterator(
                IOTools.openFile(fasta_file)):
            if fasta.title in to_drop:
                continue
            else:
                outf.write(
                    '\n'.join(['>' + fasta.title, fasta.sequence]) + '\n')
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(motifs_max_size)s
         %(meme_options)s
    > %(outfile)s.log
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):

    checkParams()

    nseqs = FastaIterator.count(infile)

    if nreads > nseqs:
        prop = 1
    else:
        prop = float(nreads) / float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    statement = '''
    python %(scriptsdir)s/fasta2fasta.py
    -I %(infile)s
    %(logfile)s
    -m sample
    --sample-proportion=%(prop)s
    -S %(outfile)s
    '''
    P.run()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option("-i", "--input-pattern", dest="input_pattern",
                      type="string",
                      help="input pattern. Parses description line in order "
                      "to extract id.")

    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern. Gives filename for a given "
                      "sequence.")

    parser.add_option("-n", "--num-sequences", dest="num_sequences",
                      type="int",
                      help="split by number of sequences (not implemented "
                      "yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern=r"^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    (options, args) = E.Start(parser)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # cluster sizes are only available once both the fasta
    # file and the map have been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.Stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins
                    if len([x for x in differences
                            if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([x for x in differences
                              if x[0] in "NX" or x[1] in "NX"]) == l:
                        ndiff_masked += 1
                        status = "masked"

                # correct for different gap lengths
                if options.correct_shift:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()

                    a, b = 0, 0
                    keep = False

                    x = 0
                    while x < m and not (a == len(s1) and b == len(s2)):
                        try:
                            if s1[a] != s2[b]:
                                while s1[a] == "N" and s2[b] != "N":
                                    a += 1
                                while s1[a] != "N" and s2[b] == "N":
                                    b += 1

                                if s1[a] != s2[b]:
                                    break
                        except IndexError:
                            print("# index error for %s: x=%i, a=%i, b=%i, "
                                  "l1=%i, l2=%i" %
                                  (k, x, a, b, len(s1), len(s2)))
                            break

                        a += 1
                        b += 1
                        map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                    else:
                        keep = True
                        nfixed += 1
                        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                        print("fix\t%s\t%s" % (k, str(f)))

                    if not keep:
                        print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, "
        "masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--is-cds", dest="is_cds", action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(
        is_cds=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    options.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(options.stdin):

        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)

            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue

            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                options.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t),
                         )))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option(
        "-p", "--output-proportion", dest="proportion", action="store_true",
        help="output proportions - overrides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + \
            [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence
    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start()
                      for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = list(result.keys())
    rows = set()
    for kmer_counts in result.values():
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in rows:
        if options.proportion:
            options.stdout.write("\t".join(
                [row] + [str(float(result[header][tuple(row)]) /
                             totals[header]) for header in headers]) + "\n")
        else:
            options.stdout.write(
                "\t".join([row] + [str(result[header][tuple(row)])
                                   for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.Stop()
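# A compact alternative (illustrative only) to the permutation-based
# enumeration above: itertools.product yields each kmer exactly once,
# without the duplicate draws that permutations of a repeated-letter pool
# generate. Matches are non-overlapping, as with re.finditer above.
import itertools
import re


def count_kmers(sequence, k):
    counts = {}
    for kmer in itertools.product("ACGT", repeat=k):
        kmer = "".join(kmer)
        counts[kmer] = sum(1 for _ in re.finditer(kmer, sequence))
    return counts


counts = count_kmers("ACGTACGT", 2)
assert counts["AC"] == 2 and counts["CG"] == 2 and counts["TA"] == 1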
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--species-map", dest="species_map",
                      type="string",
                      help="text file specifying the mapping between contig "
                      "and genome")

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="specify directory where genome / genomes are "
                      "stored")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    c_contigs = 0
    contigs_lengths = {}
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1

        # titles of fasta records must be single strings with no special
        # characters
        contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    for line in open(options.species_map).readlines():
        data = line[:-1].split("\t")
        contig, species = data[0], data[1]
        species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.items():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
        else:
            c_contigs_output += 1
            genome = species_map[contig]
            genome_length = len(genomes_sequences[genome])

            # get the start position from which to sample
            start = random.randint(1, genome_length)
            end = start + length - 1
            if end > genome_length:
                E.warn("end of sampled contig extends beyond length of genome")

            sampled_seq = genomes_sequences[genome][start:end]
            options.stdout.write(
                ">%s_random\n%s\n" %
                (contig + "_%s" % species_map[contig], sampled_seq))

    E.info("written %i contigs" % c_contigs_output)

    # write footer and output benchmark information.
    E.Stop()
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to
    choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta
           --use-strand
           --genome=%(genome_dir)s/%(genome)s
           --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s %(match_matrix)s %(outfile)s.fasta %(outfile)s.match %(match_profile)s -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            r"(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    r"Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue

            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match(r"(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity), sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()

    for transcript_id, matches in _grouper(
            IOTools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(
                str,
                (contig, genome_start, genome_end, transcript_id, strand,
                 match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(
        fasta=None,
        kmer=10,
        subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join((
        "id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate transcripts, shred and identify unique kmers
    E.info("shredding fasta to identify unique kmers")
    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("1st shred complete for %i entries" % total_entries)
        if options.subset and total_entries >= options.subset:
            break

        k.shred(entry.sequence.upper(), options.kmer)
        total_entries += 1

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate transcripts, shred and count unique kmers
    E.info("re-shredding fasta to count unique kmers")
    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("2nd shred complete for %i entries" % total_entries)
        transcript_id = entry.title.split()[0]
        if options.subset and total_entries >= options.subset:
            break

        total_entries += 1

        unique, non_unique = k.countUniqueKmers(
            entry.sequence.upper(), options.kmer)

        fraction = np.divide(float(unique), (unique + non_unique))

        options.stdout.write("%s\n" % "\t".join(
            map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))

    E.info("written kmer counts for %i contigs" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()
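# A minimal stand-in for KmerCounter (which is not shown in this
# collection; the class and method names below mirror the calls above but
# the implementation is an assumption) illustrating the two-pass scheme:
# first shred every sequence to count global kmer occurrences, then score
# each sequence by how many of its kmers occur exactly once overall.
import collections


class ToyKmerCounter(object):

    def __init__(self):
        self.kmer2count = collections.Counter()

    def shred(self, sequence, k):
        for x in range(len(sequence) - k + 1):
            self.kmer2count[sequence[x:x + k]] += 1

    def countUniqueKmers(self, sequence, k):
        unique, non_unique = 0, 0
        for x in range(len(sequence) - k + 1):
            if self.kmer2count[sequence[x:x + k]] == 1:
                unique += 1
            else:
                non_unique += 1
        return unique, non_unique


counter = ToyKmerCounter()
for seq in ("ACGTT", "CGTTA"):
    counter.shred(seq, 3)
# "ACG" occurs once overall; "CGT" and "GTT" occur in both sequences
assert counter.countUniqueKmers("ACGTT", 3) == (1, 2)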
    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()
        mali.readFromFile(open(options.parameters[0], "r"),
                          format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib.makeAlignatorFullDP(options.gop, options.gep)
        else:
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--output-quality-format", dest="q_format", type="int",
        help="sequence quality format, e.g 33 = +33/Sanger "
        "[default=%default].")

    parser.add_option(
        "--output-paired-end", dest="paired", action="store_true",
        help="generate paired end reads [default = %default].")

    parser.add_option(
        "--insert-length-mean", dest="insert_mean", type="float",
        help="mean insert length [default = %default].")

    parser.add_option(
        "--insert-length-sd", dest="insert_sd", type="float",
        help="insert length standard deviation [default = %default].")

    parser.add_option(
        "--counts-method", dest="counts_method", type="choice",
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or "
        "copies per entry [default = %default].")

    parser.add_option(
        "--counts-min", dest="counts_min", type="float",
        help="minimum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option(
        "--counts-max", dest="counts_max", type="float",
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option(
        "--output-read-length", dest="read_length", type="int",
        help="read length [default = %default].")

    parser.add_option(
        "--sequence-error-phred", dest="phred", type="int",
        help="phred quality score [default = %default].")

    parser.add_option(
        "--output-counts", dest="output_counts", type="string",
        help="name for counts outfile [default=%default].")

    parser.add_option(
        "--output-fastq2", dest="fastq2_out", type="string",
        help="filename for second fastq outfile [default=%default].")

    parser.add_option(
        "--premrna-fraction", dest="premrna_fraction", type="float",
        help="the fraction of reads to simulate from pre-mRNA "
        "[default=%default].")

    parser.add_option(
        "--infile-premrna-fasta", dest="premrna_fasta", type="string",
        help="filename for pre-mRNA fasta [default=%default].")

    parser.set_defaults(
        q_format=33,
        paired=False,
        insert_mean=0,
        insert_sd=1,
        counts_method="reads",
        counts_min=1,
        counts_max=1,
        read_length=50,
        fastq2_out=None,
        output_counts=None,
        phred=30,
        premrna_fraction=0,
        premrna_fasta=None
    )

    (options, args) = E.Start(parser)

    if options.paired:
        assert options.fastq2_out, ("must specify a second fastq outfile for "
                                    "paired end (--output-fastq2)")
        outf2 = IOTools.openFile(options.fastq2_out, "w")

    if options.premrna_fraction:
        assert options.premrna_fasta, ("must specify the location of the "
                                       "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(options.q_format + options.phred)
    qual = "".join([sequence_quality] * options.read_length)

    if options.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            options.stdin, IOTools.openFile(options.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(options.stdin)

    # set a cut off of twice the read/pair length for short entries
    if options.paired:
        minimum_entry_length = (
            2 * ((options.read_length * 2) + options.insert_mean))
    else:
        minimum_entry_length = 2 * options.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if options.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(f_entry[1]), (
                "entry ids do not match: %s != %s" % (
                    f_entry[0].title, f_entry[1].title))

            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i"
                   % (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if options.paired:
            fragment_length = (
                (2 * options.read_length) + options.insert_mean)
        else:
            fragment_length = options.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if options.counts_method == "reads":
            n_reads = random.randint(int(options.counts_min),
                                     int(options.counts_max))

            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                n_reads_pre = int(round(n_reads * options.premrna_fraction))

        elif options.counts_method == "copies":
            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (options.counts_min +
                        (rand * (options.counts_max - options.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * options.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry,
                                        0))

                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=options.read_length,
                                error_rate=options.phred,
                                paired=options.paired,
                                insert_mean=options.insert_mean,
                                insert_sd=options.insert_sd)

            if options.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if options.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=options.read_length,
                                    error_rate=options.phred,
                                    paired=options.paired,
                                    insert_mean=options.insert_mean,
                                    insert_sd=options.insert_sd)

                if options.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    options.stdout.write(
                        "\n".join((h, read, "+", qual)) + "\n")

    if options.paired:
        outf2.close()

    with IOTools.openFile(options.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write(
                "%s\n" % "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped"
           % (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" % (
               sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
               sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.Stop()
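# A small self-contained check (illustrative, not from the original script)
# of the TPM formula used when writing the counts table above: each entry's
# copy number divided by the total copies, scaled to one million.
def to_tpm(copies):
    total = float(sum(copies.values()))
    return {k: 1000000 * (v / total) for k, v in copies.items()}

assert to_tpm({"tx1": 1.0, "tx2": 3.0}) == {"tx1": 250000.0, "tx2": 750000.0}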
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling "
                      "[default=%default].")

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold",
                      type="float",
                      help="qvalue threshold [default=%default].")

    parser.add_option("--without-combine", dest="combine",
                      action="store_false",
                      help="do not combine overlapping motifs "
                      "[default=%default].")

    parser.add_option("-f", "--fdr-control", dest="fdr_control",
                      type="choice",
                      choices=("per-sequence", "all", "xall"),
                      help="method for false discovery rate control "
                      "[default=%default].")

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="motif to scan for [default=%default].")

    parser.add_option("-a", "--arrangements", dest="arrangements",
                      type="string",
                      help="',' separated list of repeat arrangements "
                      "[default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust", "repeatmasker"),
                      help="mask sequences before scanning "
                      "[default=%default]")

    parser.add_option("--output-stats", dest="output_stats",
                      action="store_true",
                      help="output stats [default=%default].")

    parser.add_option("--add-sequence", dest="add_sequence",
                      action="store_true",
                      help="add sequence information [default=%default].")

    parser.set_defaults(
        iterations=100,
        qvalue_threshold=0.05,
        motif="rxrvdr",
        fdr_control="all",
        combine=True,
        arrangements=None,
        mask=None,
        output_stats=False,
        add_sequence=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = \
            ["DR%s" % x for x in range(0, 15)] + \
            ["ER%s" % x for x in range(0, 15)]
    else:
        options.arrangements = options.arrangements.split(",")

    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == 'nr':
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        map_id2title = dict(
            enumerate([re.sub("\s.*", "", x.title) for x in seqs]))
        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")

            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin, hist, nobs in zip(matcher.bin_edges,
                                       matcher.hist,
                                       matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run(seq.sequence,
                             options.arrangements,
                             qvalue_threshold=None)
            for m in mm:
                matches.append(m._replace(sequence=seq.title))

        # estimate qvalues for all matches across all sequences
        pvalues = [x.pvalue for x in matches]
        fdr = Stats.doFDR(pvalues)
        qvalues = fdr.mQValues

        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold:
                continue
            results.append(m._replace(qvalue=qvalue))

        if options.combine:
            results = Nubiscan.combineMotifs(results)

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")
            noutput += 1

    elif options.fdr_control == "per-sequence":

        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)
            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
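# In the ``xall`` branch above, p-values are pooled across all sequences
# and Stats.doFDR converts them to q-values. As an illustration of that
# step, here is a minimal Benjamini-Hochberg sketch; the actual
# Stats.doFDR implementation may differ in detail.
def qvaluesBH(pvalues):
    '''return Benjamini-Hochberg q-values for a list of p-values.'''
    n = len(pvalues)
    # sort p-value indices by p-value, smallest first
    order = sorted(range(n), key=lambda i: pvalues[i])
    qvalues = [0.0] * n
    minq = 1.0
    # walk from the largest p-value down, enforcing monotonicity:
    # q_(k) = min over j >= k of p_(j) * n / j
    for rank in range(n, 0, -1):
        i = order[rank - 1]
        minq = min(minq, pvalues[i] * n / rank)
        qvalues[i] = minq
    return qvalues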
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("transcript", "gene"),
                      help="count unique kmers per transcript or gene")

    parser.add_option("--genemap", dest="genemap", type="str",
                      help="file mapping transcripts to genes")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(fasta=None,
                        method="transcript",
                        genemap=None,
                        kmer=10,
                        subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join((
        "id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # first pass: iterate fasta entries, shred and identify kmers
    if options.method == "gene":
        E.info("shredding genes to identify unique kmers")

        assert options.genemap, (
            "to perform a gene-level unique kmer count, "
            "you must supply a transcript2gene map (--genemap)")

        t2g = {}
        with IOTools.openFile(options.genemap, "r") as inf:
            for line in inf:
                transcript, gene = line.strip().split("\t")
                t2g[transcript] = gene

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:
            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if not current_gene:
                    current_gene = gene_id
                    continue

                # check this is the first time we've dealt with this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order,"
                    " the same gene is observed in non-consecutive"
                    " positions!")
                genes.add(current_gene)

                k.shred(sequences, options.kmer)

                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes"
                           % total_entries)
                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id
            else:
                sequences.append(entry.sequence.upper())

        # catch the last gene
        if not options.subset or (options.subset and
                                  total_entries < options.subset):
            k.shred(sequences, options.kmer)

        E.info("1st shred complete for %i genes" % total_entries)

    elif options.method == "transcript":
        E.info("shredding transcripts to identify unique kmers")

        for entry in Iterator:
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i transcripts"
                       % total_entries)
            if options.subset and total_entries >= options.subset:
                break

            k.shred([entry.sequence.upper()], options.kmer)
            total_entries += 1

        E.info("1st shred complete for %i transcripts" % total_entries)

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # second pass: iterate fasta entries, shred and count unique kmers
    if options.method == "gene":
        E.info("re-shredding fasta to count gene unique kmers")

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:
            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if not current_gene:
                    current_gene = gene_id
                    continue

                # check this is the first time we've dealt with this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order,"
                    " the same gene is observed in non-consecutive"
                    " positions!")
                genes.add(current_gene)

                unique, non_unique = k.countUniqueKmers(
                    sequences, options.kmer)
                fraction = np.divide(float(unique), (unique + non_unique))

                options.stdout.write("%s\n" % "\t".join(map(
                    str, (current_gene, unique, non_unique, fraction))))

                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes"
                           % total_entries)
                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id
            else:
                sequences.append(entry.sequence.upper())

        # catch the last gene
        if not options.subset or (options.subset and
                                  total_entries < options.subset):
            unique, non_unique = k.countUniqueKmers(sequences, options.kmer)
            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(
                str, (current_gene, unique, non_unique, fraction))))

    if options.method == "transcript":
        E.info("re-shredding fasta to count transcript unique kmers")

        for entry in Iterator:
            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i transcripts"
                       % total_entries)
            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            total_entries += 1

            unique, non_unique = k.countUniqueKmers(
                [entry.sequence.upper()], options.kmer)
            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(
                str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))

    E.info("written kmer counts for %i entries" % total_entries)

    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))
    E.Stop()
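# ``KmerCounter`` is imported from elsewhere; the sketch below illustrates
# one plausible contract for the two methods used above and is not the
# real implementation. The assumed semantics: shred() records which entry
# (transcript or gene) each kmer was first seen in, marking kmers seen in
# more than one entry as non-unique; countUniqueKmers() then reports how
# many of an entry's distinct kmers remained unique. The class name is
# hypothetical.
class KmerCounterSketch(object):

    def __init__(self):
        # maps kmer -> index of the single entry it was seen in,
        # or the sentinel "non_unique"
        self.kmer2entry = {}
        self.current_entry = 0

    def _kmers(self, sequences, kmer):
        # yield every kmer of length *kmer* across *sequences*
        for sequence in sequences:
            for x in range(len(sequence) - kmer + 1):
                yield sequence[x:x + kmer]

    def shred(self, sequences, kmer):
        '''record the kmers of one entry (one transcript or gene).'''
        for km in self._kmers(sequences, kmer):
            if km not in self.kmer2entry:
                self.kmer2entry[km] = self.current_entry
            elif self.kmer2entry[km] != self.current_entry:
                self.kmer2entry[km] = "non_unique"
        self.current_entry += 1

    def countUniqueKmers(self, sequences, kmer):
        '''return (unique, non_unique) distinct kmer counts for one entry.'''
        unique, non_unique = 0, 0
        for km in set(self._kmers(sequences, kmer)):
            if self.kmer2entry.get(km) == "non_unique":
                non_unique += 1
            else:
                unique += 1
        return unique, non_unique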
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal",
                               "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add",),
                      help="method to use to build the multiple alignment")

    parser.add_option("-p", "--parameters", dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment method [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()
        mali.readFromFile(
            open(options.parameters[0], "r"), format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        # the iterator is a generator: loop until it is exhausted
        for cur_record in iterator:

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            # add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(
                cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            id = cur_record.title
            mali.mIdentifiers.append(id)
            mali.mMali[id] = Mali.AlignedString(
                id, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute the aligned strings of the original sequences
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

    mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
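# Example invocation of the ``add`` method above. This is illustrative:
# the script name (from the $Id$ tag) and the file names are assumptions.
# The first element of --parameters names the existing alignment; the new
# sequences are read from stdin, aligned one by one against a profile of
# the growing alignment, and appended to it:
#
#   cat new_sequences.fasta |
#   python sequences2mali.py --method=add --parameters=existing.fasta \
#       --input-format=fasta --output-format=fasta > combined.fasta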