def main(argv=None):
    """Script main: annotate PSL/PSLX matches read from stdin with properties.

    Parses command line options in sys.argv, unless *argv* is given.

    One output row is written to ``options.stdout`` per accepted match. The
    row starts with the match itself (or only its query id when
    ``--without-match`` is set), followed by the tab-separated output of
    every counter selected with ``-m/--method``.

    Methods ``counts``, ``query-counts``, ``sbjct-counts`` and ``baseml``
    work on the aligned query/sbjct sequences; if any of them is selected
    the input is read with ``Blat.iterator_pslx``. The ``match`` method is
    called with the match object itself.

    Matches whose concatenated query and sbjct sequences differ in length
    are skipped: a warning goes to ``options.stdlog`` (loglevel >= 1) and
    nothing is written to stdout for them.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option(
        "--with-match", dest="with_match", action="store_true",
        help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match", "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # Hand the (possibly caller-supplied) argument vector to the option
    # parser; previously *argv* was normalised above but never used.
    (options, args) = E.Start(parser, argv=argv)

    # counters: called as counter(query_sequence, sbjct_sequence)
    # counters_plain: called as counter(match)
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # Sequence based counters need the sequence columns, hence the
    # pslx iterator; otherwise the plain iterator is used.
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # Patterns are loop-invariant, compile them once.
    rx_non_alpha = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Assemble the whole row before writing anything, so that a match
        # that is skipped below does not leave a partial, newline-less
        # line on stdout. The leading field is rendered before the
        # sequences are masked, as in the original output.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:
            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [rx_non_alpha.sub("N", x) for x in match.mQuerySequence]
            sseq = [rx_non_alpha.sub("N", x) for x in match.mSbjctSequence]

            if options.mask_lowercase:
                qseq = [rx_lowercase.sub("N", x) for x in qseq]
                sseq = [rx_lowercase.sub("N", x) for x in sseq]

            # The masked sequences are stored back on the match, so the
            # warning below and the plain counters see the masked version.
            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(qseq).upper()
            sseq = "".join(sseq).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" %
                        str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.append("\t".join([str(counter) for counter in counters]))

        if counters_plain:
            for counter in counters_plain:
                counter(match)

            fields.append(
                "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\t".join(fields) + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string", help="OUTPUT filename with histogram information on aggregate coverages [%default].") parser.add_option("--read-length-mean", dest="read_length_mean", type="float", help="simulation parameter [default=%default].") parser.add_option("--read-length-std", dest="read_length_stddev", type="float", help="simulation parameter [default=%default].") parser.add_option("--coverage-mean", dest="coverage_mean", type="float", help="simulation parameter [default=%default].") parser.add_option("--coverage-std", dest="coverage_stddev", type="float", help="simulation parameter [default=%default].") parser.add_option("--ds-mean", dest="ds_mean", type="float", help="simulation parameter [default=%default].") parser.add_option("--ds-std", dest="ds_stddev", type="float", help="simulation parameter [default=%default].") parser.add_option("--error-mean", dest="error_mean", type="float", help="simulation parameter [default=%default].") parser.add_option("--error-std", dest="error_stddev", type="float", help="simulation parameter [default=%default].") parser.add_option("--min-read-length", dest="min_read_length", type="int", help="minimum read length [default=%default].") parser.add_option("--sample-size", dest="sample_size", type="int", help="randomly sample from selected transcripts [default=%default].") parser.add_option("--test", dest="test", type="int", help="test with # first entries [default=%default].") parser.add_option("--mode", dest="mode", type="choice", choices=("genes", "transcripts"), help="use genes or transcripts [default=%default].") parser.set_defaults( genome_file=None, read_length_mean=200.0, 
read_length_stddev=20.0, coverage_mean=2.0, coverage_stddev=1.0, ds_mean=None, ds_stddev=None, error_mean=None, error_stddev=None, min_read_length=50, test=None, mode="transcripts", output_filename_pattern=None, output_format_id="%010i", sample_size=0, ) (options, args) = E.Start(parser, argv) assert options.genome_file, "please supply an indexed genome." if options.output_filename_pattern: outfile_stats = open(options.output_filename_pattern % "stats", "w") outfile_stats.write( "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n") outfile_map = open(options.output_filename_pattern % "map", "w") outfile_map.write("id\ttranscript\n") else: outfile_stats = None outfile_map = None genome = IndexedFasta.IndexedFasta(options.genome_file) ninput, noutput, nskipped = 0, 0, 0 total_counts, total_read_lengths, total_len = [], [], 0 total_pids = [] total_error_pids = [] if options.mode == "transcripts": iterator = GTF.transcript_iterator( GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon")) getId = lambda x: x.transcript_id elif options.mode == "genes": iterator = GTF.flat_gene_iterator( GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon")) getId = lambda x: x.gene_id if options.sample_size: iterator = Iterators.sample(iterator) if options.ds_mean: do_mutate = True pid_calc = SequencePairProperties.SequencePairPropertiesPID() else: do_mutate = False if options.error_mean: do_error = True pid_calc = SequencePairProperties.SequencePairPropertiesPID() else: do_error = False for gtfs in iterator: id = getId(gtfs[0]) try: sequence = GTF.toSequence(gtfs, genome) except KeyError, msg: if options.loglevel >= 2: options.stdlog.write("# skipping %s: %s\n" % (id, msg)) nskipped += 1 continue lsequence = len(sequence) if lsequence <= options.min_read_length * 2: if options.loglevel >= 2: options.stdlog.write( "# skipping %s - sequence is too short: %i\n" % (id, lsequence)) nskipped += 1 continue ninput += 1 if do_mutate: new_sequence = 
getMutatedSequence(sequence, options.ds_mean) pid_calc.loadPair(sequence, new_sequence) pid = pid_calc.mPID total_pids.append(pid) sequence = new_sequence else: pid = 100.0 if options.loglevel >= 2: options.stdlog.write( "# processing %s - len=%i\n" % (id, lsequence)) options.stdlog.flush() total_len += lsequence lvsequence = lsequence * \ random.gauss(options.coverage_mean, options.coverage_stddev) covered = 0 counts = numpy.zeros(lsequence) nreads = 0 error_pids, read_lengths = [], [] while covered < lvsequence: read_length = int( random.gauss(options.read_length_mean, options.read_length_stddev)) positive = random.randint(0, 1) if positive: start = random.randint(0, lsequence) end = min(lsequence, start + read_length) else: end = random.randint(0, lsequence) start = max(0, end - read_length) read_length = end - start if read_length < options.min_read_length: continue segment = sequence[start:end] if not positive: segment = Genomics.complement(segment) noutput += 1 if do_error: new_segment = getMutatedSequence(segment, options.error_mean) pid_calc.loadPair(segment, new_segment) pid = pid_calc.mPID error_pids.append(pid) segment = new_segment else: pid = 100.0 options.stdout.write( ">%s\n%s\n" % (options.output_format_id % noutput, segment)) if outfile_map: outfile_map.write( "%s\t%s\n" % (id, options.output_format_id % noutput)) for x in range(start, end): counts[x] += 1 nreads += 1 covered += read_length read_lengths.append(read_length) if options.loglevel >= 2: options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id, lsequence, nreads, numpy.mean( read_lengths), numpy.std( read_lengths), numpy.mean( counts), numpy.std(counts))) if outfile_stats: outfile_stats.write("%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" % (id, lsequence, nreads, numpy.mean( read_lengths), numpy.std( read_lengths), numpy.mean( counts), numpy.std(counts))) total_counts += list(counts) total_read_lengths += read_lengths 
total_error_pids += error_pids if options.test and ninput >= options.test: break if options.sample_size and ninput >= options.sample_size: break