def iterator_sorted(gff_iterator, sort_order="gene"):
    '''Read all of *gff_iterator* into memory and yield its entries sorted.

    *sort_order* selects the sort key:

    ``gene``
        gene_id, contig, start
    ``gene+transcript``
        gene_id, transcript_id, contig, start
    ``contig+gene``
        contig, gene_id, transcript_id, start
    ``transcript``
        transcript_id, contig, start
    ``position``
        contig, start
    ``position+gene``
        entries are grouped by ``flat_gene_iterator`` and the groups are
        ordered by the contig and start of their first entry
    ``gene+exon``
        gene_id, exon_number

    Any other value leaves the entries in input order.
    '''
    records = list(gff_iterator)

    # sort orders that are a single attribute-tuple sort
    key_functions = {
        "gene":
            lambda r: (r.gene_id, r.contig, r.start),
        "gene+transcript":
            lambda r: (r.gene_id, r.transcript_id, r.contig, r.start),
        "contig+gene":
            lambda r: (r.contig, r.gene_id, r.transcript_id, r.start),
        "transcript":
            lambda r: (r.transcript_id, r.contig, r.start),
        "position":
            lambda r: (r.contig, r.start),
        "gene+exon":
            lambda r: (r.gene_id, r.exon_number),
    }

    if sort_order == "position+gene":
        # two passes: order the entries so they can be grouped, then
        # order the groups by the location of their first entry.
        records.sort(key=lambda r: (r.gene_id, r.start))
        gene_groups = sorted(
            flat_gene_iterator(records),
            key=lambda group: (group[0].contig, group[0].start))
        records = IOTools.flatten(gene_groups)
    elif sort_order in key_functions:
        records.sort(key=key_functions[sort_order])

    for record in records:
        yield record
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (filenames) are required. With
    ``--tracks`` every file after the first is compared against each
    track reported by ``CounterTracks`` for the first file; otherwise
    every pair of files is compared with ``Counter``. One tab-separated
    row per comparison is written to ``options.stdout``.

    With ``--update``, rows found in the given file are written out
    again unchanged instead of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--update", dest="filename_update", type="string",
                      help="if filename is given, previous results will be read from there and only changed sets will be computed [default=%default].")

    parser.add_option("-p", "--pattern-identifier", dest="pattern_id", type="string",
                      help="pattern to convert a filename to an id [default=%default].")

    parser.add_option("-t", "--tracks", dest="tracks", action="store_true",
                      help="compare files against all tracks in the first file [default=%default]")

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] -> tab-joined result columns of an
    # earlier run; stays empty unless --update was given.
    previous_results = {}
    if options.filename_update:
        infile = IOTools.openFile(options.filename_update, "r")
        # try/finally rather than ``with``: the object returned by
        # IOTools.openFile is not known to be a context manager.
        try:
            for line in infile:
                # skip comments and the header row
                if line.startswith("#") or line.startswith("set1"):
                    continue
                # only strip the line terminator, so that a final line
                # without a trailing newline keeps its last character.
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # store the mirrored comparison as well by swapping each
                # adjacent pair of result columns (presumably columns
                # alternate between set1 and set2 values - confirm
                # against the counter's header).
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(IOTools.flatten(rev))
        finally:
            infile.close()

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """return the first group of pattern_id in *x*, or *x* if no match."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    def writePrevious(title1, title2):
        """write the stored row for a comparison; return False if none."""
        try:
            prev = previous_results[title1][title2]
        except KeyError:
            return False
        options.stdout.write("%s\t%s\t%s\n" % (title1, title2, prev))
        return True

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if writePrevious(title1, title2):
                    nupdated += 1
                    continue

                counter.count(filename, title2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        # compare each file against every file listed before it
        for x, filename1 in enumerate(args):
            title1 = getTitle(filename1)
            for filename2 in args[:x]:
                title2 = getTitle(filename2)

                if writePrevious(title1, title2):
                    nupdated += 1
                    continue

                counter.count(filename1, filename2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (filenames) are required. With
    ``--tracks`` every file after the first is compared against each
    track reported by ``CounterTracks`` for the first file; otherwise
    every pair of files is compared with ``Counter``. One tab-separated
    row per comparison is written to ``options.stdout``.

    With ``--update``, rows found in the given file are written out
    again unchanged instead of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help="if filename is given, previous results will be read from there and only changed sets will be computed [default=%default]."
    )

    parser.add_option(
        "-p",
        "--pattern-identifier",
        dest="pattern_id",
        type="string",
        help="pattern to convert a filename to an id [default=%default].")

    parser.add_option(
        "-t",
        "--tracks",
        dest="tracks",
        action="store_true",
        help="compare files against all tracks in the first file [default=%default]"
    )

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] -> tab-joined result columns of an
    # earlier run; stays empty unless --update was given.
    previous_results = {}
    if options.filename_update:
        infile = IOTools.openFile(options.filename_update, "r")
        # try/finally rather than ``with``: the object returned by
        # IOTools.openFile is not known to be a context manager.
        try:
            for line in infile:
                # skip comments and the header row
                if line.startswith("#") or line.startswith("set1"):
                    continue
                # only strip the line terminator, so that a final line
                # without a trailing newline keeps its last character.
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # store the mirrored comparison as well by swapping each
                # adjacent pair of result columns (presumably columns
                # alternate between set1 and set2 values - confirm
                # against the counter's header).
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(IOTools.flatten(rev))
        finally:
            infile.close()

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """return the first group of pattern_id in *x*, or *x* if no match."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    def writePrevious(title1, title2):
        """write the stored row for a comparison; return False if none."""
        try:
            prev = previous_results[title1][title2]
        except KeyError:
            return False
        options.stdout.write("%s\t%s\t%s\n" % (title1, title2, prev))
        return True

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if writePrevious(title1, title2):
                    nupdated += 1
                    continue

                counter.count(filename, title2)
                options.stdout.write("%s\t%s\t%s\n" %
                                     (title1, title2, str(counter)))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        # compare each file against every file listed before it
        for x, filename1 in enumerate(args):
            title1 = getTitle(filename1)
            for filename2 in args[:x]:
                title2 = getTitle(filename2)

                if writePrevious(title1, title2):
                    nupdated += 1
                    continue

                counter.count(filename1, filename2)
                options.stdout.write("%s\t%s\t%s\n" %
                                     (title1, title2, str(counter)))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (filenames) are required. Every
    pair of files is compared with ``Counter`` (or ``CounterGenes`` if
    ``--output-only-genes`` is set) and one tab-separated row per pair is
    written to ``options.stdout``.

    With ``--update``, rows found in the given file are written out
    again unchanged instead of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    # NOTE(review): options.ignore_strand is parsed but never read in
    # this function - confirm whether the counters should receive it.
    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.add_option(
        "-u", "--update", dest="filename_update", type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename to an id "
        "[default=%default].")

    parser.add_option(
        "-g", "--output-only-genes", dest="output_only_genes",
        action="store_true",
        help="only output gene stats (includes gene lists)"
        " [default=%default].")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    # hand argv on to the parser so that a caller-supplied argument
    # list is honoured instead of silently falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        # function-call form prints the same text under python 2 and 3
        print(USAGE)
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] -> tab-joined result columns of an
    # earlier run; stays empty unless --update was given.
    previous_results = {}
    if options.filename_update:
        with open(options.filename_update, "r") as infile:
            for line in infile:
                # skip comments and the header row
                if line.startswith("#") or line.startswith("set1"):
                    continue
                # only strip the line terminator, so that a final line
                # without a trailing newline keeps its last character.
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # store the mirrored comparison as well by swapping each
                # adjacent pair of result columns (presumably columns
                # alternate between set1 and set2 values - confirm
                # against the counter's header).
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(IOTools.flatten(rev))

    if options.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """return the first group of pattern_id in *x*, or *x* if no match."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    # compare each file against every file listed before it
    for x, filename1 in enumerate(args):
        title1 = getTitle(filename1)
        for filename2 in args[:x]:
            title2 = getTitle(filename2)

            # re-emit a stored row if there is one; an empty or
            # incomplete cache simply raises KeyError here.
            try:
                prev = previous_results[title1][title2]
            except KeyError:
                pass
            else:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, prev))
                nupdated += 1
                continue

            counter.count(filename1, filename2)
            options.stdout.write(
                "%s\t%s\t%s\n" % (title1, title2, str(counter)))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.Stop()
dbh = sqlite3.connect(PARAMS['database']) return dbh @transform(INPUT_FORMATS, regex("(.*)"), r"\1") def unprocessReads(infiles, outfiles): """dummy task - no processing of reads.""" pass # if preprocess tools are specified, preprocessing is done on output that has # already been generated in the first run if PARAMS.get("preprocessors", None): if PARAMS["auto_remove"]: # check if fastqc has been run for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]): f = re.match(REGEX_TRACK, x).group(1) + ".fastqc" if not os.path.exists(f): raise ValueError( "file %s missing, " "you need to run the pipeline once before " "specifying 'auto_remove'" % f) @follows(mkdir("fasta.dir")) @transform(unprocessReads, regex(SEQUENCEFILES_REGEX), r"fasta.dir/\1.fasta") def makeAdaptorFasta(infile, outfile): '''Make a single fasta file for each sample of all contaminant adaptor sequences for removal '''
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (filenames) are required. Every
    pair of files is compared with ``Counter`` (or ``CounterGenes`` if
    ``--output-only-genes`` is set) and one tab-separated row per pair is
    written to ``options.stdout``.

    With ``--update``, rows found in the given file are written out
    again unchanged instead of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # NOTE(review): options.ignore_strand is parsed but never read in
    # this function - confirm whether the counters should receive it.
    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option("-p",
                      "--pattern-identifier",
                      dest="pattern_id",
                      type="string",
                      help="pattern to convert a filename to an id "
                      "[default=%default].")

    parser.add_option("-g",
                      "--output-only-genes",
                      dest="output_only_genes",
                      action="store_true",
                      help="only output gene stats (includes gene lists)"
                      " [default=%default].")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    # hand argv on to the parser so that a caller-supplied argument
    # list is honoured instead of silently falling back to sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        # function-call form prints the same text under python 2 and 3
        print(USAGE)
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] -> tab-joined result columns of an
    # earlier run; stays empty unless --update was given.
    previous_results = {}
    if options.filename_update:
        with open(options.filename_update, "r") as infile:
            for line in infile:
                # skip comments and the header row
                if line.startswith("#") or line.startswith("set1"):
                    continue
                # only strip the line terminator, so that a final line
                # without a trailing newline keeps its last character.
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # store the mirrored comparison as well by swapping each
                # adjacent pair of result columns (presumably columns
                # alternate between set1 and set2 values - confirm
                # against the counter's header).
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(IOTools.flatten(rev))

    if options.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """return the first group of pattern_id in *x*, or *x* if no match."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    # compare each file against every file listed before it
    for x, filename1 in enumerate(args):
        title1 = getTitle(filename1)
        for filename2 in args[:x]:
            title2 = getTitle(filename2)

            # re-emit a stored row if there is one; an empty or
            # incomplete cache simply raises KeyError here.
            try:
                prev = previous_results[title1][title2]
            except KeyError:
                pass
            else:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (title1, title2, prev))
                nupdated += 1
                continue

            counter.count(filename1, filename2)
            options.stdout.write("%s\t%s\t%s\n" %
                                 (title1, title2, str(counter)))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.Stop()
dbh = sqlite3.connect(PARAMS['database']) return dbh @transform(INPUT_FORMATS, regex("(.*)"), r"\1") def unprocessReads(infiles, outfiles): """dummy task - no processing of reads.""" # if preprocess tools are specified, preprocessing is done on output that has # already been generated in the first run if PARAMS.get("preprocessors", None): if PARAMS["auto_remove"]: # check if fastqc has been run for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]): f = re.match(REGEX_TRACK, x).group(1) + ".fastqc" if not os.path.exists(f): raise ValueError( "file %s missing, " "you need to run the pipeline once before " "specifying 'auto_remove'" % f) @follows(mkdir("fasta.dir")) @transform(unprocessReads, regex(SEQUENCEFILES_REGEX), r"fasta.dir/\1.fasta") def makeAdaptorFasta(infile, outfile): '''Make a single fasta file for each sample of all contaminant adaptor sequences for removal '''