def split_gtf_by_category(infiles, outfiles, catname):
    """Split a GTF file into one output file per category.

    Parameters
    ----------
    infiles : tuple(str, str)
        ``(catfile, gtffile)``: a tab-separated table mapping
        transcript/gene ids (first column) to a category (second
        column), and the GTF file to split.
    outfiles :
        unused; present so the signature matches the pipeline task
        decorator that calls this function — TODO confirm against caller.
    catname : str
        stem used to build the per-category output filenames
        (``"{catname}_<category>.gtf.gz"``).

    Records whose ``transcript_id`` (preferred) or ``gene_id`` appears
    in the category table are written to the file for their category;
    all other records are dropped.
    """
    catfile, gtffile = infiles

    # NOTE: ``squeeze=True`` was deprecated in pandas 1.4 and removed in
    # pandas 2.0; ``DataFrame.squeeze("columns")`` is the supported way
    # to obtain a Series keyed by the index column.
    categories = pd.read_csv(catfile, index_col=0, sep="\t").squeeze("columns")

    # create output filepool: one file per category, named on demand
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):
        # GTF records may lack either attribute; treat a missing one
        # as "no id" rather than raising.
        transcript_id = getattr(gtfline, "transcript_id", None)
        gene_id = getattr(gtfline, "gene_id", None)

        # transcript-level assignment takes precedence over gene-level
        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    # close the input handle as well as the output pool (the original
    # leaked the input file descriptor)
    gtffile.close()
    outpool.close()
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """Split *infile* into chunk files keyed on the value of a column.

    The table need not be sorted by this column.  If a maximum number
    of files is given, keys are assigned round-robin to at most that
    many numeric tags, so lines with the same key always land in the
    same file.

    Yields the name of each chunk file created.
    """

    column, max_files = args
    pool = iotools.FilePool()
    seen_header = False
    if max_files:
        tag_map = {}

    for row in infile:
        # comment lines are dropped entirely
        if row[0] == "#":
            continue

        # the first data line may be a shared header for every chunk
        if use_header and not seen_header:
            pool.setHeader(row)
            seen_header = True
            continue

        tag = row[:-1].split("\t")[column]
        if max_files:
            # first time a key is seen it gets the next numeric tag,
            # cycling modulo max_files; setdefault evaluates len()
            # before inserting, matching the explicit branch it replaces
            tag = tag_map.setdefault(
                tag, "%010i" % (len(tag_map) % max_files))

        pool.write("%s/%s.in" % (prefix, tag), row)

    for fname, nitems in list(pool.items()):
        E.info("created file %s with %i items" % (fname, nitems))
        yield fname
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o", "--min-overlap", dest="min_overlap",
                        type=int,
                        help="minimum overlap")

    parser.add_argument(
        "-w", "--pattern-window", dest="pattern_window", type=str,
        help="regular expression to extract window coordinates from "
        "test id ")

    parser.add_argument("-i", "--invert", dest="invert",
                        action="store_true",
                        help="invert direction of fold change ")

    # raw string: the pattern contains regex escapes (\S, \d) that are
    # invalid string-literal escapes.  The stray trailing comma that
    # turned this statement into a throwaway tuple has been removed.
    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    outfiles = iotools.FilePool(args.output_filename_pattern)

    # predicate deciding whether a significant window is attributed to
    # the treatment (True) or the control (False) sample
    if args.invert:
        def test_f(l2fold):
            return l2fold < 0
    else:
        def test_f(l2fold):
            return l2fold > 0

    def read():
        """yield DATA records parsed from stdin, extracting window
        coordinates from the test id via --pattern-window."""
        rx_window = re.compile(args.pattern_window)
        # filter any of the DESeq/EdgeR message that end up at the top of the
        # output file
        for data in iotools.iterate(args.stdin):
            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = list(map(int, (start, end)))
            yield DATA._make(
                (data.test_id, contig, start, end,
                 data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std),
                 data.control_name,
                 float(data.control_mean),
                 float(data.control_std),
                 float(data.pvalue), float(data.qvalue),
                 float(data.l2fold), float(data.fold),
                 int(data.significant),
                 data.status, 0))

    def grouper(data, distance=10):
        """yield lists of adjacent records sharing contig, status,
        significance and fold-change direction, merging records whose
        gap is at most *distance*.

        Raises ValueError if records on a contig are not sorted by
        start coordinate.
        """
        try:
            last = next(data)
        except StopIteration:
            # empty input: nothing to group.  The original called
            # next() unguarded, which under PEP 479 (python 3.7+)
            # turns the escaping StopIteration into a RuntimeError.
            return
        entries = [last]

        while 1:
            try:
                d = next(data)
            except StopIteration:
                # end of input must be caught explicitly inside a
                # generator (PEP 479); the old ``if d is None: break``
                # guard was dead code as next() never returns None.
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")
            if ((d.contig != last.contig) or
                    (d.start - last.end > distance) or
                    (d.status != last.status) or
                    (d.significant != last.significant) or
                    (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []
            entries.append(d)
            last = d
        yield entries

    counter = E.Counter()

    args.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=args.min_overlap):
        group_id += 1
        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        # take the fold change of smallest magnitude within the group
        # (max of negatives, min of positives)
        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make((
            str(group_id),
            g.contig, start, end,
            g.treatment_name,
            sum([x.treatment_mean for x in group]) / n,
            max([x.treatment_std for x in group]),
            g.control_name,
            sum([x.control_mean for x in group]) / n,
            max([x.control_std for x in group]),
            # conservative: worst p/q-value across the merged windows
            max([x.pvalue for x in group]),
            max([x.qvalue for x in group]),
            l2fold,
            fold,
            g.significant,
            g.status,
            int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)

        # write significant merged intervals to the per-sample file
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name,
                    "%s\t%i\t%i\t%i\t%f\n" % (
                        g.contig, g.start, g.end,
                        group_id,
                        sum([x.treatment_mean for x in group]) / n))
            else:
                outfiles.write(
                    g.control_name,
                    "%s\t%i\t%i\t%i\t%f\n" % (
                        g.contig, g.start, g.end,
                        group_id,
                        sum([x.control_mean for x in group]) / n))

        args.stdout.write("\t".join(map(str, outdata)) + "\n")
        counter.output += 1

    # create empty files so every sample has an output file even when
    # it received no significant intervals
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()