def split_gtf_by_category(infiles, outfiles, catname):
    '''split a GTF file into one output file per category.

    *catfile* maps transcript or gene identifiers to category names;
    each GTF line is written to the output file of its category.
    '''

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = IOTools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = IOTools.openFile(gtffile)

    for gtfline in GTF.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
            transcript_id = None

        try:
            gene_id = gtfline.gene_id
        except AttributeError:
            gene_id = None

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
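
# A minimal, self-contained sketch of the FilePool idea used above, assuming
# only standard-library gzip: each distinct category value is substituted into
# the "%s" slot of the output pattern, and lines sharing a category are
# appended to the same file. This is an illustration, not CGAT's IOTools
# implementation; the category names ("coding", "lncRNA") are hypothetical.
import gzip


class _MiniFilePool(object):

    def __init__(self, output_pattern):
        self.pattern = output_pattern
        self.handles = {}

    def write(self, identifier, line):
        # open (and cache) one gzip handle per identifier
        if identifier not in self.handles:
            self.handles[identifier] = gzip.open(
                self.pattern % identifier, "wt")
        self.handles[identifier].write(line)

    def close(self):
        for fh in self.handles.values():
            fh.close()


# usage sketch: route two GTF lines into per-category files
# pool = _MiniFilePool("annotations_%s.gtf.gz")
# pool.write("coding", "chr1\tsource\texon\t1\t100\t.\t+\t.\t...\n")
# pool.write("lncRNA", "chr1\tsource\texon\t200\t300\t.\t+\t.\t...\n")
# pool.close()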
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """split a table at a column.

    The table need not be sorted by this column. If *max_files* is
    given, at most that many output files are created and column
    values are assigned to them in round-robin order of first
    appearance.
    """

    column, max_files = args
    files = IOTools.FilePool()
    header = False

    if max_files:
        map_tag2file = {}

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            files.setHeader(line)
            header = True
            continue

        key = line[:-1].split("\t")[column]
        if max_files:
            if key in map_tag2file:
                key = map_tag2file[key]
            else:
                n = "%010i" % (len(map_tag2file) % max_files)
                map_tag2file[key] = n
                key = n

        files.write("%s/%s.in" % (prefix, key), line)

    for filename, count in list(files.items()):
        E.info("created file %s with %i items" % (filename, count))
        yield filename
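
# A small, standalone sketch of the max_files behaviour above, assuming plain
# Python dicts: when max_files is set, each new column value is assigned one
# of at most max_files zero-padded tags in round-robin order, so rows sharing
# a value always land in the same chunk. The keys below are made up.
def _assign_tags(keys, max_files):
    map_tag2file = {}
    assigned = []
    for key in keys:
        if key in map_tag2file:
            tag = map_tag2file[key]
        else:
            tag = "%010i" % (len(map_tag2file) % max_files)
            map_tag2file[key] = tag
        assigned.append(tag)
    return assigned


# e.g. five distinct keys collapsed onto two chunk tags:
# _assign_tags(["geneA", "geneB", "geneC", "geneA", "geneD", "geneE"], 2)
# -> ['0000000000', '0000000001', '0000000000', '0000000000',
#     '0000000001', '0000000000']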
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--min-overlap", dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option("-w", "--pattern-window",
                      dest="pattern_window", type="string",
                      help="regular expression to extract window coordinates "
                      "from test id [%default]")

    parser.add_option("-i", "--invert", dest="invert",
                      action="store_true",
                      help="invert direction of fold change [%default]")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    if options.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(options.pattern_window)
        # filter any of the DESeq/EdgeR messages that end up at the top of
        # the output file
        for data in IOTools.iterate(options.stdin):
            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = map(int, (start, end))

            yield DATA._make((contig, start, end,
                              data.treatment_name,
                              float(data.treatment_mean),
                              float(data.treatment_std),
                              data.control_name,
                              float(data.control_mean),
                              float(data.control_std),
                              float(data.pvalue),
                              float(data.qvalue),
                              float(data.l2fold),
                              float(data.fold),
                              int(data.significant),
                              data.status,
                              0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            d = next(data, None)
            if d is None:
                break

            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or
                    (d.start - last.end > distance) or
                    (d.status != last.status) or
                    (d.significant != last.significant) or
                    (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    for group in grouper(iter(all_data), distance=options.min_overlap):
        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]
        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make((
            g.contig, start, end,
            g.treatment_name,
            sum([x.treatment_mean for x in group]) / n,
            max([x.treatment_std for x in group]),
            g.control_name,
            sum([x.control_mean for x in group]) / n,
            max([x.control_std for x in group]),
            max([x.pvalue for x in group]),
            max([x.qvalue for x in group]),
            l2fold,
            fold,
            g.significant,
            g.status,
            int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)

        if g.significant:
            if test_f(g.l2fold):
                # treatment has lower methylation than control
                outfiles.write(
                    g.treatment_name,
                    "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end,
                     g.treatment_name,
                     sum([x.treatment_mean for x in group]) / n))
            else:
                outfiles.write(
                    g.control_name,
                    "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end,
                     g.control_name,
                     sum([x.control_mean for x in group]) / n))
        options.stdout.write("\t".join(map(str, outdata)) + "\n")
        counter.output += 1

    # create empty output files for all samples
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()

    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.Stop()
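
# A hedged, self-contained illustration of the grouping rule applied above:
# consecutive windows are merged while they stay on the same contig, start
# within `distance` bases of the previous window's end, share the same status
# and significance call, and have fold changes of the same sign. The
# namedtuple below only carries the fields the rule inspects; the coordinates
# and values are invented for the example.
import collections

_Win = collections.namedtuple(
    "_Win", "contig start end status significant l2fold")


def _same_group(last, d, distance=10):
    return not ((d.contig != last.contig) or
                (d.start - last.end > distance) or
                (d.status != last.status) or
                (d.significant != last.significant) or
                (d.l2fold * last.l2fold < 0))


# windows 1 and 2 merge (gap of 5 <= 10, same sign); window 3 starts a new
# group because its fold change flips sign:
# w1 = _Win("chr1", 100, 200, "OK", 1, -1.5)
# w2 = _Win("chr1", 205, 300, "OK", 1, -0.8)
# w3 = _Win("chr1", 305, 400, "OK", 1, 0.6)
# _same_group(w1, w2)  # True
# _same_group(w2, w3)  # False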
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    .. note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column, but instead it might be the
       transcript_cluster_id. A possible explanation is that if several
       probesets map to the same transcript cluster, the transcript
       cluster is normalized.

       The sets of cluster_ids and probeset ids are completely
       non-overlapping. Hence, the :term:`cluster_id` will be used.
    '''

    E.info("importing expression data from %s" % infile)

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)

    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    reader = csv.reader(open(infile, "rU"))

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ('mRNA Accession', 'mrna_id'),
               ('mRNA Source', 'source'),
               ('mRNA - xhyb', 'xhyb'),
               ('GO Biological Process ID', 'go_biol_id'),
               ('GO Biological Process Term', 'go_biol_term'),
               ('GO Cellular Component ID', 'go_cell_id'),
               ('GO Cellular Component Term', 'go_cell_term'),
               ('GO Molecular Function ID', 'go_mol_id'),
               ('GO Molecular Function Term', 'go_mol_term'),
               ('Pathway Source', 'pw_source'),
               ('Pathway Name', 'pw_name'))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)

    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            for x, old_header in enumerate(row):
                if old_header == "mRNA Source":
                    index_source = len(take)
                if old_header == "mRNA Accession":
                    index_accession = len(take)
                if old_header == "Probe Set ID":
                    index_probeset = len(take)
                if old_header in old_headers:
                    take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp,
                    "\t".join(("cluster_id",
                               Stats.Summary().getHeader(),
                               "\t".join(["R%i" % i for i in range(len(columns))]))) + "\n")
        else:
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp,
                    "\t".join((probeset,
                               str(Stats.Summary([float(x) for x in data])),
                               "\t".join(data))) + "\n")

    outf.close()

    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")
    P.info("probeset source information: %s" % str(counts))

    output_files.close()
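
# A rough sketch, using only the standard library, of how map_exp2columns
# drives the per-experiment output above: for each experiment, the listed
# column indices are pulled from the row, summarised, and written alongside
# the raw values. The summary here is a simple mean/min/max stand-in for
# CGAT's Stats.Summary, and the experiment names and column indices are
# hypothetical.
import statistics


def _experiment_lines(probeset, row, map_exp2columns):
    lines = {}
    for exp, cols in map_exp2columns.items():
        data = [float(row[x]) for x in cols]
        summary = "%f\t%f\t%f" % (statistics.mean(data), min(data), max(data))
        lines[exp] = "\t".join(
            (probeset, summary, "\t".join(str(x) for x in data))) + "\n"
    return lines


# e.g. two experiments reading different column slices of the same row:
# row = ["cluster_1", "7.1", "7.3", "6.9", "8.2", "8.0"]
# _experiment_lines("cluster_1", row,
#                   {"expA": [1, 2, 3], "expB": [4, 5]})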
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if DISABLE:
        print("# tophat_segment_juncs.py disabled")
        argv[0] = "segment_juncs.original"
        runCommand(argv, "segment_juncs.log")
        return 0

    E.Start(no_parsing=True)

    # collect arguments
    parser = argparse.ArgumentParser(description='Process tophat options.')
    parser.add_argument('-p', '--num-threads', metavar='N', type=int,
                        dest='nthreads',
                        help='number of threads')
    parser.add_argument('--version', action='version', version='%(prog)s')
    options, args = parser.parse_known_args(argv[1:])

    E.info("parallelizing segment juncs with %i threads" % options.nthreads)

    x = argv.index("--ium-reads") + 1

    all_options = argv[1:x]

    (input_missing_reads, input_genome,
     output_junctions,
     output_insertions, output_deletions,
     input_left_all_reads,
     input_left_all_map,
     input_left_segments_maps) = argv[x:x + 8]

    input_left_segments_maps = input_left_segments_maps.split(",")

    if len(argv) > x + 8:
        (input_right_all_reads,
         input_right_all_map,
         input_right_segments_maps) = argv[x + 8:x + 11]
        input_right_segments_maps = input_right_segments_maps.split(",")
    else:
        input_right_all_reads = ""
        input_right_all_map = ""
        input_right_segments_maps = []

    keys = set()

    # some filenames might appear multiple times
    files_to_split = set([input_left_all_map,
                          input_right_all_map] +
                         input_left_segments_maps +
                         input_right_segments_maps)

    E.info("splitting %i files" % len(files_to_split))

    # split all map files by chromosome
    for filename in files_to_split:
        if filename == "":
            continue

        E.info("splitting %s" % filename)
        base, ext = os.path.splitext(filename)

        f = glob.glob("%s.input.*%s" % (filename, ext))
        if f:
            E.info("files already exist - skipping")
            keys.update([re.match(r"%s.input.(\S+)%s" % (filename, ext), x).groups()[0]
                         for x in f])
            continue

        infile = IOTools.openFile(filename)
        outfiles = IOTools.FilePool(filename + ".input.%s" + ext)

        for line in infile:
            key = line.split("\t")[2]
            keys.add(key)
            outfiles.write(key, line)

        outfiles.close()

    # keys = set(["chr1", "chr2", "chr3", "chr4", "chr5",
    #             "chr6", "chr7", "chr8", "chr9", "chr10",
    #             "chr11", "chr12", "chr13", "chr14", "chr15",
    #             "chr16", "chr17", "chr18", "chr19", "chr20",
    #             "chr21", "chr22", "chrX", "chrY", "chrM"])

    E.info("working on %i contigs: %s" % (len(keys), list(keys)))

    pool = multiprocessing.pool.ThreadPool(options.nthreads)
    # pool = threadpool.ThreadPool(THREADS)

    tmpdir = os.path.dirname(input_left_all_reads)
    logdir = os.path.join(tmpdir[:-len("tmp")], "logs")

    if not os.path.exists(logdir):
        raise IOError("can not find logdir %s" % logdir)

    args = []
    for key in keys:

        def modout(old, key):
            if not old:
                return ""
            _, ext = os.path.splitext(old)
            return old + ".output.%s%s" % (key, ext)

        def modin(old, key):
            if not old:
                return ""
            _, ext = os.path.splitext(old)
            return old + ".input.%s%s" % (key, ext)

        def modgenome(old, key):
            dirname, filename = os.path.split(old)
            genome, ext = os.path.splitext(filename)
            if genome.lower().endswith("_cs"):
                genome = genome[:-3]
            new = os.path.join(dirname, genome + ".perchrom", key + ext)
            if not os.path.exists(new):
                raise ValueError("can not find chromosome file %s" % new)
            return new

        cmd = ["segment_juncs"] + \
            all_options + \
            [input_missing_reads,
             modgenome(input_genome, key),
             modout(output_junctions, key),
             modout(output_insertions, key),
             modout(output_deletions, key),
             input_left_all_reads,
             modin(input_left_all_map, key),
             ",".join([modin(x, key) for x in input_left_segments_maps]),
             input_right_all_reads,
             modin(input_right_all_map, key),
",".join( [ modin( x, key ) for x in input_right_segments_maps ] ) ] logfile = os.path.join(logdir, "segment_juncs_%s.log" % key) args.append((cmd, logfile)) E.info("submitting %i jobs" % len(keys)) pool.map(runCommand, args, chunksize=1) pool.close() pool.join() E.info("all jobs finished successfully") E.info("merging results") ## merge results for filename in (output_junctions, output_insertions, output_deletions): outfile = open(filename, "w") for inf in glob.glob(filename + ".output.*"): infile = open(inf, "r") outfile.write(infile.read()) infile.close() outfile.close() E.info("results merged") ## cleaning up is done automatically by tophat E.info("cleaning up") for f in glob.glob( os.path.join( tmpdir, "*.output.*") ) +\ glob.glob( os.path.join( tmpdir, "*.input.*") ): os.remove(f) ## write footer and output benchmark information. E.Stop()