def split_gtf_file(gtf_file, split_dir, ref_gtf_file, category_stats_file,
                   bufsize=(1 << 30)):
    """Split an annotated GTF file into one GTF file per library.

    Features flagged as reference (attr ``GTFAttr.REF``) are diverted to
    `ref_gtf_file`; every other feature line is routed to
    ``<split_dir>/<library_id>.gtf`` through a BufferedFileSplitter.
    For each 'transcript' feature, per-library category statistics
    (count and summed score per category) are accumulated and written
    as a tab-delimited table to `category_stats_file`.

    Args:
        gtf_file: path to the input GTF file (one feature per line)
        split_dir: directory that receives the per-library GTF files
        ref_gtf_file: output path for reference features
        category_stats_file: output path for the per-library stats table
        bufsize: byte budget for the splitter's write buffer
            (default 1 GiB)

    NOTE(review): the original file defined this function twice,
    byte-identically except for quote style; the duplicate has been
    removed.
    """
    # Map each library id to its dedicated output path inside split_dir.
    keyfunc = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    stats_dict = collections.defaultdict(lambda: CategoryStats())
    logging.info("Splitting transcripts by library")
    # Context managers guarantee the handles are closed even if an
    # exception occurs mid-loop (the original leaked them in that case).
    with open(ref_gtf_file, 'w') as ref_fileh:
        with open(gtf_file) as infileh:
            for line in infileh:
                f = GTFFeature.from_string(line)
                is_ref = bool(int(f.attrs[GTFAttr.REF]))
                if is_ref:
                    # reference features go to their own file
                    ref_fileh.write(str(f) + '\n')
                    continue
                library_id = f.attrs[GTFAttr.LIBRARY_ID]
                # keep statistics (transcript features only)
                if f.feature_type == 'transcript':
                    is_test = bool(int(f.attrs[GTFAttr.TEST]))
                    if is_test:
                        # test transcripts are always binned as SAME_STRAND
                        category = Category.SAME_STRAND
                    else:
                        category = int(f.attrs[GTFAttr.CATEGORY])
                    score = float(f.attrs[GTFAttr.SCORE])
                    statsobj = stats_dict[library_id]
                    statsobj.library_id = library_id
                    statsobj.counts[category] += 1
                    statsobj.signal[category] += score
                # write the raw feature line to its library's file
                bufobj.write(library_id, line)
    bufobj.close()
    logging.debug("Buffer flushes: %d" % (bufobj.flushes))
    # write library category statistics
    logging.info("Writing category statistics")
    with open(category_stats_file, "w") as fh:
        fh.write('\t'.join(CategoryStats.header_fields()) + '\n')
        # .values() (not Py2-only .itervalues()) keeps this portable
        for statsobj in stats_dict.values():
            fields = statsobj.to_fields()
            fh.write('\t'.join(map(str, fields)) + '\n')
def annotate_gtf(gtf_file, bed_dbs):
    """Annotate GTF features with overlapping entries from BED databases.

    For each locus in `gtf_file`, every 'exon' feature is intersected
    with each BED interval tree; names of same-strand hits are stored
    (comma-separated, sorted) in the feature's attributes under the
    database name.  Each 'transcript' feature then receives the union
    of its exons' matches.  Annotated features are printed to stdout.

    Args:
        gtf_file: path to the input GTF file
        bed_dbs: iterable of (name, filename) pairs; each BED file is
            loaded into interval trees keyed by seqid
    """
    # read reference databases
    bed_trees = []
    for name, filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name, filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file and annotate
    logging.debug("Annotating GTF")
    # 'with' closes the input handle (original left it open);
    # print(...) replaces the Python-2-only 'print str(f)' statement.
    with open(gtf_file) as fileh:
        for lines in parse_loci(fileh):
            features = []
            transcripts = []
            # transcript_id -> dbname -> set of matching BED feature names
            transcript_matches = collections.defaultdict(
                lambda: collections.defaultdict(set))
            for line in lines:
                f = GTFFeature.from_string(line)
                features.append(f)
                t_id = f.attrs['transcript_id']
                if f.feature_type == 'transcript':
                    transcripts.append(f)
                elif f.feature_type == 'exon':
                    for dbname, dbtrees in bed_trees:
                        # intersect this exon with db features;
                        # keep only same-strand hits
                        hits = dbtrees[f.seqid].find(f.start, f.end)
                        matches = set(hit.value for hit in hits
                                      if hit.strand == f.strand)
                        f.attrs[dbname] = ','.join(sorted(matches))
                        # update transcript level matches
                        transcript_matches[t_id][dbname].update(matches)
            # set transcript annotations from the union of exon matches
            for f in transcripts:
                t_id = f.attrs['transcript_id']
                for dbname, dbtrees in bed_trees:
                    matches = transcript_matches[t_id][dbname]
                    f.attrs[dbname] = ','.join(sorted(matches))
            # write annotated features to stdout
            for f in features:
                print(str(f))
    logging.debug("Done")