def add_gtf_file(gtf_file, outfh, is_ref, sample_id=None):
    refval = '1' if is_ref else '0'
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" % (chrom, len(exon_dict)))
        # output reference transcripts
        for t_id, features in exon_dict.iteritems():
            # sort features (exons) by start position
            features.sort(key=operator.attrgetter('start'))
            # annotate exons as reference features
            for f in features:
                f.attrs[GTFAttr.REF] = refval
                print >>outfh, str(f)
            # transcript feature: reuse an existing one or synthesize it
            # from the exon features
            if t_id in transcript_dict:
                f = transcript_dict[t_id]
            else:
                f = GTFFeature()
                f.seqid = features[0].seqid
                f.source = features[0].source
                f.feature_type = 'transcript'
                f.start = features[0].start
                f.end = features[-1].end
                f.score = features[0].score
                f.strand = features[0].strand
                f.phase = '.'
                f.attrs = features[0].attrs.copy()
                if "exon_number" in f.attrs:
                    del f.attrs["exon_number"]
            f.attrs[GTFAttr.REF] = refval
            if sample_id is not None:
                f.attrs[GTFAttr.SAMPLE_ID] = sample_id
            print >>outfh, str(f)
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # default to an empty list so omitting -a does not crash the loop
    # below with "'NoneType' object is not iterable"
    parser.add_argument('-a', '--attr', dest='attrs', action='append',
                        default=[])
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    gtf_file = args.gtf_file
    comparisons = []
    for attr in args.attrs:
        key, op, value = attr.split()
        if (op == '=') or (op == '=='):
            func = lambda a, b: (a == b)
        elif (op == '!') or (op == '!='):
            func = lambda a, b: (a != b)
        else:
            parser.error("unsupported operator '%s'" % (op))
        comparisons.append((key, value, func))
    for f in GTFFeature.parse(open(gtf_file)):
        match = True
        for k, v, func in comparisons:
            if not func(v, f.attrs.get(k, None)):
                match = False
                break
        if match:
            print str(f)
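# Hedged usage note (added, not in the original source): each -a argument is
# whitespace-split into "<attr> <op> <value>", so the whole expression must
# be quoted on the shell. Script name and attribute value below are
# hypothetical examples:
#
#     python gtf_filter.py -a "gene_id == ENSG00000123456" input.gtf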
def get_gtf_metadata(gtf_file, gtf_attrs):
    if gtf_attrs is None:
        gtf_attrs = []
    if 'transcript_id' in gtf_attrs:
        gtf_attrs.remove('transcript_id')
    # read gtf file
    metadata_dict = {}
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in metadata_dict:
            # instantiate new metadata
            m = TranscriptMetadata()
            m.chrom = feature.seqid
            m.strand = feature.strand
            m.start = feature.start
            m.end = feature.end
            for gtf_attr in gtf_attrs:
                setattr(m, gtf_attr, feature.attrs.get(gtf_attr, ''))
            metadata_dict[t_id] = m
        else:
            m = metadata_dict[t_id]
        # update metadata
        m.start = feature.start if feature.start < m.start else m.start
        m.end = feature.end if feature.end > m.end else m.end
        m.length += (feature.end - feature.start)
        m.num_exons += 1
    return metadata_dict
def read_gtf_file(filename, library_id):
    # read all transcripts
    t_dict = collections.OrderedDict()
    cur_t_id = 1
    cur_g_id = 1
    t_id_map = {}
    g_id_map = {}
    for feature in GTFFeature.parse(open(filename)):
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        # rename transcript id
        if t_id not in t_id_map:
            new_t_id = "%s.T%d" % (library_id, cur_t_id)
            t_id_map[t_id] = new_t_id
            cur_t_id += 1
        else:
            new_t_id = t_id_map[t_id]
        # rename gene id
        g_id = feature.attrs[GTFAttr.GENE_ID]
        if g_id not in g_id_map:
            new_g_id = "%s.G%d" % (library_id, cur_g_id)
            g_id_map[g_id] = new_g_id
            cur_g_id += 1
        else:
            new_g_id = g_id_map[g_id]
        # update transcript attributes
        feature.attrs[GTFAttr.TRANSCRIPT_ID] = new_t_id
        feature.attrs[GTFAttr.GENE_ID] = new_g_id
        # store feature
        if new_t_id not in t_dict:
            t_dict[new_t_id] = []
        t_dict[new_t_id].append(feature)
    return t_dict
def read_reference_gtf(ref_gtf_file):
    gene_map = {}
    for f in GTFFeature.parse(open(ref_gtf_file)):
        # get gene by id
        gene_id = f.attrs["gene_id"]
        if gene_id not in gene_map:
            g = Gene()
            g.gene_id = gene_id
            g.chrom = f.seqid
            g.strand = f.strand
            g.gene_start = f.start
            g.gene_end = f.end
            gene_map[gene_id] = g
        else:
            g = gene_map[gene_id]
        # update gene
        g.gene_start = min(g.gene_start, f.start)
        g.gene_end = max(g.gene_end, f.end)
        if f.feature_type == "exon":
            g.exons.add((f.start, f.end))
        elif f.feature_type == "CDS":
            g.is_coding = True
        if "gene_name" in f.attrs:
            g.gene_names.add(f.attrs["gene_name"])
        g.annotation_sources.add(f.source)
    logging.info("Sorting genes")
    genes = sorted(gene_map.values(),
                   key=operator.attrgetter('chrom', 'gene_start'))
    del gene_map
    # cluster loci
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for i, g in enumerate(genes):
        locus_cluster_trees[g.chrom].insert(g.gene_start, g.gene_end, i)
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # cluster gene exons and add to interval tree
            exon_tree = IntervalTree()
            for i in indexes:
                g = genes[i]
                # use distinct names for the per-gene cluster tree and its
                # regions so the enclosing loop variables are not shadowed
                exon_cluster_tree = ClusterTree(0, 1)
                for start, end in g.exons:
                    exon_cluster_tree.insert(start, end, 1)
                # update exons
                exon_clusters = []
                for start, end, exon_indexes in exon_cluster_tree.getregions():
                    exon_clusters.append((start, end))
                g.exons = exon_clusters
                del exon_cluster_tree
                for start, end in g.exons:
                    exon_tree.insert_interval(Interval(start, end, value=g))
            # add to locus interval tree
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=exon_tree))
    logging.debug("Done indexing reference GTF file")
    return locus_trees
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging: %s" % (args.verbose))
    logging.info("ref gtf file: %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file: %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end),
                      '%s|%s|%s' % (m, t, c), '0',
                      strand_int_to_str(strand)]
            print '\t'.join(fields)
    return 0
def get_gtf_metadata(gtf_file, omit_attrs=None, group_by="gene_id",
                     feature_type="exon"):
    if omit_attrs is None:
        omit_attrs = []
    # read gtf file and group by gene
    gene_feature_map = collections.defaultdict(lambda: [])
    gene_attrs_set = set()
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != feature_type:
            continue
        feature_id = feature.attrs[group_by]
        gene_feature_map[feature_id].append(feature)
        gene_attrs_set.update(feature.attrs.keys())
    gene_attrs_set.difference_update(omit_attrs)
    gene_attrs_list = sorted(gene_attrs_set)
    metadata_fields = ["tracking_id", "locus", "strand", "num_exons",
                       "transcript_length"] + gene_attrs_list
    metadata_inds = dict((x, i) for i, x in enumerate(metadata_fields))
    metadata_dict = {}
    # output metadata sorted by gene id
    for feature_id, features in gene_feature_map.iteritems():
        # collect attributes for this gene
        attrdict = collections.defaultdict(lambda: set())
        # cluster exons together for each gene
        cluster_tree = ClusterTree(0, 1)
        for i, f in enumerate(features):
            cluster_tree.insert(f.start, f.end, i)
            for k, v in f.attrs.iteritems():
                if k in gene_attrs_set:
                    # some attributes have multiple values separated by a comma
                    attrdict[k].update(v.split(','))
        # determine larger exon clusters
        transcript_length = 0
        exon_clusters = []
        for start, end, indexes in cluster_tree.getregions():
            exon_clusters.append((start, end))
            transcript_length += (end - start)
        del cluster_tree
        chrom = features[0].seqid
        locus_start = min(e[0] for e in exon_clusters)
        locus_end = max(e[1] for e in exon_clusters)
        locus_string = "%s:%d-%d" % (chrom, locus_start, locus_end)
        strand = features[0].strand
        num_exons = len(exon_clusters)
        # make metadata row
        metadata = [feature_id, locus_string, strand, num_exons,
                    transcript_length] + ['NA'] * len(gene_attrs_list)
        # get all attributes
        for k, vals in attrdict.iteritems():
            ind = metadata_inds[k]
            metadata[ind] = ','.join(map(str, sorted(vals)))
        metadata_dict[metadata[0]] = metadata
    return metadata_fields, metadata_dict
def main():
    # setup logging
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    cur_t_id = 1
    cur_g_id = 1
    cur_tss_id = 1
    t_id_map = {}
    g_id_map = {}
    tss_id_map = {}
    for feature in GTFFeature.parse(open(args.gtf_file)):
        t_id = feature.attrs['transcript_id']
        g_id = feature.attrs['gene_id']
        tss_id = feature.attrs['tss_id']
        if t_id not in t_id_map:
            new_t_id = "T%06d" % (cur_t_id)
            t_id_map[t_id] = new_t_id
            cur_t_id += 1
        else:
            new_t_id = t_id_map[t_id]
        if g_id not in g_id_map:
            new_g_id = "G%06d" % (cur_g_id)
            g_id_map[g_id] = new_g_id
            cur_g_id += 1
        else:
            new_g_id = g_id_map[g_id]
        if tss_id not in tss_id_map:
            new_tss_id = "TSS%06d" % (cur_tss_id)
            tss_id_map[tss_id] = new_tss_id
            cur_tss_id += 1
        else:
            new_tss_id = tss_id_map[tss_id]
        # update transcript attributes
        new_attrs = {'transcript_id': new_t_id,
                     'gene_id': new_g_id,
                     'tss_id': new_tss_id}
        if 'exon_number' in feature.attrs:
            new_attrs['exon_number'] = feature.attrs['exon_number']
        feature.attrs = new_attrs
        print feature
    return 0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--group-by", dest="group_by", default="gene_id")
    parser.add_argument("gtf_file")
    parser.add_argument("gtf_attr")
    args = parser.parse_args()
    d = collections.defaultdict(lambda: set())
    for f in GTFFeature.parse(open(args.gtf_file)):
        if args.gtf_attr in f.attrs:
            v = f.attrs[args.gtf_attr]
            f_id = f.attrs[args.group_by]
            d[v].add(f_id)
    for k in sorted(d):
        print k, len(d[k])
def parse_gtf(filename):
    # read all transcripts
    t_dict = collections.defaultdict(lambda: [])
    i = 0
    for f in GTFFeature.parse(open(filename)):
        i += 1
        if (i % 100000) == 0:
            logging.debug('parse_gtf read %d lines' % (i))
        if f.feature_type != 'exon':
            continue
        t_id = f.attrs['transcript_id']
        t_dict[t_id].append(f)
    i = 0
    for features in t_dict.itervalues():
        yield Feature.from_gtf_features(features)
        i += 1
    logging.debug('Parsed %d transcripts' % (i))
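# Hedged usage sketch (added, not in the original source): stream transcripts
# from an assembly GTF. It assumes this module's existing imports
# (collections, logging, GTFFeature, Feature) and that Feature exposes
# chrom/start/end attributes, as other scripts in this codebase rely on;
# 'assembly.gtf' is a hypothetical input path.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    for transcript in parse_gtf('assembly.gtf'):
        # each item is a Feature assembled from that transcript's exons
        print transcript.chrom, transcript.start, transcript.end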
def _parse_gtf_by_chrom(gtf_file):
    current_chrom = None
    exon_dict = collections.defaultdict(lambda: [])
    transcript_dict = {}
    for feature in GTFFeature.parse(open(gtf_file)):
        if (current_chrom != feature.seqid):
            if len(exon_dict) > 0:
                yield current_chrom, transcript_dict, exon_dict
                exon_dict = collections.defaultdict(lambda: [])
                transcript_dict = {}
            current_chrom = feature.seqid
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        if feature.feature_type == "transcript":
            transcript_dict[t_id] = feature
        elif feature.feature_type == "exon":
            exon_dict[t_id].append(feature)
    if len(exon_dict) > 0:
        yield current_chrom, transcript_dict, exon_dict
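# Note (added): _parse_gtf_by_chrom flushes its buffers whenever the seqid
# changes, so the input GTF must be grouped (e.g. coordinate-sorted) by
# reference sequence; otherwise the same chromosome is yielded more than
# once. A minimal sketch, assuming a hypothetical grouped file 'sorted.gtf':
#
#     for chrom, t_dict, e_dict in _parse_gtf_by_chrom('sorted.gtf'):
#         print chrom, len(t_dict), len(e_dict)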
def split_gtf_file(gtf_file, split_dir, ref_gtf_file, category_stats_file,
                   bufsize=(1 << 30)):
    # split input gtf by library and mark test ids
    keyfunc = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    ref_fileh = open(ref_gtf_file, 'w')
    stats_dict = collections.defaultdict(lambda: CategoryStats())
    logging.info("Splitting transcripts by library")
    for line in open(gtf_file):
        f = GTFFeature.from_string(line)
        is_ref = bool(int(f.attrs[GTFAttr.REF]))
        if is_ref:
            print >>ref_fileh, str(f)
            continue
        library_id = f.attrs[GTFAttr.LIBRARY_ID]
        # keep statistics
        if f.feature_type == 'transcript':
            is_test = bool(int(f.attrs[GTFAttr.TEST]))
            if is_test:
                category = Category.SAME_STRAND
            else:
                category = int(f.attrs[GTFAttr.CATEGORY])
            score = float(f.attrs[GTFAttr.SCORE])
            statsobj = stats_dict[library_id]
            statsobj.library_id = library_id
            statsobj.counts[category] += 1
            statsobj.signal[category] += score
        # write features from each library to separate files
        bufobj.write(library_id, line)
    # close open file handles
    ref_fileh.close()
    bufobj.close()
    logging.debug("Buffer flushes: %d" % (bufobj.flushes))
    # write library category statistics
    logging.info("Writing category statistics")
    fh = open(category_stats_file, "w")
    print >>fh, '\t'.join(CategoryStats.header_fields())
    for statsobj in stats_dict.itervalues():
        fields = statsobj.to_fields()
        print >>fh, '\t'.join(map(str, fields))
    fh.close()
def classify_library_transcripts(args):
    library_id, output_dir = args
    prefix = os.path.join(output_dir, library_id)
    # input files
    input_gtf_file = prefix + ".gtf"
    logfile = prefix + ".log"
    tablefile = prefix + '.inp.txt'
    # output files
    #info_file = prefix + ".info.txt"
    output_res_file = prefix + ".out.txt"
    expr_gtf_file = prefix + ".expr.gtf"
    bkgd_gtf_file = prefix + ".bkgd.gtf"
    # write table of observations
    logging.debug("[STARTED] library_id='%s'" % (library_id))
    write_transcript_table(input_gtf_file, tablefile)
    # run R script to do classification
    logfh = open(logfile, "w")
    retcode = subprocess.call(["Rscript", "--vanilla", CLASSIFY_R_SCRIPT,
                               prefix], stdout=logfh, stderr=logfh)
    logfh.close()
    if retcode != 0:
        logging.error("[FAILED] library_id='%s'" % (library_id))
        return retcode, library_id
    # get library stats
    #info_field_dict = read_classify_info(info_file)
    #has_tests = int(info_field_dict["tests"][0]) > 0
    # get transcript predictions
    decision_dict = read_classify_decisions(output_res_file)
    # partition input into expressed vs background
    expr_fileh = open(expr_gtf_file, 'w')
    bkgd_fileh = open(bkgd_gtf_file, 'w')
    output_file_handles = [bkgd_fileh, expr_fileh]
    for feature in GTFFeature.parse(open(input_gtf_file)):
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        dinf = decision_dict[t_id]
        feature.attrs[GTFAttr.LOG10LR] = dinf.log10lr
        fileh = output_file_handles[int(dinf.pred)]
        print >>fileh, str(feature)
    for fileh in output_file_handles:
        fileh.close()
    logging.debug("[FINISHED] library_id='%s'" % (library_id))
    return retcode, library_id
def get_gtf_metadata(gtf_file, metadata_file):
    # read gtf file and group by gene
    metadata_dict = {}
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in metadata_dict:
            # instantiate new metadata
            m = TranscriptMetadata()
            m.chrom = feature.seqid
            m.strand = feature.strand
            m.start = feature.start
            m.end = feature.end
            for attr in GTF_ATTRS_SET:
                setattr(m, attr, feature.attrs.get(attr, ''))
            metadata_dict[t_id] = m
        else:
            m = metadata_dict[t_id]
        # update metadata
        m.start = feature.start if feature.start < m.start else m.start
        m.end = feature.end if feature.end > m.end else m.end
        m.length += (feature.end - feature.start)
        m.num_exons += 1
    fileh = open(metadata_file, 'w')
    header_fields = ['tracking_id', 'locus', 'strand', 'num_exons',
                     'transcript_length'] + GTF_ATTRS
    print >>fileh, '\t'.join(header_fields)
    tracking_ids = sorted(metadata_dict)
    for t_id in tracking_ids:
        m = metadata_dict[t_id]
        fields = [t_id,
                  '%s:%d-%d' % (m.chrom, m.start, m.end),
                  m.strand,
                  m.num_exons,
                  m.length]
        for attr in GTF_ATTRS:
            fields.append(getattr(m, attr))
        print >>fileh, '\t'.join(map(str, fields))
    fileh.close()
    return tracking_ids
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', dest='source', default=None)
    parser.add_argument('--remove', dest='remove', action='append', default=[])
    parser.add_argument('-f', dest='force', action='store_true')
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    source = args.source
    force = args.force
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    add_attrs = []
    for arg in args.add:
        k, v = arg.split(',')
        add_attrs.append((k, v))
    rm_attrs = []
    for arg in args.remove:
        rm_attrs.append(arg)
    i = 0
    for f in GTFFeature.parse(open(gtf_file)):
        if source is not None:
            f.source = source
        for k, v in add_attrs:
            if (k in f.attrs) and not force:
                parser.error('attribute %s already in feature' % (k))
            f.attrs[k] = v
        for k in rm_attrs:
            if k in f.attrs:
                del f.attrs[k]
        print str(f)
        i += 1
        if (i % 100000) == 0:
            logging.debug('finished %d lines' % (i))
    return 0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument("metadata_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    add_attrs = set()
    for arg in args.add:
        add_attrs.add(arg)
    add_attrs = sorted(add_attrs)
    # read metadata keyed by transcript_id
    meta_dict = {}
    with open(args.metadata_file) as f:
        header_fields = f.next().strip().split('\t')
        t_id_index = header_fields.index('transcript_id')
        header_indexes = [header_fields.index(x) for x in add_attrs]
        for line in f:
            fields = line.strip().split('\t')
            meta_dict[fields[t_id_index]] = [fields[i] for i in header_indexes]
    # read GTF and attach the requested metadata attributes
    for f in GTFFeature.parse(open(args.gtf_file)):
        t_id = f.attrs['transcript_id']
        if t_id in meta_dict:
            for k, v in zip(add_attrs, meta_dict[t_id]):
                f.attrs[k] = v
        else:
            for k in add_attrs:
                f.attrs[k] = 'NA'
        print str(f)
    return 0
def read_gtf_file(library, gtf_score_attr):
    # read all transcripts
    t_dict = collections.OrderedDict()
    cur_t_id = 1
    cur_g_id = 1
    t_id_map = {}
    g_id_map = {}
    for feature in GTFFeature.parse(open(library.gtf_file)):
        if feature.feature_type == "exon":
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            # rename transcript id
            if t_id not in t_id_map:
                new_t_id = "%s.T%d" % (library.library_id, cur_t_id)
                t_id_map[t_id] = new_t_id
                cur_t_id += 1
            else:
                new_t_id = t_id_map[t_id]
            # rename gene id
            g_id = feature.attrs[GTFAttr.GENE_ID]
            if g_id not in g_id_map:
                new_g_id = "%s.G%d" % (library.library_id, cur_g_id)
                g_id_map[g_id] = new_g_id
                cur_g_id += 1
            else:
                new_g_id = g_id_map[g_id]
            # update transcript attributes
            newattrs = {GTFAttr.TRANSCRIPT_ID: new_t_id,
                        GTFAttr.GENE_ID: new_g_id,
                        GTFAttr.SAMPLE_ID: library.sample_id,
                        GTFAttr.LIBRARY_ID: library.library_id,
                        GTFAttr.REF: '0',
                        GTFAttr.SCORE: feature.attrs.get(gtf_score_attr, '0.0')}
            feature.attrs = newattrs
            # store feature
            if new_t_id not in t_dict:
                t_dict[new_t_id] = []
            t_dict[new_t_id].append(feature)
    return t_dict
def add_reference_gtf_file(ref_gtf_file, test_gene_ids, random_test_frac,
                           outfh):
    gene_dict = collections.defaultdict(lambda: [])
    user_defined_tests = len(test_gene_ids) > 0
    # group by gene id
    for feature in GTFFeature.parse(open(ref_gtf_file)):
        if feature.feature_type != "exon":
            continue
        g_id = feature.attrs[GTFAttr.GENE_ID]
        gene_dict[g_id].append(feature)
    # output reference transcripts
    for g_id, g_features in gene_dict.iteritems():
        # label test transcripts
        if user_defined_tests:
            is_test = (g_id in test_gene_ids)
        else:
            is_test = (random.random() < random_test_frac)
        # group by transcript id
        transcript_dict = collections.defaultdict(lambda: [])
        for feature in g_features:
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            transcript_dict[t_id].append(feature)
        for t_id, t_features in transcript_dict.iteritems():
            # sort features (exons) by start position
            t_features.sort(key=operator.attrgetter('start'))
            # annotate exons as reference features
            for f in t_features:
                f.attrs[GTFAttr.REF] = '1'
                f.attrs[GTFAttr.TEST] = '1' if is_test else '0'
                print >>outfh, str(f)
            f = make_transcript_feature(t_features)
            f.attrs[GTFAttr.REF] = '1'
            f.attrs[GTFAttr.TEST] = '1' if is_test else '0'
            print >>outfh, str(f)
        del transcript_dict
    del gene_dict
def annotate_gtf(gtf_file, bed_dbs):
    # read reference databases
    bed_trees = []
    for name, filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name, filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file and annotate
    logging.debug("Annotating GTF")
    for lines in parse_loci(open(gtf_file)):
        features = []
        transcripts = []
        transcript_matches = collections.defaultdict(
            lambda: collections.defaultdict(lambda: set()))
        for line in lines:
            f = GTFFeature.from_string(line)
            features.append(f)
            t_id = f.attrs['transcript_id']
            if f.feature_type == 'transcript':
                transcripts.append(f)
            elif f.feature_type == 'exon':
                for dbname, dbtrees in bed_trees:
                    # intersect this exon with features
                    hits = dbtrees[f.seqid].find(f.start, f.end)
                    matches = set(hit.value for hit in hits
                                  if hit.strand == f.strand)
                    f.attrs[dbname] = ','.join(sorted(matches))
                    # update transcript level matches
                    transcript_matches[t_id][dbname].update(matches)
        # set transcript annotations
        for f in transcripts:
            t_id = f.attrs['transcript_id']
            for dbname, dbtrees in bed_trees:
                matches = transcript_matches[t_id][dbname]
                f.attrs[dbname] = ','.join(sorted(matches))
        # write features
        for f in features:
            print str(f)
    logging.debug("Done")
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    gtf_file = args.gtf_file
    genes = set()
    transcript_dict = {}
    exon_dict = {}
    for f in GTFFeature.parse(open(gtf_file)):
        if f.feature_type != 'exon':
            continue
        genes.add(f.attrs["gene_id"])
        t_id = f.attrs["transcript_id"]
        if t_id not in transcript_dict:
            transcript_dict[t_id] = (f.seqid, f.strand)
            exon_dict[t_id] = []
        exon_dict[t_id].append((f.start, f.end))
    introns = set()
    exons = set()
    for t_id in transcript_dict:
        chrom, strand = transcript_dict[t_id]
        t_exons = exon_dict[t_id]
        t_exons.sort()
        for start, end in t_exons:
            exons.add((chrom, strand, start, end))
        for start, end in iterintrons(t_exons):
            introns.add((chrom, strand, start, end))
    logging.debug("Genes: %d" % (len(genes)))
    logging.debug("Transcripts: %d" % (len(transcript_dict)))
    logging.debug("Introns: %d" % (len(introns)))
    logging.debug("Exons: %d" % (len(exons)))
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--gtf-split-attr", dest="gtf_split_attr",
                        default="library_id")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    fhdict = {}
    gtf_split_attr = args.gtf_split_attr
    for f in GTFFeature.parse(open(args.gtf_file)):
        if gtf_split_attr not in f.attrs:
            val = "na_missing"
        else:
            val = f.attrs[gtf_split_attr]
        # key the open-file cache on the attribute value rather than on
        # os.path.exists(): the original check raised a KeyError whenever
        # the file already existed on disk but had not been opened yet
        if val not in fhdict:
            filename = "%s.gtf" % (val)
            fhdict[val] = open(filename, "w")
        fh = fhdict[val]
        print >>fh, str(f)
    for fh in fhdict.itervalues():
        fh.close()
    return 0
def run_filter(cinfo, cutoff_dict):
    # maintain predictions in a dictionary where key is transcript id and
    # value is boolean prediction decision for transcript.
    t_id_decisions = {}
    # maintain result objects in dictionary keyed by transcript id
    t_id_results = {}
    # maintain heap queues that keep track of the last transcript position
    # on each chromosome. prediction decisions only need to be remembered
    # until the parsing goes past the end of the transcript (all exons
    # accounted for)
    decision_heapqs = collections.defaultdict(lambda: [])
    # maintain heapq that keeps track of transcript position of
    # result objects. results only need to be remembered until parsing
    # goes past the chrom/start position of the transcript (all transcripts
    # accounted for)
    result_heapqs = collections.defaultdict(lambda: [])
    # read result file and gtf file in sync
    result_fh = open(cinfo.sorted_ctree_file)
    result_fh.next()
    gtf_fh = open(cinfo.output_gtf_file)
    # open output files
    for decision, filename in cinfo.decision_file_dict.iteritems():
        cinfo.decision_fh_dict[decision] = open(filename, "w")
    # keep track of prediction statistics
    decision_stats = collections.defaultdict(lambda: 0)
    for feature in GTFFeature.parse(gtf_fh):
        # get transcript id used to lookup expressed/background
        # prediction decision
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        # check top of the decision heapq and pop transcripts when parsing
        # has gone past the end
        decision_heapq = decision_heapqs[feature.seqid]
        while (len(decision_heapq) > 0) and (feature.start > decision_heapq[0][0]):
            smallest_end, smallest_t_id = heapq.heappop(decision_heapq)
            del t_id_decisions[smallest_t_id]
        # check top of result heapq and pop transcripts when parsing has gone
        # past the end
        result_heapq = result_heapqs[feature.seqid]
        while (len(result_heapq) > 0) and (feature.start > result_heapq[0][0]):
            result_start, result_t_id = heapq.heappop(result_heapq)
            del t_id_results[result_t_id]
        # parse transcript/exon features differently
        if feature.feature_type == "transcript":
            # parse results until this t_id is found (all results
            # must stay valid until past this chrom/start location)
            while t_id not in t_id_results:
                result = ClassificationResult.from_line(result_fh.next())
                # add to heapq to remove results that are no longer useful
                heapq.heappush(result_heapqs[result.chrom],
                               (result.start, result.t_id))
                # add to result dictionary
                t_id_results[result.t_id] = result
                # if current result position is beyond current transcript
                # position then we know that we are missing results for this
                # transcript and need to skip it
                if ((result.chrom != feature.seqid) or
                    (result.start > feature.start)):
                    break
            if t_id not in t_id_results:
                #logging.warning("Skipping: library_id=%s t_id=%s "
                #                "chrom=%s start=%d " %
                #                (feature.attrs[GTFAttr.LIBRARY_ID], t_id,
                #                 feature.seqid, feature.start))
                decision = SKIPPED
            else:
                # lookup classification result and ensure that transcript_id
                # attribute matches result id
                result = t_id_results[t_id]
                # lookup cutoff value for classification
                library_id = feature.attrs[GTFAttr.LIBRARY_ID]
                cutoff = cutoff_dict[library_id]
                feature.attrs["cutoff"] = cutoff
                is_expr = (result.pred >= cutoff)
                # retain certain results as transcript attributes
                for attr_name in GTF_ATTRS_TO_RETAIN:
                    feature.attrs[attr_name] = getattr(result, attr_name)
                # keep track of prediction decision and statistics.
                # remember decision in dict so that it can be
                # applied to the transcript exons as well
                if result.annotated:
                    if is_expr:
                        decision = ANN_EXPR
                    else:
                        decision = ANN_BKGD
                else:
                    if is_expr:
                        decision = UNANN_EXPR
                    else:
                        decision = UNANN_BKGD
            # push transcript end onto decision heap queue
            # (decision must stay valid until past the end)
            heapq.heappush(decision_heapq, (feature.end, t_id))
            # keep track of decision to apply it to exon features
            t_id_decisions[t_id] = decision
        else:
            decision = t_id_decisions[t_id]
        # keep track of stats
        decision_stats[decision] += 1
        # output to separate files
        out_fh = cinfo.decision_fh_dict[decision]
        print >>out_fh, str(feature)
    # cleanup
    gtf_fh.close()
    result_fh.close()
    for fh in cinfo.decision_fh_dict.itervalues():
        fh.close()
    return decision_stats
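# Note (added): run_filter assumes cinfo.output_gtf_file and
# cinfo.sorted_ctree_file are coordinate-sorted in the same chrom/start
# order; the heap-based eviction above is only correct when both streams
# advance monotonically, which is why stale decisions and results can be
# popped as soon as parsing moves past their positions.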
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    for x in BEDFeature.parse(open(bed_file)):
        f = GTFFeature()
        f.seqid = x.chrom
        f.source = source
        f.feature_type = 'transcript'
        f.start = x.tx_start
        f.end = x.tx_end
        f.score = x.score
        f.strand = x.strand
        f.phase = '.'
        f.attrs = {'transcript_id': x.name, 'gene_id': x.name}
        features = [f]
        for i, e in enumerate(x.exons):
            start, end = e
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'exon'
            f.start = start
            f.end = end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = dict(features[0].attrs)
            f.attrs["exon_number"] = i
            features.append(f)
        for f in features:
            print str(f)
def gtf_add_transcript_features(gtf_file, outfh):
    transcript_dict = collections.defaultdict(lambda: [])
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        transcript_dict[t_id].append(feature)
    # output reference transcripts
    for t_id, features in transcript_dict.iteritems():
        # sort features (exons) by start position
        features.sort(key=operator.attrgetter('start'))
        # transcript feature
        f = GTFFeature()
        f.seqid = features[0].seqid
        f.source = features[0].source
        f.feature_type = 'transcript'
        f.start = features[0].start
        f.end = features[-1].end
        f.score = features[0].score
        f.strand = features[0].strand
        f.phase = '.'
        f.attrs = features[0].attrs.copy()
        if "exon_number" in f.attrs:
            del f.attrs["exon_number"]
        #f.attrs[GTFAttr.REF] = '1'
        print >>outfh, str(f)
        # annotate exons as reference features
        for f in features:
            #f.attrs[GTFAttr.REF] = '1'
            print >>outfh, str(f)
def add_gtf_file(gtf_file, outfh, is_ref):
    refval = '1' if is_ref else '0'
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" % (chrom, len(exon_dict)))
        # output reference transcripts
        for t_id, features in exon_dict.iteritems():
            # sort features (exons) by start position
            features.sort(key=operator.attrgetter('start'))
            # annotate exons as reference features
            for f in features:
                f.attrs[GTFAttr.REF] = refval
                print >>outfh, str(f)
            # transcript feature: reuse an existing one or synthesize it
            # from the exon features
            if t_id in transcript_dict:
                f = transcript_dict[t_id]
            else:
                f = GTFFeature()
                f.seqid = features[0].seqid
                f.source = features[0].source
                f.feature_type = 'transcript'
                f.start = features[0].start
                f.end = features[-1].end
                f.score = features[0].score
                f.strand = features[0].strand
                f.phase = '.'
                f.attrs = features[0].attrs.copy()
                if "exon_number" in f.attrs:
                    del f.attrs["exon_number"]
            f.attrs[GTFAttr.REF] = refval
            print >>outfh, str(f)
def to_gtf_features(self, source=None):
    if source is None:
        source = 'source'
    # transcript feature
    f = GTFFeature()
    f.seqid = self.chrom
    f.source = source
    f.feature_type = 'transcript'
    f.start = self.start
    f.end = self.end
    f.score = 1000.0
    f.strand = self.strand
    f.phase = '.'
    f.attrs = self.attrs.copy()
    features = [f]
    # exon features
    for i, e in enumerate(self.exons):
        start, end = e
        f = GTFFeature()
        f.seqid = self.chrom
        f.source = source
        f.feature_type = 'exon'
        f.start = start
        f.end = end
        f.score = 1000.0
        f.strand = self.strand
        f.phase = '.'
        f.attrs = self.attrs.copy()
        f.attrs["exon_number"] = i
        features.append(f)
    return features
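# Hedged usage sketch (added, not in the original source): serialize a
# transcript object to GTF text. 'transcript' stands for an instance of the
# class that defines to_gtf_features above, and 'my_source' is an arbitrary
# example value for the GTF source column:
#
#     for f in transcript.to_gtf_features(source='my_source'):
#         print str(f)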
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    attr_dict = {'locus_id': locus_id,
                 'gene_id': gene_id,
                 'tss_id': tss_id,
                 'transcript_id': transcript_id}
    f = GTFFeature()
    f.seqid = chrom
    f.source = 'assemblyline'
    f.feature_type = 'transcript'
    f.start = tx_start
    f.end = tx_end
    f.score = 1000.0 * int(round(frac))
    f.strand = strand_str
    f.phase = '.'
    f.attrs = {'score': '%.3f' % score, 'frac': '%.3f' % frac}
    f.attrs.update(attr_dict)
    yield f
    for i, e in enumerate(exons):
        f = GTFFeature()
        f.seqid = chrom
        f.source = 'assemblyline'
        f.feature_type = 'exon'
        f.start = e.start
        f.end = e.end
        f.score = int(round(frac))
        f.strand = strand_str
        f.phase = '.'
        f.attrs = {'exon_number': i + 1}
        f.attrs.update(attr_dict)
        yield f
def get_cds_features(gtf_file):
    cds = collections.defaultdict(lambda: set())
    i = 0
    for f in GTFFeature.parse(open(gtf_file)):
        if f.feature_type == "CDS":
            cds[f.seqid].add((f.start, f.end, f.strand))
        i += 1
        if (i % 100000) == 0:
            logging.debug("Parsed %d features" % (i))
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # emit a transcript/exon pair for each unique CDS interval
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1, 'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
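# Hedged usage sketch (added, not in the original source): dump the synthetic
# CDS transcript/exon features to stdout. Assumes this module's existing
# imports (collections, logging, GTFFeature); 'ref.gtf' is a hypothetical
# reference GTF path.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    for f in get_cds_features('ref.gtf'):
        print str(f)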
'''
Created on Feb 13, 2013

@author: mkiyer
'''
import argparse

from assemblyline.lib.gtf import GTFFeature

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    lncrna_biotypes = set(['3prime_overlapping_ncrna',
                           'antisense',
                           'lincRNA',
                           'sense_intronic',
                           'sense_overlapping'])
    transcript_ids = set()
    for feature in GTFFeature.parse(open(args.gtf_file)):
        if feature.feature_type != "exon":
            continue
        biotype = feature.attrs["gene_biotype"]
        if biotype in lncrna_biotypes:
            transcript_ids.add(feature.attrs['transcript_id'])
    for t_id in sorted(transcript_ids):
        print t_id
def make_transcript_feature(exon_features):
    f = GTFFeature()
    f.seqid = exon_features[0].seqid
    f.source = exon_features[0].source
    f.feature_type = 'transcript'
    f.start = exon_features[0].start
    f.end = exon_features[-1].end
    f.score = exon_features[0].score
    f.strand = exon_features[0].strand
    f.phase = '.'
    f.attrs = exon_features[0].attrs.copy()
    if "exon_number" in f.attrs:
        del f.attrs["exon_number"]
    return f
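# Hedged usage sketch (added, not in the original source): build a synthetic
# 'transcript' line from a transcript's exon features. The exons must already
# be sorted by start position, as the callers above guarantee; 'exons' is a
# hypothetical list of GTFFeature exon records and operator is assumed
# imported as elsewhere in this codebase.
#
#     exons.sort(key=operator.attrgetter('start'))
#     tx = make_transcript_feature(exons)
#     print str(tx)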
'''
Created on Feb 23, 2013

@author: mkiyer
'''
import argparse

from assemblyline.lib.gtf import GTFFeature

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    transcript_ids = set()
    for feature in GTFFeature.parse(open(args.gtf_file)):
        if feature.feature_type != "exon":
            continue
        transcript_ids.add(feature.attrs['transcript_id'])
    for t_id in sorted(transcript_ids):
        print t_id