def to_gtf_features(self, source=None):
    """Build GTF features describing this transcript and its exons.

    source: value stored in every feature's ``source`` field; when None
        the literal string 'source' is used.

    Returns a list whose first element is a 'transcript' feature spanning
    self.start..self.end, followed by one 'exon' feature per entry of
    self.exons (each entry unpacked as a (start, end) pair), in order.
    Every feature carries its own copy of self.attrs; exon features also
    get an "exon_number" attribute holding the zero-based position of the
    exon within self.exons.
    """
    if source is None:
        source = 'source'

    def _new_feature(feature_type, start, end):
        # all features share chrom, strand, score, phase and a private
        # copy of the transcript attributes
        feat = GTFFeature()
        feat.seqid = self.chrom
        feat.source = source
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = 1000.0
        feat.strand = self.strand
        feat.phase = '.'
        feat.attrs = self.attrs.copy()
        return feat

    # the transcript feature always comes first
    features = [_new_feature('transcript', self.start, self.end)]
    # then one feature per exon
    for exon_index, (exon_start, exon_end) in enumerate(self.exons):
        exon_feature = _new_feature('exon', exon_start, exon_end)
        exon_feature.attrs["exon_number"] = exon_index
        features.append(exon_feature)
    return features
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Generate GTF features for one assembled transcript.

    Yields a single 'transcript' feature followed by one 'exon' feature
    per element of ``exons``. All features use source 'assemblyline' and
    share the locus_id / gene_id / tss_id / transcript_id attributes.

    chrom: stored as the seqid of every feature.
    strand: converted to its string form with strand_int_to_str().
    exons: non-empty sequence of objects exposing ``start`` and ``end``;
        the first element's start and the last element's end become the
        transcript bounds (presumably sorted by position -- confirm with
        callers).
    score, frac: written to the transcript's 'score' and 'frac'
        attributes formatted as '%.3f'.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    shared_attrs = {'locus_id': locus_id,
                    'gene_id': gene_id,
                    'tss_id': tss_id,
                    'transcript_id': transcript_id}

    # transcript feature
    transcript = GTFFeature()
    transcript.seqid = chrom
    transcript.source = 'assemblyline'
    transcript.feature_type = 'transcript'
    transcript.start = tx_start
    transcript.end = tx_end
    # NOTE(review): frac is rounded to an integer *before* scaling by
    # 1000.0 -- confirm this is the intended score
    transcript.score = 1000.0 * int(round(frac))
    transcript.strand = strand_str
    transcript.phase = '.'
    transcript.attrs = {'score': '%.3f' % score, 'frac': '%.3f' % frac}
    transcript.attrs.update(shared_attrs)
    yield transcript

    # exon features, numbered starting from 1
    for exon_number, exon in enumerate(exons, 1):
        exon_feature = GTFFeature()
        exon_feature.seqid = chrom
        exon_feature.source = 'assemblyline'
        exon_feature.feature_type = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = int(round(frac))
        exon_feature.strand = strand_str
        exon_feature.phase = '.'
        exon_feature.attrs = {'exon_number': exon_number}
        exon_feature.attrs.update(shared_attrs)
        yield exon_feature
def to_gtf_features(self, source=None):
    """Return this transcript as a list of GTF features.

    source: value assigned to each feature's ``source`` field; defaults
        to the literal string 'source' when None.

    The returned list starts with one 'transcript' feature covering
    self.start..self.end and continues with one 'exon' feature for each
    (start, end) pair in self.exons, in order. Each feature holds an
    independent copy of self.attrs; exon features additionally store the
    exon's zero-based index under the "exon_number" attribute.
    """
    if source is None:
        source = 'source'

    def _make(feature_type, start, end):
        # fields common to the transcript feature and every exon feature
        feature = GTFFeature()
        feature.seqid = self.chrom
        feature.source = source
        feature.feature_type = feature_type
        feature.start = start
        feature.end = end
        feature.score = 1000.0
        feature.strand = self.strand
        feature.phase = '.'
        feature.attrs = self.attrs.copy()
        return feature

    # transcript feature first
    features = [_make('transcript', self.start, self.end)]
    # followed by the exon features
    for index, (exon_start, exon_end) in enumerate(self.exons):
        exon = _make('exon', exon_start, exon_end)
        exon.attrs["exon_number"] = index
        features.append(exon)
    return features
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Yield the GTF 'transcript' feature and 'exon' features of a transcript.

    The first item yielded is the transcript feature; it spans from the
    start of exons[0] to the end of exons[-1] and carries "score" and
    "frac" attributes formatted with "%.3f". It is followed by one exon
    feature per element of ``exons`` with a one-based "exon_number".
    Every feature has source "assemblyline" and the shared locus_id,
    gene_id, tss_id and transcript_id attributes.

    exons must be non-empty and its elements must expose ``start`` and
    ``end`` (presumably ordered by position -- verify against callers).
    strand is translated with strand_int_to_str().
    """
    first_exon, last_exon = exons[0], exons[-1]
    tx_start = first_exon.start
    tx_end = last_exon.end
    strand_str = strand_int_to_str(strand)
    common_attrs = {"locus_id": locus_id,
                    "gene_id": gene_id,
                    "tss_id": tss_id,
                    "transcript_id": transcript_id}

    # transcript-level record
    tx_feature = GTFFeature()
    tx_feature.seqid = chrom
    tx_feature.source = "assemblyline"
    tx_feature.feature_type = "transcript"
    tx_feature.start = tx_start
    tx_feature.end = tx_end
    # NOTE(review): rounding happens before multiplying by 1000.0 --
    # confirm that is the intended scaling
    tx_feature.score = 1000.0 * int(round(frac))
    tx_feature.strand = strand_str
    tx_feature.phase = "."
    tx_feature.attrs = {"score": "%.3f" % score, "frac": "%.3f" % frac}
    tx_feature.attrs.update(common_attrs)
    yield tx_feature

    # exon-level records
    for position, exon in enumerate(exons):
        record = GTFFeature()
        record.seqid = chrom
        record.source = "assemblyline"
        record.feature_type = "exon"
        record.start = exon.start
        record.end = exon.end
        record.score = int(round(frac))
        record.strand = strand_str
        record.phase = "."
        record.attrs = {"exon_number": position + 1}
        record.attrs.update(common_attrs)
        yield record
def get_cds_features(gtf_file):
    """Yield synthetic transcript/exon features for each distinct CDS interval.

    gtf_file: path of the GTF file to read.

    The file is parsed completely first, collecting the unique
    (start, end, strand) tuples of all features whose type is "CDS",
    grouped by seqid. Then, in sorted order of seqid and interval, two
    features are yielded per interval -- a 'transcript' and an 'exon' --
    both with source 'cds', score 0 and the same generated transcript_id
    ('CDS' followed by a zero-padded 8-digit counter starting at 1).
    """
    cds = collections.defaultdict(set)
    num_parsed = 0
    # open the file in a context manager so the handle is closed as soon as
    # parsing finishes (or fails) instead of being leaked
    with open(gtf_file) as fileh:
        for f in GTFFeature.parse(fileh):
            if f.feature_type == "CDS":
                cds[f.seqid].add((f.start, f.end, f.strand))
            num_parsed += 1
            if (num_parsed % 100000) == 0:
                logging.debug("Parsed %d features", num_parsed)
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # the transcript and its single exon share one transcript_id
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1,
                           'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
def add_gtf_file(gtf_file, outfh, is_ref):
    """Write the features of a GTF file to outfh, tagged with a reference flag.

    gtf_file: passed to _parse_gtf_by_chrom(), which yields
        (chrom, transcript_dict, exon_dict) triples.
    outfh: file-like object the features are printed to, one per line.
    is_ref: when true every feature's GTFAttr.REF attribute is set to
        '1', otherwise to '0'.

    For each transcript id in exon_dict the feature list is sorted in
    place by start and printed, then the transcript feature is printed.
    If transcript_dict has no entry for the id, a transcript feature is
    synthesized from the first feature in the sorted list (with the end
    taken from the last one and any "exon_number" attribute removed).
    """
    if is_ref:
        refval = '1'
    else:
        refval = '0'
    by_start = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" %
                      (chrom, len(exon_dict)))
        for t_id, exons in exon_dict.iteritems():
            # order the exons by start coordinate
            exons.sort(key=by_start)
            # flag and emit each exon
            for exon in exons:
                exon.attrs[GTFAttr.REF] = refval
                print >>outfh, str(exon)
            # use the parsed transcript feature if one exists, otherwise
            # derive one from the exons
            if t_id not in transcript_dict:
                first_exon = exons[0]
                transcript = GTFFeature()
                transcript.seqid = first_exon.seqid
                transcript.source = first_exon.source
                transcript.feature_type = 'transcript'
                transcript.start = first_exon.start
                transcript.end = exons[-1].end
                transcript.score = first_exon.score
                transcript.strand = first_exon.strand
                transcript.phase = '.'
                transcript.attrs = first_exon.attrs.copy()
                transcript.attrs.pop("exon_number", None)
            else:
                transcript = transcript_dict[t_id]
            transcript.attrs[GTFAttr.REF] = refval
            print >>outfh, str(transcript)
def gtf_add_transcript_features(gtf_file, outfh):
    """Rewrite a GTF file's exons with a synthesized transcript feature each.

    gtf_file: path of the GTF file to read; only features of type "exon"
        are kept, grouped by their GTFAttr.TRANSCRIPT_ID attribute.
    outfh: file-like object the output is printed to, one feature per
        line.

    For every transcript id, the exons are sorted by start, a 'transcript'
    feature is built from them (fields copied from the first exon, end
    taken from the last exon, "exon_number" attribute dropped) and
    printed, followed by the exons themselves.
    """
    transcript_dict = collections.defaultdict(list)
    # use a context manager so the input handle is closed once parsing
    # finishes (or fails) rather than leaked
    with open(gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            transcript_dict[t_id].append(feature)
    # output each transcript followed by its exons
    for t_id, features in transcript_dict.iteritems():
        # sort features (exons) by start position
        features.sort(key=operator.attrgetter('start'))
        # transcript feature derived from the sorted exons
        f = GTFFeature()
        f.seqid = features[0].seqid
        f.source = features[0].source
        f.feature_type = 'transcript'
        f.start = features[0].start
        f.end = features[-1].end
        f.score = features[0].score
        f.strand = features[0].strand
        f.phase = '.'
        f.attrs = features[0].attrs.copy()
        if "exon_number" in f.attrs:
            del f.attrs["exon_number"]
        print >>outfh, str(f)
        # exon features
        for f in features:
            print >>outfh, str(f)
def add_gtf_file(gtf_file, outfh, is_ref, sample_id=None):
    """Print the features of a GTF file to outfh with reference/sample tags.

    gtf_file: handed to _parse_gtf_by_chrom(), which yields
        (chrom, transcript_dict, exon_dict) triples.
    outfh: file-like object receiving one printed feature per line.
    is_ref: selects the GTFAttr.REF attribute value written to every
        feature: '1' when true, '0' otherwise.
    sample_id: when not None, stored under GTFAttr.SAMPLE_ID on each
        transcript feature (the exon features are not given it).

    Per transcript id in exon_dict: the feature list is sorted in place
    by start and printed, then the transcript feature is printed. When
    transcript_dict lacks the id, the transcript feature is built from
    the first sorted feature, with its end taken from the last one and
    any "exon_number" attribute removed.
    """
    if is_ref:
        refval = '1'
    else:
        refval = '0'
    start_of = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" %
                      (chrom, len(exon_dict)))
        for t_id, exon_list in exon_dict.iteritems():
            # exons in ascending start order
            exon_list.sort(key=start_of)
            # tag and write the exons
            for exon in exon_list:
                exon.attrs[GTFAttr.REF] = refval
                print >>outfh, str(exon)
            # pick the parsed transcript feature, or synthesize one
            if t_id not in transcript_dict:
                template = exon_list[0]
                tx = GTFFeature()
                tx.seqid = template.seqid
                tx.source = template.source
                tx.feature_type = 'transcript'
                tx.start = template.start
                tx.end = exon_list[-1].end
                tx.score = template.score
                tx.strand = template.strand
                tx.phase = '.'
                tx.attrs = template.attrs.copy()
                tx.attrs.pop("exon_number", None)
            else:
                tx = transcript_dict[t_id]
            tx.attrs[GTFAttr.REF] = refval
            if sample_id is not None:
                tx.attrs[GTFAttr.SAMPLE_ID] = sample_id
            print >>outfh, str(tx)
def get_cds_features(gtf_file):
    """Generate one transcript and one exon feature per unique CDS interval.

    gtf_file: path of the GTF file to read.

    All features of the file are parsed up front; for those whose type is
    "CDS" the (start, end, strand) tuple is stored in a per-seqid set, so
    duplicate intervals collapse. Afterwards, iterating seqids and
    intervals in sorted order, a 'transcript' feature and an 'exon'
    feature are yielded for each interval. Both have source 'cds',
    score 0, and share a transcript_id of the form 'CDS%08d' numbered
    from 1.
    """
    cds = collections.defaultdict(set)
    feature_count = 0
    # the context manager guarantees the input handle is closed after
    # parsing; previously it was opened inline and never closed
    with open(gtf_file) as fileh:
        for f in GTFFeature.parse(fileh):
            if f.feature_type == "CDS":
                cds[f.seqid].add((f.start, f.end, f.strand))
            feature_count += 1
            if (feature_count % 100000) == 0:
                logging.debug("Parsed %d features", feature_count)
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # emit the interval twice: once as transcript, once as exon
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1,
                           'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
def main():
    """Convert a BED file to GTF and print the features to standard output.

    Command line: a positional ``bed_file`` path and an optional
    ``--source`` value (default 'bed_to_gtf') stored in the source field
    of every feature.

    For each BED record a 'transcript' feature is printed, followed by one
    'exon' feature per (start, end) pair in the record's exons. The
    record name is used as both transcript_id and gene_id, and exon
    features carry a zero-based "exon_number" attribute.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    # open the input in a context manager so the handle is closed even if
    # parsing or printing raises; it used to be opened inline and leaked
    with open(bed_file) as fileh:
        for x in BEDFeature.parse(fileh):
            # transcript feature
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'transcript'
            f.start = x.tx_start
            f.end = x.tx_end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = {'transcript_id': x.name, 'gene_id': x.name}
            features = [f]
            # exon features inherit a copy of the transcript attributes
            for i, e in enumerate(x.exons):
                start, end = e
                f = GTFFeature()
                f.seqid = x.chrom
                f.source = source
                f.feature_type = 'exon'
                f.start = start
                f.end = end
                f.score = x.score
                f.strand = x.strand
                f.phase = '.'
                f.attrs = dict(features[0].attrs)
                f.attrs["exon_number"] = i
                features.append(f)
            for f in features:
                print(str(f))
def main():
    """Read a BED file and print the equivalent GTF features to stdout.

    Arguments are taken from the command line: the positional ``bed_file``
    path and the optional ``--source`` string (default 'bed_to_gtf') that
    is written to the source field of each feature.

    Every BED record produces a 'transcript' feature and then one 'exon'
    feature for each (start, end) pair in its exons. transcript_id and
    gene_id are both set to the record name; exon features additionally
    get a zero-based "exon_number" attribute.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    # the context manager closes the BED file on completion or error;
    # the inline open() used before never closed it
    with open(bed_file) as fileh:
        for x in BEDFeature.parse(fileh):
            # transcript feature
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'transcript'
            f.start = x.tx_start
            f.end = x.tx_end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = {'transcript_id': x.name, 'gene_id': x.name}
            features = [f]
            # exon features start from a copy of the transcript attributes
            for i, e in enumerate(x.exons):
                start, end = e
                f = GTFFeature()
                f.seqid = x.chrom
                f.source = source
                f.feature_type = 'exon'
                f.start = start
                f.end = end
                f.score = x.score
                f.strand = x.strand
                f.phase = '.'
                f.attrs = dict(features[0].attrs)
                f.attrs["exon_number"] = i
                features.append(f)
            for f in features:
                print(str(f))
def make_transcript_feature(exon_features):
    """Build a 'transcript' feature from a transcript's exon features.

    exon_features: non-empty sequence of features; the first element
        supplies seqid, source, start, score, strand and the attributes,
        and the last element supplies the end (so the sequence is
        presumably sorted by position -- confirm with callers).

    Returns a new GTFFeature with phase '.' and a copy of the first
    exon's attributes from which any "exon_number" entry is removed.
    """
    transcript = GTFFeature()
    first_exon, last_exon = exon_features[0], exon_features[-1]
    transcript.seqid = first_exon.seqid
    transcript.source = first_exon.source
    transcript.feature_type = 'transcript'
    transcript.start = first_exon.start
    transcript.end = last_exon.end
    transcript.score = first_exon.score
    transcript.strand = first_exon.strand
    transcript.phase = '.'
    # work on a copy so the exon keeps its own exon_number
    transcript.attrs = first_exon.attrs.copy()
    transcript.attrs.pop("exon_number", None)
    return transcript