Пример #1
0
 def to_gtf_features(self, source=None):
     """Return this transcript as a list of GTFFeature objects.

     The first element of the returned list is a 'transcript' feature
     spanning self.start..self.end; it is followed by one 'exon' feature
     per (start, end) pair in self.exons, in the order they are stored.

     source -- value for the GTF source field; 'source' is used when
               None is given.

     Every feature gets a score of 1000.0, phase '.', and its own copy
     of self.attrs.  Exon features additionally carry an "exon_number"
     attribute holding the zero-based index of the exon in self.exons.
     """
     src = 'source' if source is None else source

     def _new_feature(feature_type, start, end):
         # fields shared by the transcript feature and all exon features
         feat = GTFFeature()
         feat.seqid = self.chrom
         feat.source = src
         feat.feature_type = feature_type
         feat.start = start
         feat.end = end
         feat.score = 1000.0
         feat.strand = self.strand
         feat.phase = '.'
         # copy so that per-feature attributes do not leak into self.attrs
         feat.attrs = self.attrs.copy()
         return feat

     result = [_new_feature('transcript', self.start, self.end)]
     for exon_index, (exon_start, exon_end) in enumerate(self.exons):
         exon = _new_feature('exon', exon_start, exon_end)
         exon.attrs["exon_number"] = exon_index
         result.append(exon)
     return result
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Generate GTF features for one assembled transcript.

    Yields a single 'transcript' GTFFeature followed by one 'exon'
    GTFFeature per element of exons (in the given order).

    chrom         -- value for the seqid field
    strand        -- strand value, converted with strand_int_to_str()
    exons         -- non-empty sequence of objects with .start and .end;
                     the transcript spans exons[0].start..exons[-1].end
    locus_id, gene_id, tss_id, transcript_id
                  -- identifiers copied into the attributes of every
                     feature
    score, frac   -- written to the transcript attributes formatted
                     with '%.3f'; frac also determines the score column

    NOTE(review): frac is rounded to an integer before it is used, so
    the transcript score is 1000.0 * int(round(frac)) while the exon
    score is int(round(frac)) without the 1000.0 factor.  Confirm that
    this difference is intended.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    shared_attrs = {'locus_id': locus_id,
                    'gene_id': gene_id,
                    'tss_id': tss_id,
                    'transcript_id': transcript_id}
    rounded_frac = int(round(frac))

    def _feature(feature_type, start, end, feature_score, attrs):
        # fill in the fields common to transcript and exon records
        feat = GTFFeature()
        feat.seqid = chrom
        feat.source = 'assemblyline'
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = feature_score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        feat.attrs.update(shared_attrs)
        return feat

    # transcript record first
    yield _feature('transcript', tx_start, tx_end,
                   1000.0 * rounded_frac,
                   {'score': '%.3f' % score,
                    'frac': '%.3f' % frac})
    # then the exon records, numbered starting at 1
    for exon_number, exon in enumerate(exons, 1):
        yield _feature('exon', exon.start, exon.end,
                       rounded_frac,
                       {'exon_number': exon_number})
Пример #3
0
 def to_gtf_features(self, source=None):
     """Convert this transcript into GTFFeature records.

     Returns a list whose first item is the 'transcript' feature
     (self.start..self.end) and whose remaining items are 'exon'
     features, one for each (start, end) pair of self.exons.

     source -- text for the GTF source column; defaults to 'source'
               when None is passed.

     All features share chrom, strand, a score of 1000.0 and phase '.',
     and each holds an independent copy of self.attrs.  Exons also get
     an "exon_number" attribute equal to their zero-based position in
     self.exons.
     """
     if source is None:
         source = 'source'

     def _make(kind, begin, finish):
         # build one record with the fields common to all features
         record = GTFFeature()
         record.seqid = self.chrom
         record.source = source
         record.feature_type = kind
         record.start = begin
         record.end = finish
         record.score = 1000.0
         record.strand = self.strand
         record.phase = '.'
         # separate copy per record; exon_number must not be shared
         record.attrs = self.attrs.copy()
         return record

     features = [_make('transcript', self.start, self.end)]
     for position, (begin, finish) in enumerate(self.exons):
         exon_record = _make('exon', begin, finish)
         exon_record.attrs["exon_number"] = position
         features.append(exon_record)
     return features
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id, transcript_id, score, frac):
    """Generate GTF features for one assembled transcript.

    Yields a single "transcript" GTFFeature followed by one "exon"
    GTFFeature per element of exons (in the given order).

    chrom         -- value for the seqid field
    strand        -- strand value, converted with strand_int_to_str()
    exons         -- non-empty sequence of objects with .start and .end;
                     the transcript spans exons[0].start..exons[-1].end
    locus_id, gene_id, tss_id, transcript_id
                  -- identifiers copied into the attributes of every
                     feature
    score, frac   -- written to the transcript attributes formatted
                     with "%.3f"; frac also determines the score column

    NOTE(review): frac is rounded to an integer before it is used, so
    the transcript score is 1000.0 * int(round(frac)) while the exon
    score is int(round(frac)) without the 1000.0 factor.  Confirm that
    this difference is intended.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    shared_attrs = {"locus_id": locus_id, "gene_id": gene_id, "tss_id": tss_id, "transcript_id": transcript_id}
    rounded_frac = int(round(frac))

    def _feature(feature_type, start, end, feature_score, attrs):
        # fill in the fields common to transcript and exon records
        feat = GTFFeature()
        feat.seqid = chrom
        feat.source = "assemblyline"
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = feature_score
        feat.strand = strand_str
        feat.phase = "."
        feat.attrs = attrs
        feat.attrs.update(shared_attrs)
        return feat

    # transcript record first
    transcript_attrs = {"score": "%.3f" % score, "frac": "%.3f" % frac}
    yield _feature("transcript", tx_start, tx_end, 1000.0 * rounded_frac, transcript_attrs)
    # then the exon records, numbered starting at 1
    for exon_number, exon in enumerate(exons, 1):
        yield _feature("exon", exon.start, exon.end, rounded_frac, {"exon_number": exon_number})
Пример #5
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Yield the GTF records describing a single transcript.

    The first record yielded is the 'transcript' feature; it is
    followed by an 'exon' feature for each entry of exons, in order.

    chrom         -- value for the seqid field
    strand        -- strand value, passed through strand_int_to_str()
    exons         -- non-empty sequence of objects exposing .start and
                     .end; exons[0].start and exons[-1].end bound the
                     transcript
    locus_id, gene_id, tss_id, transcript_id
                  -- identifiers added to the attributes of every record
    score, frac   -- stored as '%.3f' strings in the transcript
                     attributes; frac also drives the score column

    NOTE(review): the score column uses int(round(frac)), multiplied by
    1000.0 for the transcript record but not for the exon records.
    Confirm that this difference is intended.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    common_attrs = {'locus_id': locus_id,
                    'gene_id': gene_id,
                    'tss_id': tss_id,
                    'transcript_id': transcript_id}
    frac_as_int = int(round(frac))

    def _record(kind, begin, finish, column_score, own_attrs):
        # populate the columns shared by every record of this transcript
        rec = GTFFeature()
        rec.seqid = chrom
        rec.source = 'assemblyline'
        rec.feature_type = kind
        rec.start = begin
        rec.end = finish
        rec.score = column_score
        rec.strand = strand_str
        rec.phase = '.'
        rec.attrs = own_attrs
        rec.attrs.update(common_attrs)
        return rec

    yield _record('transcript', tx_start, tx_end,
                  1000.0 * frac_as_int,
                  {'score': '%.3f' % score,
                   'frac': '%.3f' % frac})
    # exon_number counts from 1
    for number, exon in enumerate(exons, 1):
        yield _record('exon', exon.start, exon.end,
                      frac_as_int,
                      {'exon_number': number})
def get_cds_features(gtf_file):
    """Yield 'transcript' and 'exon' features for every distinct CDS interval.

    gtf_file -- path of the GTF file to read.

    The file is parsed with GTFFeature.parse(); for each feature whose
    feature_type is "CDS" the tuple (start, end, strand) is stored in a
    per-seqid set, so repeated intervals are reported only once.  After
    the whole file has been read, the intervals are visited in sorted
    order (by seqid, then by tuple) and for each one a 'transcript'
    feature and an 'exon' feature are yielded.  Both have source 'cds',
    score 0, phase '.', and the attributes 'cds' = 1 and a shared
    'transcript_id' of the form 'CDS%08d', numbered from 1.
    """
    cds = collections.defaultdict(set)
    # Open the file in a with-block so the handle is closed as soon as
    # parsing finishes (or fails) instead of being left to the garbage
    # collector.  Parsing completes before the first feature is yielded.
    with open(gtf_file) as gtf_fh:
        for i, f in enumerate(GTFFeature.parse(gtf_fh), 1):
            if f.feature_type == "CDS":
                cds[f.seqid].add((f.start, f.end, f.strand))
            # progress message every 100000 parsed features
            if (i % 100000) == 0:
                logging.debug("Parsed %d features", i)
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # one transcript record and one exon record per CDS interval,
            # both carrying the same transcript_id
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1, 'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
Пример #7
0
def add_gtf_file(gtf_file, outfh, is_ref):
    """Write the exons and transcripts of a GTF file to outfh with a reference flag.

    gtf_file -- passed to _parse_gtf_by_chrom(), which yields
                (chrom, transcript_dict, exon_dict) tuples
    outfh    -- file-like object the features are printed to
    is_ref   -- truthy to tag features with GTFAttr.REF = '1',
                otherwise '0'

    For each transcript id in exon_dict the exon features are sorted in
    place by start, tagged and printed; then the transcript feature is
    tagged and printed.  If transcript_dict has no entry for the id, a
    transcript feature is built from the exons instead.
    """
    if is_ref:
        refval = '1'
    else:
        refval = '0'
    by_start = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" %
                      (chrom, len(exon_dict)))
        for t_id, exons in exon_dict.iteritems():
            # exons are emitted in order of start position
            exons.sort(key=by_start)
            for exon in exons:
                exon.attrs[GTFAttr.REF] = refval
                print >> outfh, str(exon)
            if t_id not in transcript_dict:
                # no transcript record in the input: derive one that
                # spans from the first exon's start to the last exon's end
                first_exon = exons[0]
                tx = GTFFeature()
                tx.seqid = first_exon.seqid
                tx.source = first_exon.source
                tx.feature_type = 'transcript'
                tx.start = first_exon.start
                tx.end = exons[-1].end
                tx.score = first_exon.score
                tx.strand = first_exon.strand
                tx.phase = '.'
                # exon attributes are reused, minus the exon-only key
                tx.attrs = first_exon.attrs.copy()
                tx.attrs.pop("exon_number", None)
            else:
                tx = transcript_dict[t_id]
            tx.attrs[GTFAttr.REF] = refval
            print >> outfh, str(tx)
def gtf_add_transcript_features(gtf_file, outfh):
    """Write the exons of a GTF file to outfh, adding a transcript feature for each.

    gtf_file -- path of the GTF file to read
    outfh    -- file-like object the features are printed to

    Only features whose feature_type is "exon" are kept; they are
    grouped by their GTFAttr.TRANSCRIPT_ID attribute.  For every group
    the exons are sorted by start, a 'transcript' feature is built from
    them (start of the first exon, end of the last exon, remaining
    fields and attributes copied from the first exon without
    "exon_number") and printed, followed by the exons themselves.
    """
    transcript_dict = collections.defaultdict(list)
    # Open the file in a with-block so the handle is closed once parsing
    # is finished (or fails) rather than being leaked.
    with open(gtf_file) as gtf_fh:
        for feature in GTFFeature.parse(gtf_fh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            transcript_dict[t_id].append(feature)
    # output transcripts, each followed by its exons
    for features in transcript_dict.itervalues():
        # sort features (exons) by start position
        features.sort(key=operator.attrgetter('start'))
        # transcript feature
        f = GTFFeature()
        f.seqid = features[0].seqid
        f.source = features[0].source
        f.feature_type = 'transcript'
        f.start = features[0].start
        f.end = features[-1].end
        f.score = features[0].score
        f.strand = features[0].strand
        f.phase = '.'
        f.attrs = features[0].attrs.copy()
        # "exon_number" only makes sense on exon records
        f.attrs.pop("exon_number", None)
        print >>outfh, str(f)
        # exon features
        for f in features:
            print >>outfh, str(f)
Пример #9
0
def add_gtf_file(gtf_file, outfh, is_ref, sample_id=None):
    """Write the exons and transcripts of a GTF file to outfh with extra tags.

    gtf_file  -- passed to _parse_gtf_by_chrom(), which yields
                 (chrom, transcript_dict, exon_dict) tuples
    outfh     -- file-like object the features are printed to
    is_ref    -- truthy to tag features with GTFAttr.REF = '1',
                 otherwise '0'
    sample_id -- when not None, stored as GTFAttr.SAMPLE_ID on each
                 transcript feature (exon features are not tagged)

    For each transcript id in exon_dict the exon features are sorted in
    place by start, tagged with the reference flag and printed; then the
    transcript feature is tagged and printed.  If transcript_dict has no
    entry for the id, a transcript feature is built from the exons.
    """
    if is_ref:
        refval = '1'
    else:
        refval = '0'
    by_start = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" % (chrom, len(exon_dict)))
        for t_id, exons in exon_dict.iteritems():
            # exons are emitted in order of start position
            exons.sort(key=by_start)
            for exon in exons:
                exon.attrs[GTFAttr.REF] = refval
                print >>outfh, str(exon)
            if t_id not in transcript_dict:
                # no transcript record in the input: derive one that
                # spans from the first exon's start to the last exon's end
                first_exon = exons[0]
                tx = GTFFeature()
                tx.seqid = first_exon.seqid
                tx.source = first_exon.source
                tx.feature_type = 'transcript'
                tx.start = first_exon.start
                tx.end = exons[-1].end
                tx.score = first_exon.score
                tx.strand = first_exon.strand
                tx.phase = '.'
                # exon attributes are reused, minus the exon-only key
                tx.attrs = first_exon.attrs.copy()
                tx.attrs.pop("exon_number", None)
            else:
                tx = transcript_dict[t_id]
            tx.attrs[GTFAttr.REF] = refval
            if sample_id is not None:
                tx.attrs[GTFAttr.SAMPLE_ID] = sample_id
            print >>outfh, str(tx)
def get_cds_features(gtf_file):
    """Yield 'transcript' and 'exon' features for every distinct CDS interval.

    gtf_file -- path of the GTF file to read.

    The file is parsed with GTFFeature.parse(); for each feature whose
    feature_type is "CDS" the tuple (start, end, strand) is stored in a
    per-seqid set, so repeated intervals are reported only once.  After
    the whole file has been read, the intervals are visited in sorted
    order (by seqid, then by tuple) and for each one a 'transcript'
    feature and an 'exon' feature are yielded.  Both have source 'cds',
    score 0, phase '.', and the attributes 'cds' = 1 and a shared
    'transcript_id' of the form 'CDS%08d', numbered from 1.
    """
    cds = collections.defaultdict(set)
    # Open the file in a with-block so the handle is closed as soon as
    # parsing finishes (or fails) instead of being left to the garbage
    # collector.  Parsing completes before the first feature is yielded.
    with open(gtf_file) as gtf_fh:
        for i, f in enumerate(GTFFeature.parse(gtf_fh), 1):
            if f.feature_type == "CDS":
                cds[f.seqid].add((f.start, f.end, f.strand))
            # progress message every 100000 parsed features
            if (i % 100000) == 0:
                logging.debug("Parsed %d features", i)
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # one transcript record and one exon record per CDS interval,
            # both carrying the same transcript_id
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1,
                           'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
Пример #11
0
def main():
    """Convert a BED file to GTF and print the result to standard output.

    Command line: [--source SOURCE] bed_file
      --source  -- text for the GTF source column (default 'bed_to_gtf')
      bed_file  -- path of the BED file, parsed with BEDFeature.parse()

    For each BED record a 'transcript' feature (tx_start..tx_end) is
    printed, followed by one 'exon' feature per (start, end) pair in the
    record's exons.  The record name is used as both transcript_id and
    gene_id; exon features additionally get an "exon_number" attribute
    holding the zero-based exon index.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    # Open the input in a with-block so the file handle is closed when
    # the conversion ends or raises, instead of being leaked.
    with open(bed_file) as bed_fh:
        for x in BEDFeature.parse(bed_fh):
            # transcript feature
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'transcript'
            f.start = x.tx_start
            f.end = x.tx_end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = {'transcript_id': x.name,
                       'gene_id': x.name}
            features = [f]
            # exon features
            for i, e in enumerate(x.exons):
                start, end = e
                f = GTFFeature()
                f.seqid = x.chrom
                f.source = source
                f.feature_type = 'exon'
                f.start = start
                f.end = end
                f.score = x.score
                f.strand = x.strand
                f.phase = '.'
                # start from a copy of the transcript attributes
                f.attrs = dict(features[0].attrs)
                f.attrs["exon_number"] = i
                features.append(f)
            for f in features:
                # single parenthesised argument: prints the same text
                # under the Python 2 print statement and under print()
                print(str(f))
Пример #12
0
def main():
    """Convert a BED file to GTF and print the result to standard output.

    Command line: [--source SOURCE] bed_file
      --source  -- text for the GTF source column (default 'bed_to_gtf')
      bed_file  -- path of the BED file, parsed with BEDFeature.parse()

    For each BED record a 'transcript' feature (tx_start..tx_end) is
    printed, followed by one 'exon' feature per (start, end) pair in the
    record's exons.  The record name is used as both transcript_id and
    gene_id; exon features additionally get an "exon_number" attribute
    holding the zero-based exon index.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    # Open the input in a with-block so the file handle is closed when
    # the conversion ends or raises, instead of being leaked.
    with open(bed_file) as bed_fh:
        for x in BEDFeature.parse(bed_fh):
            # transcript feature
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'transcript'
            f.start = x.tx_start
            f.end = x.tx_end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = {'transcript_id': x.name, 'gene_id': x.name}
            features = [f]
            # exon features
            for i, e in enumerate(x.exons):
                start, end = e
                f = GTFFeature()
                f.seqid = x.chrom
                f.source = source
                f.feature_type = 'exon'
                f.start = start
                f.end = end
                f.score = x.score
                f.strand = x.strand
                f.phase = '.'
                # start from a copy of the transcript attributes
                f.attrs = dict(features[0].attrs)
                f.attrs["exon_number"] = i
                features.append(f)
            for f in features:
                # single parenthesised argument: prints the same text
                # under the Python 2 print statement and under print()
                print(str(f))
def make_transcript_feature(exon_features):
    """Build a 'transcript' GTFFeature from the exon features of one transcript.

    exon_features -- non-empty sequence of exon features; the caller is
                     expected to pass them in the order that makes
                     exon_features[0].start the transcript start and
                     exon_features[-1].end the transcript end.

    seqid, source, score and strand are taken from the first exon, phase
    is '.', and the attributes are a copy of the first exon's attributes
    with "exon_number" removed if present.  Returns the new feature.
    """
    transcript = GTFFeature()
    first_exon = exon_features[0]
    transcript.seqid = first_exon.seqid
    transcript.source = first_exon.source
    transcript.feature_type = 'transcript'
    transcript.start = first_exon.start
    transcript.end = exon_features[-1].end
    transcript.score = first_exon.score
    transcript.strand = first_exon.strand
    transcript.phase = '.'
    # copy so the exon's own attribute dict is left untouched
    attrs = first_exon.attrs.copy()
    attrs.pop("exon_number", None)
    transcript.attrs = attrs
    return transcript