def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Build Transcript objects from an iterable of GTF lines.

    Every line is parsed with ``GTFFeature.from_string``.  The first feature
    seen for a given ``transcript_id`` must be a ``transcript`` record; it
    supplies chrom, start, end, strand and attrs for a new Transcript.
    ``exon`` features are appended to the exon list of the transcript with
    the same id; all other feature types are ignored after the lookup.

    NOTE(review): a function with this same name appears to be defined again
    later in this file; the later definition is the one left bound.

    Args:
        lines: iterable of GTF-formatted strings.
        attr_defs: forwarded unchanged to ``GTFFeature.from_string``.

    Returns:
        A list of Transcript objects in order of first appearance, each with
        its exons sorted (by genomic position, assuming Exon orders that way).

    Raises:
        GTFError: if the first feature seen for a transcript id is not a
            ``transcript`` record.
        KeyError: presumably, when a feature has no ``transcript_id``
            attribute (the lookup below is unguarded).
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError(
                    "Feature type '%s' found before 'transcript' record: %s"
                    % (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # values() (not the Python-2-only itervalues()) works on both 2.7 and 3
    for t in transcripts.values():
        t.exons.sort()
    # materialize so callers always get a real list, not a dict view
    return list(transcripts.values())
def to_formatted_gtf(lines, gtf_file, attr_defs=None):
    """Rebuild transcripts from exon records and write them out as GTF.

    Each line is parsed with ``GTFFeature.from_string``.  Features without a
    ``transcript_id`` attribute and features whose type is not ``exon`` are
    skipped.  The first exon of a transcript creates a Transcript carrying
    the exon's seqid, coordinates and strand plus every attribute whose name
    does not contain "exon" (case-insensitive).  Later exons of the same
    transcript extend its start/end span and are added to its exon list.
    Finally, every feature yielded by ``Transcript.to_gtf_features()`` is
    written to ``gtf_file``, one per line, in order of first appearance.

    Args:
        lines: iterable of GTF-formatted strings.
        gtf_file: path of the output file; it is opened with mode ``'w'``.
        attr_defs: forwarded unchanged to ``GTFFeature.from_string``.

    Returns:
        None.
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        # skip gene annotation in gtf files
        if "transcript_id" not in feature.attrs:
            continue
        # Only exon records contribute.  Previously any later record of a
        # known transcript (transcript, CDS, ...) was appended as an exon.
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            t.strand = strand_str_to_int(feature.strand)
            t.exons = [Exon(feature.start, feature.end)]
            # drop exon-level attributes (any attribute name containing "exon")
            t.attrs = {
                name: feature.attrs[name]
                for name in feature.attrs
                if 'exon' not in name.lower()
            }
            transcripts[t_id] = t
        else:
            # Look the transcript up by id; relying on the most recently
            # created one breaks when transcripts are interleaved.
            t = transcripts[t_id]
            t.start = min(t.start, feature.start)
            t.end = max(t.end, feature.end)
            t.exons.append(Exon(feature.start, feature.end))
    with open(gtf_file, 'w') as gtf_output:
        for each_tr_obj in transcripts.values():
            for each_feature in each_tr_obj.to_gtf_features():
                gtf_output.write(
                    '{gtf_line}\n'.format(gtf_line=str(each_feature)))
def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Build Transcript objects from an iterable of GTF lines.

    Every line is parsed with ``GTFFeature.from_string``.  The first feature
    seen for a given ``transcript_id`` must be a ``transcript`` record; it
    supplies chrom, start, end, strand and attrs for a new Transcript.
    ``exon`` features are appended to the exon list of the transcript with
    the same id; all other feature types are ignored after the lookup.

    NOTE(review): an earlier definition with this same name appears in this
    file; being later, this definition is the one left bound.

    Args:
        lines: iterable of GTF-formatted strings.
        attr_defs: forwarded unchanged to ``GTFFeature.from_string``.

    Returns:
        A list of Transcript objects in order of first appearance, each with
        its exons sorted (by genomic position, assuming Exon orders that way).

    Raises:
        GTFError: if the first feature seen for a transcript id is not a
            ``transcript`` record.
        KeyError: presumably, when a feature has no ``transcript_id``
            attribute (the lookup below is unguarded).
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError(
                    "Feature type '%s' found before 'transcript' record: %s"
                    % (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # values() (not the Python-2-only itervalues()) works on both 2.7 and 3
    for t in transcripts.values():
        t.exons.sort()
    # materialize so callers always get a real list, not a dict view
    return list(transcripts.values())