def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, data, out_file=None):
    """
    rewrite the source column of every complete feature in assembled_gtf

    Features whose transcript_id is present in ref_gtf get the source recorded
    for that transcript in the reference; all other features get the value
    cpat.classify_with_cpat returned for their transcript_id.

    assembled_gtf: GTF file of assembled transcripts
    ref_gtf: reference GTF file
    ref_fasta: reference FASTA file, handed to cpat.classify_with_cpat
    data: handed to cpat.classify_with_cpat and to file_transaction
    out_file: output path; defaults to assembled_gtf with its extension
              replaced by ".annotated.gtf"

    returns out_file, or assembled_gtf untouched when the classification
    comes back empty
    """
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".annotated.gtf"
    if file_exists(out_file):
        return out_file
    classification = cpat.classify_with_cpat(assembled_gtf, ref_gtf,
                                             ref_fasta, data)
    if not classification:
        # lazy logging arguments: the message is only formatted if emitted
        logger.info("Protein coding classification of %s was skipped because "
                    "CPAT was not found.", assembled_gtf)
        return assembled_gtf
    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {feature['transcript_id'][0]: feature.source
                        for feature in gtf.complete_features(ref_db)}
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    # pass data along like the other data-aware writers in this file do,
    # instead of silently dropping it for the transactional write
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature['transcript_id'][0]
                if transcript_id in known_transcript:
                    feature.source = known_transcript[transcript_id]
                else:
                    # NOTE(review): raises KeyError if the classification has
                    # no entry for a novel transcript -- confirm CPAT covers all
                    feature.source = classification[transcript_id]
                out_handle.write(str(feature) + "\n")
    return out_file
def fix_cufflinks_attributes(ref_gtf, merged_gtf, data, out_file=None):
    """
    replace the cufflinks gene_id and transcript_id with the
    gene_id and transcript_id from ref_gtf, where available

    ref_gtf: reference GTF file
    merged_gtf: merged GTF file whose features may carry an "oId" attribute
    data: handed to file_transaction
    out_file: output path; defaults to merged_gtf with ".clean.fixed"
              inserted before its extension

    The "nearest_ref" and "oId" attributes are dropped from every feature
    written. Returns the path of the fixed file.
    """
    base, ext = os.path.splitext(merged_gtf)
    fixed = out_file if out_file else base + ".clean.fixed" + ext
    if file_exists(fixed):
        return fixed
    ref_db = gtf.get_gtf_db(ref_gtf)
    merged_db = gtf.get_gtf_db(merged_gtf, in_memory=True)

    # reference transcript id -> reference gene id
    ref_tid_to_gid = {}
    for gene in ref_db.features_of_type('gene'):
        for transcript in ref_db.children(gene, level=1):
            ref_tid_to_gid[transcript.id] = gene.id

    # merged transcript id -> merged gene id, and -> oId of its first child
    ctid_to_cgid = {}
    ctid_to_oid = {}
    for gene in merged_db.features_of_type('gene'):
        for transcript in merged_db.children(gene, level=1):
            ctid_to_cgid[transcript.id] = gene.id
            children = list(merged_db.children(transcript))
            if not children:
                # a transcript without child features has no oId to read;
                # skip it instead of failing with an IndexError
                continue
            oid = children[0].attributes.get("oId", [None])[0]
            if oid:
                ctid_to_oid[transcript.id] = oid

    # merged gene id -> reference gene id, linked through the oId
    cgid_to_gid = {}
    for ctid, oid in ctid_to_oid.items():
        cgid = ctid_to_cgid.get(ctid)
        gid = ref_tid_to_gid.get(oid)
        if cgid and gid:
            cgid_to_gid[cgid] = gid

    with file_transaction(data, fixed) as tmp_fixed_file:
        with open(tmp_fixed_file, "w") as out_handle:
            for gene in merged_db.features_of_type('gene'):
                for transcript in merged_db.children(gene, level=1):
                    for feature in merged_db.children(transcript):
                        cgid = feature.attributes.get("gene_id", [None])[0]
                        gid = cgid_to_gid.get(cgid)
                        # the transcript_id is only swapped when the gene_id
                        # could be mapped back to the reference
                        if gid:
                            feature.attributes["gene_id"][0] = gid
                            ctid = feature.attributes.get(
                                "transcript_id", [None])[0]
                            tid = ctid_to_oid.get(ctid)
                            if tid:
                                feature.attributes["transcript_id"][0] = tid
                        if "nearest_ref" in feature.attributes:
                            del feature.attributes["nearest_ref"]
                        if "oId" in feature.attributes:
                            del feature.attributes["oId"]
                        out_handle.write(str(feature) + "\n")
    return fixed
def fix_cufflinks_attributes(ref_gtf, merged_gtf, data, out_file=None):
    """
    replace the cufflinks gene_id and transcript_id with the
    gene_id and transcript_id from ref_gtf, where available

    ref_gtf: reference GTF file
    merged_gtf: merged GTF file whose features may carry an "oId" attribute
    data: handed to file_transaction
    out_file: output path; defaults to merged_gtf with ".clean.fixed"
              inserted before its extension

    Every feature is written without its "nearest_ref" and "oId" attributes.
    Returns the path of the fixed file.
    """
    base, ext = os.path.splitext(merged_gtf)
    fixed = out_file if out_file else base + ".clean.fixed" + ext
    if file_exists(fixed):
        return fixed
    ref_db = gtf.get_gtf_db(ref_gtf)
    merged_db = gtf.get_gtf_db(merged_gtf, in_memory=True)

    # reference transcript id -> reference gene id
    ref_tid_to_gid = {}
    for gene in ref_db.features_of_type('gene'):
        for transcript in ref_db.children(gene, level=1):
            ref_tid_to_gid[transcript.id] = gene.id

    # merged transcript id -> merged gene id / oId taken from the first child
    ctid_to_cgid = {}
    ctid_to_oid = {}
    for gene in merged_db.features_of_type('gene'):
        for transcript in merged_db.children(gene, level=1):
            ctid_to_cgid[transcript.id] = gene.id
            sub_features = list(merged_db.children(transcript))
            # guard against transcripts with no child features, which used
            # to abort the whole run with an IndexError
            if sub_features:
                oid = sub_features[0].attributes.get("oId", [None])[0]
                if oid:
                    ctid_to_oid[transcript.id] = oid

    # merged gene id -> reference gene id; ctid_to_oid only stores truthy
    # oIds, so the value from items() can be used directly
    cgid_to_gid = {}
    for ctid, oid in ctid_to_oid.items():
        cgid = ctid_to_cgid.get(ctid)
        gid = ref_tid_to_gid.get(oid)
        if cgid and gid:
            cgid_to_gid[cgid] = gid

    with file_transaction(data, fixed) as tmp_fixed_file:
        with open(tmp_fixed_file, "w") as out_handle:
            for gene in merged_db.features_of_type('gene'):
                for transcript in merged_db.children(gene, level=1):
                    for feature in merged_db.children(transcript):
                        attributes = feature.attributes
                        gid = cgid_to_gid.get(
                            attributes.get("gene_id", [None])[0])
                        if gid:
                            attributes["gene_id"][0] = gid
                            # transcript_id is only rewritten together with
                            # a successfully mapped gene_id
                            tid = ctid_to_oid.get(
                                attributes.get("transcript_id", [None])[0])
                            if tid:
                                attributes["transcript_id"][0] = tid
                        if "nearest_ref" in attributes:
                            del attributes["nearest_ref"]
                        if "oId" in attributes:
                            del attributes["oId"]
                        out_handle.write(str(feature) + "\n")
    return fixed
def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    rewrite the source column of every complete feature in assembled_gtf

    A feature whose transcript_id is found in ref_gtf takes the source the
    reference records for it; any other feature takes the value
    cpat.classify_with_cpat returned for its transcript_id.

    out_file defaults to assembled_gtf with its extension replaced by
    ".annotated.gtf"; an existing out_file is returned as-is.
    """
    if not out_file:
        stem = os.path.splitext(assembled_gtf)[0]
        out_file = stem + ".annotated.gtf"
    if file_exists(out_file):
        return out_file
    cpat_calls = cpat.classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta)
    ref_db = gtf.get_gtf_db(ref_gtf)
    # transcript_id -> source as recorded in the reference annotation
    reference_source = {}
    for ref_feature in gtf.complete_features(ref_db):
        reference_source[ref_feature['transcript_id'][0]] = ref_feature.source
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for record in gtf.complete_features(assembled_db):
                tid = record['transcript_id'][0]
                if tid in reference_source:
                    record.source = reference_source[tid]
                else:
                    record.source = cpat_calls[tid]
                out_handle.write(str(record) + "\n")
    return out_file
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts

    1) if a known gene is known to code for a protein, remove any *novel*
    isoforms of that gene that do not also code for a protein.

    2) if a novel transcript is not protein coding and is longer than
    200 bp, mark it as a lincRNA; otherwise mark it as ncRNA

    Transcripts already present in ref_gtf are written out unchanged.
    out_file defaults to assembled_gtf with its extension replaced by
    ".cleaned.gtf"; an existing out_file is returned as-is.
    """
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    # transcript_id -> source for everything in the reference annotation
    reference_source = {}
    for ref_feature in gtf.complete_features(ref_db):
        reference_source[ref_feature['transcript_id'][0]] = ref_feature.source
    gene_sources = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    seq_lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for record in gtf.complete_features(assembled_db):
                tid = record['transcript_id'][0]
                gid = record['gene_id'][0]
                # only novel, non-coding transcripts are dropped or relabeled
                if tid not in reference_source and record.source != "protein_coding":
                    if "protein_coding" in gene_sources.get(gid, [None]):
                        # non-coding novel isoform of a coding gene: drop it
                        continue
                    if seq_lengths[tid] > 200:
                        record.source = "lincRNA"
                    else:
                        record.source = "ncRNA"
                out_handle.write(str(record) + "\n")
    return out_file
def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, data, out_file=None):
    """
    rewrite the source column of every complete feature in assembled_gtf

    Features whose transcript_id is present in ref_gtf get the source recorded
    for that transcript in the reference; all other features get the value
    cpat.classify_with_cpat returned for their transcript_id.

    assembled_gtf: GTF file of assembled transcripts
    ref_gtf: reference GTF file
    ref_fasta: reference FASTA file, handed to cpat.classify_with_cpat
    data: handed to cpat.classify_with_cpat and to file_transaction
    out_file: output path; defaults to assembled_gtf with its extension
              replaced by ".annotated.gtf"

    returns out_file, or assembled_gtf untouched when the classification
    comes back empty
    """
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".annotated.gtf"
    if file_exists(out_file):
        return out_file
    classification = cpat.classify_with_cpat(assembled_gtf, ref_gtf,
                                             ref_fasta, data)
    if not classification:
        # lazy logging arguments: the message is only formatted if emitted
        logger.info("Protein coding classification of %s was skipped because "
                    "CPAT was not found.", assembled_gtf)
        return assembled_gtf
    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {feature["transcript_id"][0]: feature.source
                        for feature in gtf.complete_features(ref_db)}
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    # data was accepted but never reached file_transaction; forward it the
    # same way the other data-aware writers in this file do
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature["transcript_id"][0]
                if transcript_id in known_transcript:
                    feature.source = known_transcript[transcript_id]
                else:
                    # NOTE(review): raises KeyError if the classification has
                    # no entry for a novel transcript -- confirm CPAT covers all
                    feature.source = classification[transcript_id]
                out_handle.write(str(feature) + "\n")
    return out_file
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts

    1) if a known gene is known to code for a protein, remove any *novel*
    isoforms of that gene that do not also code for a protein.

    2) if a novel transcript is not protein coding and is longer than
    200 bp, mark it as a lincRNA; otherwise mark it as ncRNA

    Transcripts whose transcript_id appears in ref_gtf pass through
    untouched. Returns the path of the cleaned GTF, which defaults to
    assembled_gtf with its extension replaced by ".cleaned.gtf".
    """
    coding = "protein_coding"
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    known = {ref_feature['transcript_id'][0]: ref_feature.source
             for ref_feature in gtf.complete_features(ref_db)}
    sources_by_gene = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    transcript_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(transcript_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for entry in gtf.complete_features(assembled_db):
                transcript_id = entry['transcript_id'][0]
                gene_id = entry['gene_id'][0]
                is_novel = transcript_id not in known
                if is_novel and entry.source != coding:
                    parent_sources = sources_by_gene.get(gene_id, [None])
                    if coding in parent_sources:
                        # the parent gene codes for a protein but this
                        # novel isoform does not: leave it out
                        continue
                    long_enough = lengths[transcript_id] > 200
                    entry.source = "lincRNA" if long_enough else "ncRNA"
                out_handle.write(str(entry) + "\n")
    return out_file
def isoform_to_gene_name(gtf_file, out_file=None):
    """
    produce a table of isoform -> gene mappings for loading into EBSeq

    gtf_file: GTF file to read transcript features from
    out_file: output path; when not given, a temporary file that is not
              deleted on close is created and its name is used

    Writes one "<transcript_id>\\t<gene_id>" line per transcript feature and
    returns the path of the table. An existing out_file is returned as-is.
    """
    if not out_file:
        # close the handle right away; only the reserved name is needed and
        # leaving it open leaks a file descriptor until garbage collection
        with tempfile.NamedTemporaryFile(delete=False) as tmp_handle:
            out_file = tmp_handle.name
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    line_format = "{transcript}\t{gene}\n"
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.features_of_type('transcript'):
                # explicit keywords instead of **locals(), so the template
                # does not silently depend on local variable names
                out_handle.write(line_format.format(
                    transcript=feature['transcript_id'][0],
                    gene=feature['gene_id'][0]))
    return out_file
def isoform_to_gene_name(gtf_file, out_file, data):
    """
    produce a table of isoform -> gene mappings for loading into EBSeq

    gtf_file: GTF file to read transcript features from
    out_file: output path; when falsy, a temporary file that is not deleted
              on close is created and its name is used
    data: handed to file_transaction

    Writes one "<transcript_id>\\t<gene_id>" line per transcript feature and
    returns the path of the table. An existing out_file is returned as-is.
    """
    if not out_file:
        # close the handle right away; only the reserved name is needed and
        # leaving it open leaks a file descriptor until garbage collection
        with tempfile.NamedTemporaryFile(delete=False) as tmp_handle:
            out_file = tmp_handle.name
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    line_format = "{transcript}\t{gene}\n"
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.features_of_type('transcript'):
                # explicit keywords instead of **locals(), so the template
                # does not silently depend on local variable names
                out_handle.write(line_format.format(
                    transcript=feature['transcript_id'][0],
                    gene=feature['gene_id'][0]))
    return out_file
def clean_assembly(gtf_file, clean=None, dirty=None):
    """
    clean the likely garbage transcripts from the GTF file including:
    1. any novel single-exon transcripts
    2. any features with an unknown strand

    gtf_file: GTF file to split
    clean: path for transcripts kept; defaults to gtf_file with ".clean"
           inserted before its extension
    dirty: path for transcripts is_likely_noise flags; defaults to gtf_file
           with ".dirty" inserted before its extension

    returns the (clean, dirty) paths; if clean already exists nothing is
    rewritten
    """
    base, ext = os.path.splitext(gtf_file)
    clean = clean if clean else base + ".clean" + ext
    dirty = dirty if dirty else base + ".dirty" + ext
    if file_exists(clean):
        return clean, dirty
    # build the in-memory database only once we know there is work to do;
    # it used to be loaded before the cached-output check above
    db = gtf.get_gtf_db(gtf_file, in_memory=True)
    # NOTE(review): written in place rather than through file_transaction,
    # so an interrupted run can leave a partial clean file behind
    with open(clean, "w") as clean_handle, open(dirty, "w") as dirty_handle:
        for gene in db.features_of_type('gene'):
            for transcript in db.children(gene, level=1):
                if is_likely_noise(db, transcript):
                    target_handle = dirty_handle
                else:
                    target_handle = clean_handle
                write_transcript(db, target_handle, transcript)
    return clean, dirty
def make_pizzly_gtf(gtf_file, out_file, data):
    """
    pizzly needs the GTF to be in gene -> transcript -> exon order for each
    gene. it also wants the gene biotype set as the source

    gtf_file: GTF file to reorder
    out_file: output path; returned as-is when it already exists
    data: handed to file_transaction

    Each gene is written followed by its children, with the
    "transcript_version" attribute removed from the children. When a
    gene_biotype is found its first value becomes the source of the gene and
    of all its children; when none is found the sources are left untouched.
    """
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for gene in db.features_of_type("gene"):
                children = list(db.children(id=gene))
                # start fresh for every gene: without this reset the value
                # was unbound for a first gene lacking the attribute and
                # carried over from the previous gene for any later one
                gene_biotype = gene.attributes.get("gene_biotype", None)
                for child in children:
                    child_biotype = child.attributes.get("gene_biotype", None)
                    if child_biotype:
                        # as before, the last child carrying a biotype wins
                        gene_biotype = child_biotype
                if gene_biotype:
                    gene.attributes['gene_biotype'] = gene_biotype
                    gene.source = gene_biotype[0]
                print(gene, file=out_handle)
                for child in children:
                    if gene_biotype:
                        child.source = gene_biotype[0]
                    # gffread produces a version-less FASTA file
                    child.attributes.pop("transcript_version", None)
                    print(child, file=out_handle)
    return out_file