def define_gff3_genes(gtf):
    """Build one GFF3 "gene" feature line per gene found in a GTF file.

    GFF/GTF conversion and differences article:
    http://blog.nextgenetics.net/?e=27

    Every non-comment row contributes its start and end columns to the span
    of its gene, whatever the row's feature type is, so the gene line runs
    from the smallest to the largest coordinate seen for that gene. The other
    columns (seqname, source, score, strand, frame) and the attribute column
    are taken from the last row read for the gene.

    Args:
        gtf: Path of the GTF file to read.

    Returns:
        A ``defaultdict(list)`` mapping each gene ID (as returned by
        ``get_id(line, 'gene_id')``) to a newline-terminated, tab-separated
        gene line. Keys keep the order in which genes first appear.

    Raises:
        ValueError: If a data row has fewer than 8 columns or non-integer
            coordinates.
    """
    gene_span_dt = {}
    gene_fields_dt = {}

    with open(gtf) as fh:
        for line in fh:
            # Skip comment lines, and blank lines that cannot be unpacked
            if line[0] == '#' or not line.strip():
                continue

            data = line.strip().split('\t')
            trans_id = get_id(line, 'transcript_id')
            gene_id = get_id(line, 'gene_id')

            chrom, source, _, exon_st, exon_end, score, strand, frame, *_ = data

            # GFF formatted attributes column
            # Added a GID attribute to conserve all the GTF data
            attributes = "ID=" + trans_id + ";GID=" + gene_id

            # Track the running span instead of storing every coordinate;
            # both columns feed min and max so reversed pairs are tolerated.
            start, end = int(exon_st), int(exon_end)
            low, high = gene_span_dt.get(gene_id, (start, end))
            gene_span_dt[gene_id] = (min(low, start, end), max(high, start, end))

            # Later rows of the same gene overwrite the earlier ones
            gene_fields_dt[gene_id] = (chrom, source, score, strand, frame, attributes)

    # The coordinates are written directly into the line: substituting
    # placeholder tokens afterwards would also rewrite any other column that
    # happened to contain the placeholder text.
    gene_coord_dt = defaultdict(list)
    for gene_id, fields in gene_fields_dt.items():
        chrom, source, score, strand, frame, attributes = fields
        span_start, span_end = gene_span_dt[gene_id]
        gene_coord_dt[gene_id] = "\t".join([
            chrom, source, "gene", str(span_start), str(span_end),
            score, strand, frame, attributes
        ]) + "\n"

    return gene_coord_dt
def get_models_gtf(gtf_filename):
    """Build gene models from a GTF file.

    Each non-comment row is assigned to a ``Gene`` (keyed by the value of
    ``get_id(line, 'gene_id')``) and, inside it, to a ``Transcript`` (keyed by
    ``get_id(line, 'transcript_id')``). Both are created the first time their
    ID is seen, using the strand column of that row. Rows whose feature column
    is "CDS" or "exon" (case-insensitive) add their ``[start, end]``
    coordinates to the transcript; other feature types only register the
    gene and transcript.

    Args:
        gtf_filename: Path of the GTF file to read.

    Returns:
        dict mapping gene ID to its ``Gene`` model.
    """
    locus_dict = {}

    # Default text mode already applies universal newlines in Python 3; the
    # legacy 'U' flag was removed in Python 3.11 and raises ValueError there.
    with open(gtf_filename) as infile:
        for line in infile:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            line_fields = line.split('\t')
            transcript_ID = get_id(line, 'transcript_id')
            locus_ID = get_id(line, 'gene_id')
            chrom = line_fields[0]

            if locus_ID not in locus_dict:
                gene_model = Gene(locus_ID)
                gene_model.add_sense(line_fields[6])
                locus_dict[locus_ID] = gene_model
            else:
                gene_model = locus_dict[locus_ID]

            if transcript_ID not in gene_model.transcript_dict:
                transcript_model = Transcript(transcript_ID)
                transcript_model.sense = line_fields[6]
                transcript_model.gene = locus_ID
                transcript_model.chrom = chrom
                gene_model.transcript_dict[transcript_ID] = transcript_model
            else:
                transcript_model = gene_model.transcript_dict[transcript_ID]

            # The models are mutated in place, so they do not need to be
            # stored back into their dictionaries afterwards.
            feature = line_fields[2].upper()
            if feature == 'CDS':
                transcript_model.add_CDS(
                    [int(line_fields[3]), int(line_fields[4])])
            elif feature == 'EXON':
                transcript_model.add_exon(
                    [int(line_fields[3]), int(line_fields[4])])

    return locus_dict
def gtf_to_gff3(gtf):
    """Convert a GTF file into a GFF3 file written next to it.

    Every non-comment row keeps its columns, except that the last one is
    replaced by an ``ID=<transcript>;GID=<gene>`` attribute string. The output
    lists genes in sorted ID order; each gene starts with the gene line
    produced by ``define_gff3_genes`` followed by its rows in sorted order.

    Args:
        gtf: Path of the GTF file to convert.

    Returns:
        Path of the written file: the input path with its extension replaced
        by ".gff3".
    """
    rows_by_gene = defaultdict(list)

    with open(gtf) as handle:
        for raw in handle:
            # Comment lines are not converted
            if raw[0] == '#':
                continue

            columns = raw.strip().split('\t')
            transcript = get_id(raw, 'transcript_id')
            gene = get_id(raw, 'gene_id')

            # None of these values is used; the unpacking is what rejects
            # rows with fewer than 8 columns.
            _chrom, _source, _feat, _start, _end, _score, _strand, _frame, *_ = columns

            # Swap the last column for GFF formatted attributes; the GID
            # attribute conserves the gene information of the GTF row.
            columns[-1] = "ID=" + transcript + ";GID=" + gene
            rows_by_gene[gene].append('\t'.join(columns) + "\n")

    gene_rows = define_gff3_genes(gtf)

    # The gene feature line has to precede the rows of its gene
    ordered_rows = []
    for gene in sorted(gene_rows.keys()):
        ordered_rows.append(gene_rows[gene])
        ordered_rows.extend(sorted(rows_by_gene[gene]))

    outfile = os.path.splitext(gtf)[0] + ".gff3"
    with open(outfile, "w+") as out:
        out.writelines(ordered_rows)

    return outfile
def add_features_to_gtf(gtf_file):
    """Rewrite a GTF file in place with exon rows plus re-generated features.

    The file is loaded with ``create_gtf_object``. For every transcript in its
    ``trans_gtf_lines_index`` the "exon" rows are kept as they are, and new
    rows are generated from the object's CDS, 5' UTR, 3' UTR, start codon and
    stop codon tables. Generated rows reuse the seqname, source, strand, frame
    and attribute columns of the transcript's first row and use "." as score.
    Identical rows of a transcript are written only once.

    Gene and transcript rows are intentionally not generated (the original
    author notes that transcripts display better on IGV without them).

    Args:
        gtf_file: Path of the GTF file; it is overwritten with the result.

    Returns:
        The same ``gtf_file`` path.
    """
    gtf_obj = create_gtf_object(gtf_file)

    # Coordinate tables paired with the feature name written for each of them
    feature_tables = [
        (gtf_obj.trans_cds_dt, "CDS"),
        (gtf_obj.trans_5utr_dt, "five_prime_utr"),
        (gtf_obj.trans_3utr_dt, "three_prime_utr"),
        (gtf_obj.trans_start_codon, "start_codon"),
        (gtf_obj.trans_stop_codon, "stop_codon"),
    ]
    # Tables that store a single coordinate pair instead of a list of pairs
    single_coord_tags = {"start_codon", "stop_codon"}
    score = "."

    transcripts_lines_dt = defaultdict(list)
    for trans, lines_ix_list in gtf_obj.trans_gtf_lines_index.items():
        line_1 = linecache.getline(gtf_obj.gtf_path, lines_ix_list[0])
        seqname, source, _, _, _, _, strand, frame, attr = line_1.strip(
            '\n').split('\t')

        trans_lines = transcripts_lines_dt[trans]
        # Rows are stored without their newline, so duplicates must be looked
        # up in that same stripped form; a set keeps the lookup constant-time.
        seen_lines = set(trans_lines)

        for line_ix in lines_ix_list:
            line = linecache.getline(gtf_obj.gtf_path, line_ix)

            # Important! Ignore any line that is not an exon coordinate so as the CDS re-annotation is completely new
            _, _, line_feature, *_ = line.split('\t')
            if line_feature != "exon":
                continue

            stripped = line.strip('\n')
            if stripped not in seen_lines:
                seen_lines.add(stripped)
                trans_lines.append(stripped)

        for feature_dt, feature_tag in feature_tables:
            try:
                coord_list = feature_dt[trans]
                if feature_tag in single_coord_tags:
                    coord_list = [coord_list]

                for coord in coord_list:
                    start, end = sorted(coord)
                    line = f"{seqname}\t{source}\t{feature_tag}\t{start}\t{end}\t{score}\t{strand}\t{frame}\t{attr}"
                    stripped = line.strip('\n')
                    if stripped not in seen_lines:
                        seen_lines.add(stripped)
                        trans_lines.append(stripped)
            except Exception:
                # Deliberately best-effort: a transcript without this feature
                # (or with unusable coordinates) gets no row for it.
                continue

    gtf_lines = []
    for trans_lines in transcripts_lines_dt.values():
        gtf_lines.extend(trans_lines)

    sorted_lines = sorted(gtf_lines, key=lambda l: (get_id(l, 'gene_id'),
                                                    get_id(l, 'transcript_id'),
                                                    int(l.split('\t')[3])))

    # Write output file, overwrite input file
    # NOTE(review): linecache may still hold the previous content of this
    # path; confirm whether later readers need linecache.checkcache().
    with open(gtf_file, "w") as fh:
        fh.writelines(line + '\n' for line in sorted_lines)

    return gtf_file
def annotate_cds_into_gtf(gtf_obj, trans_cds_dt, outfile):
    """Write a GTF file whose CDS rows come from ``trans_cds_dt``.

    Rows are collected in three passes:

    1. Transcripts in ``trans_cds_dt`` keep only their exon rows and get one
       new CDS row per coordinate pair. New rows copy seqname, source, frame
       and attributes from the transcript's first row, take the strand from
       ``gtf_obj.trans_sense_dt`` and use "." as score.
    2. Transcripts in ``gtf_obj.trans_exons_dt`` that are absent from
       ``trans_cds_dt`` keep all of their original rows.
    3. Transcripts in ``gtf_obj.trans_gene_dt`` that still have no rows keep
       all of their original rows.

    Rows are then sorted by seqname, gene ID, transcript ID, feature and
    numeric start before being written.

    Args:
        gtf_obj: Parsed GTF object providing ``gtf_path``,
            ``trans_gtf_lines_index``, ``trans_sense_dt``, ``trans_exons_dt``
            and ``trans_gene_dt``.
        trans_cds_dt: Mapping of transcript ID to an iterable of CDS
            coordinate pairs (may be empty).
        outfile: Path of the GTF file to write.

    Returns:
        The ``outfile`` path.
    """
    print(time.asctime(), f'Re-annotating CDS coordinates into file: {outfile}')

    gtf_path = gtf_obj.gtf_path
    new_gtf_lines = defaultdict(list)

    feature_cds = "CDS"
    score_cds = "."

    # First, upload the transcripts for which a CDS was looked up
    trans_with_cds = set()
    for trans, trans_cds in trans_cds_dt.items():
        trans_with_cds.add(trans)
        trans_line_ixs = gtf_obj.trans_gtf_lines_index[trans]

        # GTF line format: seqname, source, feature, start, end, score, strand, frame, attr
        # A representative row of the transcript fills the shared columns
        rep_line = linecache.getline(gtf_path, trans_line_ixs[0])
        seqname, source, _, _, _, _, _, frame, attr = rep_line.strip(
            '\n').split('\t')
        strand = gtf_obj.trans_sense_dt[trans]

        # Important! Ignore any line that is not an exon coordinate so as the CDS re-annotation is completely new
        for trans_line_ix in trans_line_ixs:
            trans_gtf_line = linecache.getline(gtf_path, trans_line_ix)
            _, _, line_feature, *_ = trans_gtf_line.split('\t')
            if line_feature.upper() == "EXON":
                new_gtf_lines[trans].append(trans_gtf_line)

        if trans_cds:
            for cds in trans_cds:
                start, end = sorted(cds)
                new_line = "\t".join([
                    seqname, source, feature_cds, str(start), str(end),
                    score_cds, strand, frame, attr
                ]) + "\n"
                new_gtf_lines[trans].append(new_line)

    # Second, upload the lines of transcripts for which no CDS was looked up
    cds_not_found_trans = set(gtf_obj.trans_exons_dt.keys()) - trans_with_cds
    for trans in sorted(cds_not_found_trans):
        for trans_line_ix in gtf_obj.trans_gtf_lines_index[trans]:
            new_gtf_lines[trans].append(linecache.getline(gtf_path, trans_line_ix))

    # Third, upload the transcripts that have no rows yet. A plain difference
    # is required here: a symmetric difference would also select transcripts
    # that were already uploaded and write their rows a second time.
    missing_trans = set(gtf_obj.trans_gene_dt.keys()) - set(new_gtf_lines.keys())
    for trans in sorted(missing_trans):
        for trans_line_ix in gtf_obj.trans_gtf_lines_index[trans]:
            new_gtf_lines[trans].append(linecache.getline(gtf_path, trans_line_ix))

    def sort_key(line):
        # GTF line format: seqname, source, feature, start, end, score, strand, frame, attr
        seqname, _, feature, start, _, _, _, _, _ = line.split('\t')
        gene = get_id(line, 'gene_id')
        trans = get_id(line, 'transcript_id')
        # A tuple with an integer start orders coordinates numerically and
        # cannot be confused by hyphens inside the identifiers.
        return seqname, gene, trans, feature, int(start)

    all_lines = []
    for trans_lines in new_gtf_lines.values():
        all_lines.extend(trans_lines)

    # Write Output
    with open(outfile, "w") as fh:
        fh.writelines(sorted(all_lines, key=sort_key))

    return outfile