def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Annotate transcript features with the sizes of their introns.

    Introns are requested per transcript through ``gtf.get_introns`` and the
    size of each one is computed as ``end - start``. The sizes are joined
    with commas and stored in the attribute ``key_name`` of transcript
    features. A transcript for which no intron was returned gets "0".

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``gtf.write()``.
    :param key_name: The name of the attribute to add.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    tx_ids = gtf.get_tx_ids(nr=True)

    introns = gtf.get_introns(by_transcript=True,
                              name=["transcript_id"],
                              intron_nb_in_name=False,
                              feat_name=False)

    tx_features = gtf.select_by_key("feature", "transcript")
    tx_strand = tx_features.extract_data("transcript_id,strand",
                                         as_dict_of_values=True,
                                         no_na=True,
                                         nr=True,
                                         hide_undef=True)

    # Every known transcript starts with an empty list so that
    # intron-less transcripts are annotated too.
    sizes_by_tx = {tx: [] for tx in tx_ids}

    for intron in introns:
        sizes_by_tx[intron.name].append(str(intron.end - intron.start))

    for tx_id in sizes_by_tx:
        sizes = sizes_by_tx[tx_id]

        if not sizes:
            sizes_by_tx[tx_id] = "0"
            continue

        # Minus-strand transcripts get their list reversed
        # (presumably to follow the transcript orientation).
        if tx_strand[tx_id] == "-":
            sizes = reversed(sizes)

        sizes_by_tx[tx_id] = ",".join(sizes)

    if sizes_by_tx:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=sizes_by_tx,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None,
               outputfile=None,
               key_name=None):
    """
    Annotate transcript features with the sizes of their exons.

    For each transcript the exon starts and ends are paired in the order
    they are returned by ``extract_data`` and each size is computed as
    ``end - start + 1``. The comma-separated list is stored in the attribute
    ``key_name`` of transcript features.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``gtf.write()``.
    :param key_name: The name of the attribute to add.
    """

    gtf = GTF(inputfile)

    tx_ids = gtf.get_tx_ids(nr=True)
    sizes_by_tx = dict()

    starts_by_tx = gtf.select_by_key("feature",
                                     "exon").extract_data("transcript_id,start",
                                                          as_dict_of_merged_list=True,
                                                          no_na=True,
                                                          nr=False)

    if len(starts_by_tx) == 0:
        message("No exon found.", type="ERROR")

    ends_by_tx = gtf.select_by_key("feature",
                                   "exon").extract_data("transcript_id,end",
                                                        as_dict_of_merged_list=True,
                                                        no_na=True,
                                                        nr=False)

    tx_strand = gtf.select_by_key("feature",
                                  "transcript").extract_data("transcript_id,strand",
                                                             as_dict_of_values=True,
                                                             no_na=True,
                                                             nr=True,
                                                             hide_undef=True)

    for tx_id in tx_ids:
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(starts_by_tx[tx_id], ends_by_tx[tx_id])]

        # Minus-strand transcripts get their list reversed
        # (presumably to follow the transcript orientation).
        if tx_strand[tx_id] == "-":
            sizes.reverse()

        sizes_by_tx[tx_id] = ",".join(sizes)

    if sizes_by_tx:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=sizes_by_tx,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the exons of each transcript found in the GTF.

    With ``text_format`` a three-column tabulated line
    (transcript id, exon count, the literal "transcript") is written per
    transcript. Otherwise the count is added to transcript features under
    the attribute ``key_name`` and the GTF is written.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The output file object / destination.
    :param key_name: The name of the attribute to add (GTF output only).
    :param text_format: Write a tabulated text file instead of a GTF.
    """

    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # One record is returned per exon line: tally them by transcript id.
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon_records = gtf.select_by_key("feature", "exon").extract_data("transcript_id")

    for record in exon_records:
        n_exons[record[0]] += 1

    if not text_format:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        for tx_id, count in n_exons.items():
            outputfile.write("\t".join([tx_id, str(count), "transcript"]) + "\n")

    close_properly(outputfile, inputfile)
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcripts with divergent promoters.

    A promoter region is built around each TSS with ``slop`` and
    intersected with all TSSs. Among the TSSs that fall in the promoter and
    belong to another gene, the one whose start is closest to the
    transcript's own TSS start is retained together with its distance.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``write()``.
    :param key_name: Attribute receiving the divergent transcript id
        ("divergent" when None). The distance goes to "dist_" + key_name
        ("dist_to_divergent" when key_name is None).
    :param upstream: Passed as ``l`` to ``slop``.
    :param downstream: Passed as ``r`` to ``slop``.
    :param chrom_info: Object whose ``name`` is passed as ``g`` to ``slop``.
    :param no_strandness: When set, the intersection is run with ``S=False``
        instead of ``S=True``.
    :param no_annotation: When set, only the transcripts having a divergent
        partner are written, without adding any attribute.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_with_divergent = dict()
    dist_to_divergent = dict()
    tss_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # Remember the start coordinate of every TSS, keyed by transcript id.
    for tss in tss_bo:
        tss_tx_id, _tss_gene_id = tss.name.split("||")
        tss_pos[tss_tx_id] = int(tss.start)

    message("Getting promoter coordinates.")

    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    # Intermediate results are saved in temporary files
    # (presumably for inspection/debugging).
    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for hit in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        # Only TSSs from another gene are of interest.
        if gene_id_prom == gn_id_tss:
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # Keep the first candidate, then only strictly closer ones.
        if tx_id_prom not in tx_with_divergent or dist < dist_to_divergent[tx_id_prom]:
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        gtf.select_by_key("transcript_id",
                          ",".join(tx_with_divergent)).write(outputfile,
                                                             gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    The requested region (transcript body, promoter or tts) is extended with
    ``slop`` and intersected with the transcripts. For each region, the
    overlapping transcripts that carry a different gene id are collected.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``write()``.
    :param key_name: Attribute name used with ``annotate_gtf``. When None it
        is built from the feature type and the upstream/downstream values.
    :param upstream: Passed as ``l`` to ``slop``.
    :param downstream: Passed as ``r`` to ``slop``.
    :param chrom_info: Object whose ``name`` is passed as ``g`` to ``slop``.
    :param feature_type: One of "transcript", "promoter" or "tts".
    :param same_strandedness: Passed as ``s`` to ``intersect``.
    :param diff_strandedness: Passed as ``S`` to ``intersect``.
    :param annotate_gtf: Add the result as an attribute instead of selecting
        the transcripts.
    :param bool: Replace the collected lists with "1" (some overlap) or
        "0" (none).
    :param annotate_all: Start with an (empty) entry for every transcript id
        returned by ``extract_data``.
    :param invert_match: Passed to ``select_by_key`` to invert the selection.
    """

    # ----------------------------------------------------------------------
    # Default key name and option checks
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # The GTF is loaded up front so that it is not lost
    # when the stream comes from stdin.
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for known_tx in overlapping_tx:
            overlapping_tx[known_tx] = []

    # ----------------------------------------------------------------------
    # Transcript limits and extended regions
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":

        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "promoter":

        tss_bed = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||")
        bed_obj = tss_bed.slop(s=True,
                               l=upstream,
                               r=downstream,
                               g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "tts":

        tts_bed = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||")
        bed_obj = tts_bed.slop(s=True,
                               l=upstream,
                               r=downstream,
                               g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    else:
        message("Not implemented yet", type="ERROR")

    # Intermediate results are saved in temporary files
    # (presumably for inspection/debugging).
    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    # ----------------------------------------------------------------------
    # Collect, for each region, the overlapping transcripts of other genes
    # ----------------------------------------------------------------------

    for hit in overlap_regions:

        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")

        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for tx_key in overlapping_tx:
            overlapping_tx[tx_key] = "1" if len(overlapping_tx[tx_key]) else "0"

    # ----------------------------------------------------------------------
    # Output
    # ----------------------------------------------------------------------

    if invert_match:
        values = ",".join(set(overlapping_tx))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile,
                                              gc_off=True)
    elif annotate_gtf:
        if overlapping_tx:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        value = ",".join(set(overlapping_tx))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile,
                                       gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Filter transcripts according to the size of their introns.

    Intron sizes are computed as ``end - start`` on the regions returned by
    ``gtf.get_introns``. By default a transcript is discarded as soon as one
    of its introns is shorter than ``intron_size``. With ``merged`` the test
    is applied to the sum of its intron sizes instead.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``gtf.write()``.
    :param intron_size: The size threshold.
    :param merged: Test the summed intron size rather than each intron.
    :param invert_match: Discard on ``size >= intron_size`` instead of
        ``size < intron_size``.
    :param delete_monoexonic: Also discard transcripts for which no intron
        was found (they are kept otherwise).
    :param add_intron_size: Add the computed sizes to transcript features
        ("intron_size_sum" when merged, "intron_size" otherwise).
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    all_tx_ids = gtf.get_tx_ids(nr=True)

    # Transcripts to be discarded (used as an ordered set).
    to_delete = OrderedDict()

    if merged:

        # Sum of intron sizes, initialised to 0 for every transcript.
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for intron in introns_bo:
            intron_sum_dict[intron.name] += intron.end - intron.start

        for tx_id, total in list(intron_sum_dict.items()):
            if total == 0:
                # No intron was found for this transcript.
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif invert_match:
                if total >= intron_size:
                    to_delete[tx_id] = 1
            elif total < intron_size:
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")

    else:

        # List of intron sizes, initialised to [] for every transcript.
        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for intron in introns_bo:
            intron_size_dict[intron.name] += [intron.end - intron.start]

        for tx_id, sizes in list(intron_size_dict.items()):

            if not sizes:
                # No intron was found: record a size of 0 and skip the test.
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            if invert_match:
                discard = any(size >= intron_size for size in sizes)
            else:
                discard = any(size < intron_size for size in sizes)

            if discard:
                to_delete[tx_id] = 1

        if add_intron_size:

            for tx_id, sizes in list(intron_size_dict.items()):
                intron_size_dict[tx_id] = ",".join([str(x) for x in sizes])

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    kept_tx_ids = [x for x in gtf.get_tx_ids(nr=True) if x not in to_delete]

    # Only the first 40 characters of the discarded ids are reported.
    msg_list = ",".join(to_delete)[:40]
    message("Deleting: " + msg_list + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def _closest_genes_regions(gn_gtf, region_type, identifier):
    """
    Return the sorted 6-column BED regions of gene features.

    :param gn_gtf: A GTF object restricted to gene features.
    :param region_type: 'tss' (5' end), 'tts' (3' end) or 'gene' (whole gene).
    :param identifier: The key whose value is used as BED name.
    """

    if region_type == 'tss':
        regions = gn_gtf.get_5p_end(feat_type="gene", name=[identifier])
    elif region_type == 'tts':
        regions = gn_gtf.get_3p_end(feat_type="gene", name=[identifier])
    elif region_type == 'gene':
        regions = gn_gtf.to_bed(name=[identifier])
    else:
        # NOTE(review): message(type="ERROR") is presumably fatal; nothing
        # usable can be returned for an unknown region type.
        message("Unknown type.", type="ERROR")

    return regions.cut([0, 1, 2, 3, 4, 5]).sort()


def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.

    Gene features are converted to BED regions named after ``identifier``
    and searched against each other with ``closest``. The names and distances
    of the neighbors are either added to gene features (keys "closest_gn" and
    "closest_dist") or written as a tabulated text file.

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The output file object / destination.
    :param from_region_type: Region of the source gene: 'tss', 'tts' or 'gene'.
    :param no_header: Do not write the header line (text output only).
    :param nb_neighbors: Passed as ``k`` to ``closest``.
    :param to_region_type: Region of the target genes: 'tss', 'tts' or 'gene'.
    :param same_strandedness: Passed as ``s`` to ``closest``.
    :param diff_strandedness: Passed as ``S`` to ``closest``.
    :param text_format: Write a tabulated text file instead of a GTF.
    :param identifier: The key used to name genes (e.g. "gene_id").
    :param collapse: In text output, write one line per gene/neighbor pair
        instead of one line per gene with comma-separated values.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    if len(gn_gtf) == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")

    if nb_neighbors >= (len(gn_gtf) - 1):
        message("Two much neighbors",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # requested regions for the source ('from') and target ('to') genes
    # ----------------------------------------------------------------------

    from_regions = _closest_genes_regions(gn_gtf, from_region_type, identifier)
    to_regions = _closest_genes_regions(gn_gtf, to_region_type, identifier)

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    # Column 4 is the name of the source region, column 10 the name of the
    # neighbor and column 13 the reported distance. Both dicts are thus
    # keyed by the value of `identifier`.
    for hit in closest_bo:
        gene_closest[hit[3]].append(hit[9])
        gene_closest_dist[hit[3]].append(hit[12])

    if not text_format:

        if gene_closest:
            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest,
                                         new_key="closest_gn")

            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest_dist,
                                         new_key="closest_dist")

        gtf.write(outputfile, gc_off=True)

    else:

        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        # The result dicts are keyed by `identifier` values. Looking them up
        # with gene ids only works when the identifier is "gene_id"; for any
        # other identifier the rows must be driven by the identifier values
        # themselves (de-duplicated, first occurrence order), otherwise every
        # lookup misses and the output columns are empty.
        if identifier == "gene_id":
            row_ids = gn_ids
        else:
            seen = set()
            row_ids = []
            for an_id in all_ids:
                if an_id not in seen:
                    seen.add(an_id)
                    row_ids.append(an_id)

        for gene in row_ids:

            if not collapse:

                outputfile.write("\t".join([gene,
                                            ",".join(gene_closest[gene]),
                                            ",".join(gene_closest_dist[gene])]) + "\n")

            else:

                for closest, dist in zip(gene_closest[gene],
                                         gene_closest_dist[gene]):
                    outputfile.write("\t".join([gene,
                                                closest,
                                                dist]) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcripts with convergent tts.

    A region is built around each TTS with ``slop`` and intersected with all
    TTSs using ``S=True``. Among the TTSs that fall in the region and belong
    to another gene, the one whose start is closest to the transcript's own
    TTS start is stored in the "convergent" attribute and its distance in
    "dist_to_convergent".

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The destination handed to ``gtf.write()``.
    :param upstream: Passed as ``l`` to ``slop``.
    :param downstream: Passed as ``r`` to ``slop``.
    :param chrom_info: Object whose ``name`` is passed as ``g`` to ``slop``.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()
    tts_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                             sep="||")

    # Remember the start coordinate of every TTS, keyed by transcript id.
    for tts in tts_bo:
        tts_tx_id, _tts_gene_id = tts.name.split("||")
        tts_pos[tts_tx_id] = int(tts.start)

    message("Getting tts coordinates.")

    tts_region_bo = tts_bo.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    # Intermediate results are saved in temporary files
    # (presumably for inspection/debugging).
    tmp_file = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(tmp_file.name)

    for hit in tts_intersect_bo:

        tx_id_main, gene_id_main = hit.fields[3].split("||")
        tx_id_ov, gn_id_ov = hit.fields[9].split("||")

        # Only TTSs from another gene are of interest.
        if gene_id_main == gn_id_ov:
            continue

        dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])

        # Keep the first candidate, then only strictly closer ones.
        if tx_id_main not in tx_to_convergent_nm or dist < dist_to_convergent[tx_id_main]:
            dist_to_convergent[tx_id_main] = dist
            tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if tx_to_convergent_nm:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_convergent_nm,
                                     new_key="convergent")

        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=dist_to_convergent,
                                     new_key="dist_to_convergent")

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Compute the size of the features enclosed in the GTF.

    With ``bed`` the selected features are written as BED lines whose score
    holds the size (``end - start`` of the BED record, or the value returned
    by ``gtf.get_transcript_size()`` for 'mature_rna'). Otherwise the GTF is
    written with the size stored under ``key_name`` ('feature_size' by
    default); there the size is ``end - start + 1`` of the GTF line, and
    lines of another feature type receive "?".

    :param inputfile: The GTF input, handed to ``GTF()``.
    :param outputfile: The output file object / destination.
    :param ft_type: A feature type found in the GTF, 'mature_rna' or "*"
        (every feature).
    :param names: Comma-separated keys used to build the BED name.
    :param key_name: The name of the attribute to add (GTF output only).
    :param separator: The separator used between the parts of the BED name.
    :param bed: Write BED instead of GTF.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    known_features = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in known_features + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if bed:
            # The transcript id is placed first in the name so that it can
            # be used to look up the size, then it is dropped from the name.
            bed_obj = gtf.select_by_key("feature",
                                        'transcript').to_bed(['transcript_id'] + names,
                                                             add_feature_type=False,
                                                             sep=separator,
                                                             more_name=['mature_rna'])

            for record in bed_obj:
                name_parts = record.name.split(separator)
                tx_id = name_parts.pop(0)
                record.score = tx_size[tx_id]
                record.name = separator.join(name_parts)
                write_properly(chomp(str(record)), outputfile)
        else:
            if tx_size:
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

    elif bed:

        bed_obj = gtf.select_by_key("feature",
                                    ft_type).to_bed(name=names,
                                                    sep=separator,
                                                    add_feature_type=True)

        for record in bed_obj:
            record.score = str(record.end - record.start)
            write_properly(chomp(str(record)), outputfile)

    else:

        # One value is written per GTF line, then the file is handed to
        # add_attr_column to become a new attribute.
        tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        for row in rows:
            if row[0] == ft_type or ft_type == "*":
                tmp_file.write(str(int(row[2]) - int(row[1]) + 1) + "\n")
            else:
                tmp_file.write("?\n")

        tmp_file.close()

        gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                      gc_off=True)

    close_properly(outputfile, inputfile)