def intron_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Annotate every transcript feature with the sizes of its introns.

    The value stored under ``key_name`` is a comma-separated list with one
    entry per intron (``end - start`` of each record returned by
    ``get_introns``). For transcripts on the "-" strand the list is written
    in reverse order. Transcripts for which no intron was returned get "0".

    :param inputfile: The GTF input, passed to ``GTF``.
    :param outputfile: The destination handed to ``gtf.write``.
    :param key_name: Name of the attribute added to transcript features.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    tx_ids = gtf.get_tx_ids(nr=True)

    introns = gtf.get_introns(by_transcript=True,
                              name=["transcript_id"],
                              intron_nb_in_name=False,
                              feat_name=False)

    tx_to_strand = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    # Every known transcript starts with an empty list so that transcripts
    # without any intron are still reported (as "0") further down.
    sizes_by_tx = {tx_id: [] for tx_id in tx_ids}

    # The intron name is the transcript id (see the 'name' argument above).
    for intron in introns:
        sizes_by_tx[intron.name].append(str(intron.end - intron.start))

    tx_to_value = {}

    for tx_id, sizes in sizes_by_tx.items():
        if not sizes:
            tx_to_value[tx_id] = "0"
        elif tx_to_strand[tx_id] == "-":
            tx_to_value[tx_id] = ",".join(reversed(sizes))
        else:
            tx_to_value[tx_id] = ",".join(sizes)

    if tx_to_value:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_value,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of exon sizes.

    Each size is computed as ``end - start + 1``. Within a transcript the
    exons are ordered by increasing start coordinate, and the list is then
    reversed for transcripts on the "-" strand. The result therefore does
    not depend on the order in which exon lines appear in the input.

    :param inputfile: The GTF input, passed to ``GTF``.
    :param outputfile: The destination handed to ``gtf.write``.
    :param key_name: Name of the attribute added to transcript features.
    """
    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    # Select the exon features once; both extractions below reuse the
    # same selection instead of filtering the whole GTF twice.
    exons = gtf.select_by_key("feature", "exon")

    exons_starts = exons.extract_data("transcript_id,start",
                                      as_dict_of_merged_list=True,
                                      no_na=True,
                                      nr=False)

    if not exons_starts:
        message("No exon found.", type="ERROR")

    exons_ends = exons.extract_data("transcript_id,end",
                                    as_dict_of_merged_list=True,
                                    no_na=True,
                                    nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = dict()

    for tx_id in all_tx_ids:
        # Pair each start with its end (both lists come from the same
        # selection, so positions match), then sort by start. Without the
        # sort, the strand reversal below would only give a consistent
        # order for inputs whose exon lines are already listed by
        # ascending coordinate.
        exon_coords = sorted(
            (int(start), int(end))
            for start, end in zip(exons_starts[tx_id], exons_ends[tx_id]))

        size_list = [str(end - start + 1) for start, end in exon_coords]

        if strands[tx_id] == "-":
            size_list.reverse()

        tx_to_size_list[tx_id] = ",".join(size_list)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Filter transcripts according to the size of their introns.

    Without ``merged``, a transcript is discarded when at least one of its
    introns is smaller than ``intron_size``. With ``merged``, the test is
    applied to the sum of its intron sizes instead. ``invert_match`` flips
    the test (discard when the size is greater than or equal to
    ``intron_size``). Transcripts without any intron are only discarded
    when ``delete_monoexonic`` is set.

    :param inputfile: The GTF input, passed to ``GTF``.
    :param outputfile: The destination handed to ``gtf.write``.
    :param intron_size: The size threshold.
    :param merged: Compare the summed intron size rather than each intron.
    :param invert_match: Discard transcripts reaching the threshold instead.
    :param delete_monoexonic: Also discard transcripts without intron.
    :param add_intron_size: Store the computed size(s) on transcript
        features, under "intron_size_sum" (merged) or "intron_size".
    """

    def is_rejected(size):
        # Single place where the threshold rule (and its inversion) lives.
        if invert_match:
            return size >= intron_size
        return size < intron_size

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    all_tx_ids = gtf.get_tx_ids(nr=True)

    # Transcripts to discard. An ordered mapping is used as an ordered set
    # so that the log message below lists ids in the order they were found.
    to_delete = OrderedDict()

    if merged:
        # Sum of intron sizes per transcript (0 when there is no intron).
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for intron in introns_bo:
            intron_sum_dict[intron.name] += intron.end - intron.start

        for tx_id, total in list(intron_sum_dict.items()):
            if total == 0:
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif is_rejected(total):
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")

    else:
        # List of intron sizes per transcript.
        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for intron in introns_bo:
            intron_size_dict[intron.name].append(intron.end - intron.start)

        for tx_id, sizes in list(intron_size_dict.items()):
            if not sizes:
                # No intron: report a size of 0 and skip the threshold test.
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            if any(is_rejected(size) for size in sizes):
                to_delete[tx_id] = 1

        if add_intron_size:
            # Turn each list of sizes into its comma-separated form, in place.
            for tx_id, sizes in list(intron_size_dict.items()):
                intron_size_dict[tx_id] = ",".join([str(size) for size in sizes])

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    kept_tx_ids = [tx_id
                   for tx_id in gtf.get_tx_ids(nr=True)
                   if tx_id not in to_delete]

    # Only the first 40 characters of the id list are shown in the log.
    deleted_preview = ",".join(to_delete)[:40]
    message("Deleting: " + deleted_preview + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)