import gc
import os
import sys
from collections import OrderedDict, defaultdict

import pandas as pd
from pybedtools import BedTool

import pygtftk.utils
from pygtftk.bwig.bw_coverage import bw_cov_mp
from pygtftk.gtf_interface import GTF
from pygtftk.utils import (chomp, close_properly, make_tmp_file, message,
                           write_properly)


def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """Extract intronic regions."""

    message("Searching for intronic regions.")

    # Load the GTF (needed even when it comes from <stdin>).
    gtf = GTF(inputfile, check_ensembl_format=False)

    if not by_transcript:
        # Intronic regions computed across all transcripts.
        introns_bo = gtf.get_introns()

        for i in introns_bo:
            write_properly(chomp(str(i)), outputfile)
    else:
        # One intron per transcript, named from the requested keys.
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
        for i in introns_bo:
            write_properly(chomp(str(i)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
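
# A minimal usage sketch for `intronic` (hypothetical file names; assumes
# the imports above). Writes one BED line per intron, naming each interval
# after its transcript and gene:
#
#     with open("input.gtf") as infile, open("introns.bed", "w") as outfile:
#         intronic(inputfile=infile,
#                  outputfile=outfile,
#                  by_transcript=True,
#                  names="transcript_id,gene_id",
#                  separator="|")
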
def intron_sizes(inputfile=None,
                 outputfile=None,
                 key_name=None):
    """Add a new key to transcript features containing a comma-separated
    list of intron sizes."""

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    strands = gtf.select_by_key("feature",
                                "transcript").extract_data("transcript_id,strand",
                                                           as_dict_of_values=True,
                                                           no_na=True,
                                                           nr=True,
                                                           hide_undef=True)

    intron_size = {tx: [] for tx in all_tx_ids}

    for bed_line in intron_bo:
        intron_size[bed_line.name] += [str(bed_line.end - bed_line.start)]

    for tx_id in intron_size:
        if len(intron_size[tx_id]):
            # Introns were collected in genomic order; reverse the list for
            # minus-strand transcripts so sizes appear in 5'->3' order.
            if strands[tx_id] == "-":
                intron_size[tx_id] = ",".join(reversed(intron_size[tx_id]))
            else:
                intron_size[tx_id] = ",".join(intron_size[tx_id])
        else:
            intron_size[tx_id] = "0"

    if len(intron_size):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
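
# A minimal usage sketch for `intron_sizes` (hypothetical file names).
# Adds, e.g., `intron_sizes "156,89,1024";` to each transcript feature
# ("0" for mono-exonic transcripts):
#
#     with open("input.gtf") as infile, open("annotated.gtf", "w") as outfile:
#         intron_sizes(inputfile=infile,
#                      outputfile=outfile,
#                      key_name="intron_sizes")
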
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """Select transcripts which contain an intron of size at least s, or
    whose intron sizes, summed, are at least s."""

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    # Get the list of transcripts.
    all_tx_ids = gtf.get_tx_ids(nr=True)

    # The list of transcripts to be deleted.
    to_delete = OrderedDict()

    if merged:
        # Create a dict that will contain the sum of intron sizes
        # for each transcript.
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for i in introns_bo:
            size = i.end - i.start
            tx_id = i.name
            intron_sum_dict[tx_id] += size

        for tx_id, sum_intron in list(intron_sum_dict.items()):
            if sum_intron != 0:
                if not invert_match:
                    if sum_intron < intron_size:
                        to_delete[tx_id] = 1
                else:
                    if sum_intron >= intron_size:
                        to_delete[tx_id] = 1
            else:
                if delete_monoexonic:
                    to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")

    else:
        # Create a dict that will contain a list of intron sizes
        # for each transcript.
        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for i in introns_bo:
            size = i.end - i.start
            tx_id = i.name
            intron_size_dict[tx_id] += [size]

        for tx_id, list_size in list(intron_size_dict.items()):
            if not list_size:
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            for size in intron_size_dict[tx_id]:
                if not invert_match:
                    if size < intron_size:
                        to_delete[tx_id] = 1
                else:
                    if size >= intron_size:
                        to_delete[tx_id] = 1

        if add_intron_size:
            for tx_id, list_size in list(intron_size_dict.items()):
                list_size = [str(x) for x in list_size]
                intron_size_dict[tx_id] = ",".join(list_size)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    all_tx_ids = gtf.get_tx_ids(nr=True)
    all_tx_ids = [x for x in all_tx_ids if x not in to_delete]

    # Report (at most 40 characters of) the deleted transcript IDs.
    msg_list = ",".join(list(to_delete.keys()))
    nb_char = min([len(msg_list), 40])
    msg_list = msg_list[0:nb_char]
    message("Deleting: " + msg_list + "...")

    gtf = gtf.select_by_key("transcript_id",
                            ",".join(all_tx_ids))

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
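
# A minimal usage sketch for `select_by_intron_size` (hypothetical file
# names). Keeps transcripts whose introns are all at least 100 bp and
# discards mono-exonic transcripts:
#
#     with open("input.gtf") as infile, open("filtered.gtf", "w") as outfile:
#         select_by_intron_size(inputfile=infile,
#                               outputfile=outfile,
#                               intron_size=100,
#                               delete_monoexonic=True)
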
def coverage(inputfile=None,
             outputfile=None,
             bw_list=None,
             labels=None,
             pseudo_count=1,
             nb_window=1,
             ft_type="promoter",
             n_highest=None,
             downstream=1000,
             key_name="cov",
             zero_to_na=False,
             name_column=None,
             upstream=1000,
             chrom_info=None,
             nb_proc=1,
             matrix_out=False,
             stat='mean'):
    """Compute transcript coverage with one or several bigWig files."""

    # -------------------------------------------------------------------------
    # Create a list of labels, taking user input into account.
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.",
                type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant.
        if len(labels) > len(set(labels)):
            message("Labels must be unique.",
                    type="ERROR")
    else:
        # Derive default labels from the bigwig base names.
        labels = []
        for i in range(len(bw_list)):
            labels += [
                os.path.splitext(
                    os.path.basename(
                        bw_list[i]))[0]]

    # -------------------------------------------------------------------------
    # Check the number of windows
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of windows used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check whether the input file is in BED or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    # -------------------------------------------------------------------------

    name_column = name_column.split(",")

    if is_gtf:
        message("Getting regions of interest...")

        if ft_type.lower() == "intergenic":
            region_bo = gtf.get_intergenic(chrom_info,
                                           0,
                                           0).slop(s=True,
                                                   l=upstream,
                                                   r=downstream,
                                                   g=chrom_info.name).sort()

        elif ft_type.lower() == "intron":
            region_bo = gtf.get_introns().slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type == "intron_by_tx":
            region_bo = gtf.get_introns(by_transcript=True,
                                        name=name_column).slop(s=True,
                                                               l=upstream,
                                                               r=downstream,
                                                               g=chrom_info.name).sort()

        elif ft_type.lower() in ["promoter", "tss"]:
            region_bo = gtf.get_tss(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        elif ft_type.lower() in ["tts", "terminator"]:
            region_bo = gtf.get_tts(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        else:
            region_bo = gtf.select_by_key(
                "feature",
                ft_type, 0
            ).to_bed(name=name_column).slop(s=True,
                                            l=upstream,
                                            r=downstream,
                                            g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()
    region_bed = make_tmp_file(prefix="region", suffix=".bed")

    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    # -------------------------------------------------------------------------

    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # Reorder columns to chrom, start, end, name, strand, coverage.
        df_first = df_first[[0, 1, 2, 3, 5, 4]]

        df_list = []

        for i in range(len(labels)):
            # Create a sub data frame containing the coverage values of the
            # current bigwig, then strip the "label|" prefix from the name.
            str_to_find = r"^" + labels[i] + r"\|"

            tmp_df = df_first[df_first[3].str.match(str_to_find)].copy()

            tmp_df.iloc[:, 3] = tmp_df.iloc[:, 3].replace(str_to_find,
                                                          r"", regex=True)

            df_list += [tmp_df]

        df_final = df_list.pop(0)

        for i in df_list:
            # Add columns to df_final by joining on
            # chrom, start, end, name, strand.
            df_final = df_final.merge(i.iloc[:, list(range(6))],
                                      on=[0, 1, 2, 3, 5])

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for i in result_bed:
            outputfile.write(i)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...",
                    type="ERROR")

    gc.disable()
    close_properly(inputfile, outputfile)
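
# A minimal usage sketch for `coverage` (hypothetical file names; `bw_list`
# and `chrom_info` only need objects exposing a `.name` attribute, such as
# open file handles). Computes mean coverage around TSS (+/- 500 bp) for two
# bigWigs and writes a matrix with one column per sample:
#
#     with open("input.gtf") as infile, \
#             open("cov.txt", "w") as outfile, \
#             open("hg38.chromInfo") as chrominfo:
#         coverage(inputfile=infile,
#                  outputfile=outfile,
#                  bw_list=[open("a.bw", "rb"), open("b.bw", "rb")],
#                  labels="sampleA,sampleB",
#                  ft_type="promoter",
#                  upstream=500,
#                  downstream=500,
#                  name_column="transcript_id",
#                  chrom_info=chrominfo,
#                  matrix_out=True)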