def convert(inputfile=None,
            outputfile=None,
            format="bed",
            names="gene_id,transcript_id",
            separator="|",
            more_names=''):
    """
    Write the content of a GTF in a BED-like format.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the converted lines.
    :param format: "bed3" writes seqid/start/end lines; "bed" or "bed6" \
    delegates to GTF.write_bed(). Any other value writes nothing.
    :param names: Forwarded to write_bed() as 'name'.
    :param separator: Forwarded to write_bed() as 'sep'.
    :param more_names: Forwarded to write_bed() as 'more_name'.
    """

    if format in ("bed", "bed6"):
        loaded = GTF(inputfile, check_ensembl_format=False)
        loaded.write_bed(outputfile=outputfile,
                         name=names,
                         sep=separator,
                         more_name=more_names)
    elif format == "bed3":
        loaded = GTF(inputfile, check_ensembl_format=False)
        coords = loaded.extract_data("seqid,start,end",
                                     as_list_of_list=True,
                                     hide_undef=False,
                                     no_na=False)
        for fields in coords:
            # The start is shifted by one (one-based GTF start to
            # zero-based BED start).
            fields[1] = str(int(fields[1]) - 1)
            outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def count(inputfile=None, outputfile=None, header=None, additional_text=None):
    """
    Write, for each value of the 'feature' column, its number of occurrences.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param header: Optional comma-separated column names written (tab-joined) \
    as a first line.
    :param additional_text: Optional text appended as a third column to \
    every line.
    """

    header_fields = None if header is None else header.split(",")

    gtf = GTF(inputfile, check_ensembl_format=False)

    # An OrderedDict: features are reported in order of first appearance.
    occurrences = OrderedDict()
    for record in gtf.extract_data("feature"):
        feature = record[0]
        occurrences[feature] = occurrences.get(feature, 0) + 1

    if header_fields is not None:
        outputfile.write("\t".join(header_fields) + "\n")

    for feature, nb in occurrences.items():
        row = feature + "\t" + str(nb)
        if additional_text is not None:
            row = row + "\t" + additional_text
        outputfile.write(row + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Turn each line of a BED file into a GTF line.

    :param inputfile: An open BED file (or stdin). Only its name is given \
    to BedTool; stdin is first copied to a temporary file.
    :param outputfile: A writable file object receiving the GTF lines.
    :param ft_type: Value of the feature column. "exon" lines get gene_id, \
    transcript_id and exon_id attributes, "gene" lines only gene_id, any \
    other value gene_id and transcript_id. All are set to the BED name.
    :param source: Value of the source column.
    """

    message("Converting the bed file into GTF file.")

    if inputfile.name != '<stdin>':
        bed_obj = BedTool(inputfile.name)
    else:
        # BedTool is built from a path: dump the stream to disk first.
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for bed_line in inputfile:
            write_properly(chomp(str(bed_line)), tmp_file)

        tmp_file.close()
        inputfile.close()

        bed_obj = BedTool(tmp_file.name)

    for rank, region in enumerate(bed_obj, start=1):

        # Fill in the columns the BED line left empty.
        if region.strand == "":
            region.strand = "."
        if region.name == "":
            region.name = str("feature_" + str(rank))
        if region.score == "":
            region.score = "0"

        gene_part = "gene_id \"" + region.name
        tx_part = "transcript_id \"" + region.name

        if ft_type == "gene":
            key_value = gene_part + "\";"
        elif ft_type == "exon":
            key_value = gene_part + "\"; " + \
                        tx_part + "\"; " + \
                        "exon_id \"" + region.name + "\";"
        else:
            key_value = gene_part + "\"; " + \
                        tx_part + "\";"

        chrom_out = "chr" + region.chrom if pygtftk.utils.ADD_CHR == 1 else region.chrom

        gtf_fields = [chrom_out,
                      source,
                      ft_type,
                      # BED start + 1 gives the one-based GTF start.
                      str(region.start + 1),
                      str(region.end),
                      str(region.score),
                      region.strand,
                      ".",
                      key_value]

        write_properly("\t".join(gtf_fields), outputfile)

    gc.disable()
    close_properly(outputfile)
def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Write the intronic regions returned by GTF.get_introns().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object (one region per line).
    :param names: Comma-separated keys forwarded as 'name' (by_transcript only).
    :param separator: Forwarded as 'sep' (by_transcript only).
    :param intron_nb_in_name: Forwarded as is (by_transcript only).
    :param no_feature_name: Negated and forwarded as 'feat_name' \
    (by_transcript only).
    :param by_transcript: When False, get_introns() is called without argument.
    """

    message("Searching for intronic regions.")

    # The GTF is loaded first (needed when it comes from <stdin>).
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
    else:
        introns_bo = gtf.get_introns()

    for intron in introns_bo:
        write_properly(chomp(str(intron)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def get_attr_value_list(inputfile=None,
                        outputfile=None,
                        key_name="gene_id",
                        print_key_name=False,
                        separator="\n",
                        count=False):
    """
    Write the values observed for one or several attributes.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param key_name: Comma-separated list of attribute names.
    :param print_key_name: Prefix each line with the key and the separator.
    :param separator: Placed between columns. In count mode a newline \
    separator is replaced by a tabulation.
    :param count: Request (value, count) pairs from the GTF object and write \
    both columns.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if count:
        if separator == "\n":
            separator = "\t"

        for akey in key_name.split(","):
            prefix = akey + separator if print_key_name else ""
            for pair in gtf.get_attr_value_list(akey, count=True):
                outputfile.write(prefix + pair[0] + separator + pair[1] + "\n")
    else:
        for akey in key_name.split(","):
            prefix = akey + separator if print_key_name else ""
            for value in gtf.get_attr_value_list(akey):
                outputfile.write(prefix + value + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Write the midpoint coordinates of GTF features or of BED regions.

    :param inputfile: An open file; only its name is used. '<stdin>' is \
    handled as GTF, otherwise the type is the one reported by BedTool.
    :param outputfile: A writable file object.
    :param ft_type: The feature whose midpoints are requested (GTF only).
    :param names: Comma-separated keys forwarded as 'name' (GTF only).
    :param separator: Forwarded as 'sep' (GTF only).
    """

    message("Loading input file...")

    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")
        is_gtf = region_bo.file_type == 'gff'

    if not is_gtf:

        for region in region_bo:

            span = region.end - region.start
            half = int(span // 2)

            if span % 2 == 0:
                # Even length, no single central base: keep the two
                # central ones. E.g. 10-14 (zero-based) becomes 11-13.
                region.start = region.start + half - 1
                region.end = region.start + 2
            else:
                # Odd length: keep the central base.
                # E.g. 10-13 (zero-based) becomes 11-12.
                region.end = region.start + half + 1
                region.start = region.end - 1

            outputfile.write(str(region))
    else:

        gtf = GTF(inputfile.name, check_ensembl_format=False)
        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.get_midpoints(name=names.split(","),
                                         sep=separator)
        for bed_line in bed_obj:
            write_properly(chomp(str(bed_line)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def del_attr(inputfile=None,
             outputfile=None,
             key="transcript_id",
             reg_exp=False,
             invert_match=False):
    """
    Delete extended attributes in the target gtf file.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key: A comma-separated list of attribute names or, when reg_exp \
    is set, a regular expression searched in each extended attribute name.
    :param reg_exp: Interpret 'key' as a regular expression.
    :param invert_match: Delete every extended attribute except the \
    selected ones.
    """

    # ----------------------------------------------------------------------
    # Read the GTF and get the list of attributes
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    attr_list = gtf.attr_extended

    # ----------------------------------------------------------------------
    # If regExp, select the corresponding keys
    # ----------------------------------------------------------------------

    if reg_exp:
        try:
            rgxp = re.compile(key)
        except re.error:
            # NOTE(review): message(type="ERROR") presumably aborts; rgxp is
            # unbound past this point otherwise.
            message("Check the regular expression please.", type="ERROR")

        key_list = [attr for attr in attr_list if rgxp.search(attr)]
    else:
        key_list = key.split(",")

    # ----------------------------------------------------------------------
    # If invert-match select all but the selected
    # ----------------------------------------------------------------------

    if invert_match:
        key_to_del = [attr for attr in attr_list if attr not in key_list]
    else:
        key_to_del = key_list

    # ----------------------------------------------------------------------
    # Delete the keys
    # ----------------------------------------------------------------------

    # key_to_del (not key_list) must be used here, otherwise invert_match
    # has no effect.
    gtf.del_attr(feat="*",
                 keys=",".join(key_to_del),
                 force=True).write(outputfile,
                                   gc_off=True)

    close_properly(outputfile, inputfile)
def convert_ensembl(inputfile=None, outputfile=None, no_check_gene_chr=False):
    """
    Write the result of GTF.convert_to_ensembl() for the input GTF.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param no_check_gene_chr: Negated and forwarded as 'check_gene_chr'.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    converted = gtf.convert_to_ensembl(check_gene_chr=not no_check_gene_chr)
    converted.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def seqid_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the non-redundant list of seqid/chromosomes of a GTF.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param separator: Text written after each seqid (the last one included).
    """

    chroms = GTF(inputfile, check_ensembl_format=False).get_chroms(nr=True)

    for seqid in chroms:
        outputfile.write(str(seqid) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
def add_exon_nb(inputfile=None, outputfile=None, exon_numbering_key=None):
    """
    Write the GTF returned by GTF.add_exon_number().

    :param inputfile: An open GTF file; only its name is given to the GTF \
    constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param exon_numbering_key: Forwarded as is to add_exon_number().
    """

    message("Calling nb_exons.", type="DEBUG")

    gtf = GTF(inputfile.name, check_ensembl_format=False)
    numbered = gtf.add_exon_number(exon_numbering_key)
    numbered.write(outputfile, gc_off=True)

    close_properly(inputfile, outputfile)
def get_feature_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the non-redundant list of features found in the GTF.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param separator: Text written after each feature (the last one included).
    """

    features = GTF(inputfile,
                   check_ensembl_format=False).get_feature_list(nr=True)

    for feature in features:
        outputfile.write(str(feature) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_numeric_value(inputfile=None,
                            outputfile=None,
                            test=None,
                            na_omit=None):
    """
    Write the GTF returned by GTF.eval_numeric() for the given test.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param test: The test expression, forwarded as is to eval_numeric().
    :param na_omit: Forwarded as is to eval_numeric().
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    selected = gtf.eval_numeric(test, na_omit=na_omit)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_regexp(inputfile=None,
                     outputfile=None,
                     key=None,
                     regexp=None,
                     invert_match=False):
    """
    Write the GTF returned by GTF.select_by_regexp().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key: The key, forwarded as first positional argument.
    :param regexp: The regular expression, forwarded as second argument.
    :param invert_match: Forwarded as third positional argument.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    selected = gtf.select_by_regexp(key, regexp, invert_match)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def add_prefix(inputfile=None,
               outputfile=None,
               key="transcript_id",
               text=None,
               target_feature="*",
               suffix=False):
    """
    Write the GTF returned by GTF.add_prefix().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key: The key whose values are targeted.
    :param text: The text to add.
    :param target_feature: The feature(s) to which the change applies.
    :param suffix: Forwarded as the fourth positional argument of \
    GTF.add_prefix().
    """

    modified = GTF(inputfile,
                   check_ensembl_format=False).add_prefix(target_feature,
                                                          key,
                                                          text,
                                                          suffix)
    modified.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def merge_attr(inputfile=None,
               outputfile=None,
               src_key="gene_id,transcript_id",
               separator="|",
               target_feature="*",
               dest_key="gene_tx_ids"):
    """
    Write the GTF returned by GTF.merge_attr().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param src_key: The source keys (comma-separated).
    :param separator: The separator forwarded to merge_attr().
    :param target_feature: The feature(s) to which the change applies.
    :param dest_key: The name of the destination key.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    merged = gtf.merge_attr(target_feature, src_key, dest_key, separator)
    merged.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param number: The number of IDs to draw (an integer is required). \
    Lowered, with a warning, to the number of available IDs when too large.
    :param ft_type: 'gene' to draw gene IDs. Any other value (None included) \
    draws transcript IDs.
    :param seed_value: Optional seed given to random.seed() (version=1) to \
    make the draw reproducible.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    # The same key is used both to collect the IDs and to select the lines.
    # Deriving the selection key from ft_type alone would fail on
    # ft_type=None and would not match the extracted IDs for other values.
    id_key = "gene_id" if ft_type == 'gene' else "transcript_id"

    id_list = gtf.extract_data(id_key,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    if number > len(id_list):
        message("To much feature. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    gtf.select_by_key(id_key,
                      ",".join(id_list)).write(outputfile,
                                               gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Write the GTF returned by GTF.select_by_number_of_exons().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param min_exon_number: Lower limit, forwarded as first argument.
    :param max_exon_number: Upper limit, forwarded as second argument.
    """

    message("Selecting transcript by exon number (range: [{m},{M}])".format(
        m=str(min_exon_number),
        M=str(max_exon_number)))

    loaded = GTF(inputfile, check_ensembl_format=False)
    selected = loaded.select_by_number_of_exons(min_exon_number,
                                                max_exon_number)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def intron_sizes(inputfile=None,
                 outputfile=None,
                 key_name=None):
    """
    Add to transcript features a key holding a comma-separated list of
    intron sizes ("0" for transcripts without intron).

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key_name: The name of the new key.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    tx_lines = gtf.select_by_key("feature", "transcript")
    strands = tx_lines.extract_data("transcript_id,strand",
                                    as_dict_of_values=True,
                                    no_na=True,
                                    nr=True,
                                    hide_undef=True)

    # Every transcript gets an entry, even those without intron.
    sizes_by_tx = {tx: [] for tx in all_tx_ids}

    for intron in intron_bo:
        sizes_by_tx[intron.name].append(str(intron.end - intron.start))

    for tx_id, sizes in sizes_by_tx.items():
        if not sizes:
            sizes_by_tx[tx_id] = "0"
            continue
        if strands[tx_id] == "-":
            # The list is reversed for minus-strand transcripts
            # (presumably so that sizes read 5' to 3' - TODO confirm).
            sizes = reversed(sizes)
        sizes_by_tx[tx_id] = ",".join(sizes)

    if sizes_by_tx:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=sizes_by_tx,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add to transcript features a key holding a comma-separated list of
    exon sizes.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key_name: The name of the new key.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if not len(exons_starts):
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = {}

    for tx_id in all_tx_ids:
        # Coordinates are inclusive on both ends, hence the +1.
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(exons_starts[tx_id],
                                       exons_ends[tx_id])]
        if strands[tx_id] == "-":
            sizes = reversed(sizes)
        tx_to_size_list[tx_id] = ",".join(sizes)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Keep, for each gene, up to max_transcript randomly chosen transcripts.

    Lines whose feature is 'gene' are discarded when the GTF is loaded.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param max_transcript: The maximum number of transcripts kept per gene.
    :param seed_value: Optional seed given to random.seed() (version=1).
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature",
                                       "gene",
                                       invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gene_id in gene2tx:
        candidates = gene2tx[gene_id]
        nb_kept = min(max_transcript, len(candidates))
        # Positions (not IDs) are drawn; what was not drawn is discarded.
        kept_positions = random.sample(list(range(len(candidates))), nb_kept)
        tx_to_delete.extend(tx_id
                            for pos, tx_id in enumerate(candidates)
                            if pos not in kept_positions)

    message("Printing results")

    message("Selecting transcript.")

    remaining = gtf.select_by_key("transcript_id",
                                  ",".join(tx_to_delete),
                                  invert_match=True)
    remaining.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Add attributes to a GTF from a set of matrix files, one file after
    the other.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param target_feature: Comma-separated features to annotate ("*" is \
    accepted). Defaults to all the features found in the GTF.
    :param key_to_join: The key used for joining.
    :param matrix_files: Open files; only their names are used.
    """

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(feat_list)
    else:
        for requested in target_feature.split(","):
            if requested not in feat_list + ["*"]:
                message("Feature " + requested + " not found.",
                        type="ERROR")

    # -----------------------------------------------------------
    #  Do it
    # -----------------------------------------------------------

    for matrix in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=matrix.name)

    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def get_attr_list(inputfile=None,
                  outputfile=None,
                  separator="\n"):
    """
    Write the list of attributes returned by GTF.get_attr_list().

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param separator: Written after each attribute except the last one.
    """

    attr_list = GTF(inputfile, check_ensembl_format=False).get_attr_list()

    last_pos = len(attr_list) - 1

    for pos, attr in enumerate(attr_list):
        outputfile.write(attr if pos == last_pos else attr + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Write the regions returned by GTF.get_intergenic(), named
    region_1, region_2...

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object (one region per line).
    :param chrom_info: Forwarded as is to get_intergenic().
    """

    message("Searching for intergenic regions.")

    regions = GTF(inputfile).get_intergenic(chrom_info)

    for rank, region in enumerate(regions, start=1):
        region.name = "region_" + str(rank)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
    Write, for each requested key, the number of values found in the GTF.

    Values equal to '.' or '?' are ignored.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param keys: Comma-separated list of keys, or "*" for every attribute \
    returned by GTF.get_attr_list().
    :param uniq: Count distinct values rather than occurrences.
    :param additional_text: Optional text appended as a third column.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    # A set per key discards duplicates, a list per key keeps them.
    val_list = defaultdict(set) if uniq else defaultdict(list)
    collect = set.add if uniq else list.append

    for record in gtf.extract_data(keys, as_list_of_list=True):
        for a_key, value in zip(key_list, record):
            if value not in ['.', '?']:
                collect(val_list[a_key], value)

    for a_key in key_list:
        row = a_key + "\t" + str(len(val_list[a_key]))
        if additional_text is not None:
            row = row + "\t" + additional_text
        outputfile.write(row + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the exon lines of each transcript.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param key_name: The name of the key added to transcript features \
    (GTF output only).
    :param text_format: Write 'transcript_id<TAB>count<TAB>transcript' \
    lines instead of an annotated GTF.
    """

    gtf = GTF(inputfile)

    # -------------------------------------------------------------------------
    #  Computing number of exon for each transcript in input GTF file
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    n_exons = defaultdict(int)

    exon_lines = gtf.select_by_key("feature", "exon")

    for record in exon_lines.extract_data("transcript_id"):
        n_exons[record[0]] += 1

    if not text_format:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        for tx_id, nb in n_exons.items():
            outputfile.write(tx_id + "\t" + str(nb) + "\ttranscript\n")

    close_properly(outputfile, inputfile)
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Compute the number of transcripts of each gene.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param text_format: Write 'gene_id<TAB>count' lines instead of an \
    annotated GTF.
    :param key_name: The name of the key added to gene features (GTF \
    output only).
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Gene to transcript mapping as returned by GTF.get_gn_to_tx().
    gene_to_tx = gtf.get_gn_to_tx()

    if text_format:
        for gene_id in gene_to_tx:
            outputfile.write(gene_id + "\t" + str(len(gene_to_tx[gene_id])) + "\n")
    else:
        # The counts go through a temporary two-column file that is then
        # joined to gene lines on gene_id.
        tmp_file = make_tmp_file(prefix="nb_tx", suffix=".txt")

        for gene_id in gene_to_tx:
            tmp_file.write(gene_id + "\t" + str(len(gene_to_tx[gene_id])) + "\n")

        tmp_file.close()

        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=tmp_file.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def divergent(inputfile=None,
              outputfile=None,
              key_name=None,
              upstream=1500,
              downstream=1500,
              chrom_info=None,
              no_strandness=False,
              no_annotation=False):
    """
    Find transcripts whose promoter region contains the TSS of a
    transcript from another gene.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key_name: Name of the key holding the ID of the closest \
    divergent transcript (default "divergent"). The distance goes to \
    "dist_" + key_name ("dist_to_divergent" by default).
    :param upstream: Forwarded to slop() as 'l'.
    :param downstream: Forwarded to slop() as 'r'.
    :param chrom_info: An open chromosome-size file; only its name is used.
    :param no_strandness: Intersect with S=False instead of S=True.
    :param no_annotation: Write only the transcripts having a divergent \
    transcript instead of annotating the whole GTF.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # TSS position of each transcript (names are 'tx_id||gene_id').
    tss_pos = {}

    for tss in tss_bo:
        tx_id_tss, gn_id_tss = tss.name.split("||")
        tss_pos[tx_id_tss] = int(tss.start)

    message("Getting promoter coordinates.")

    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    tx_with_divergent = {}
    dist_to_divergent = {}

    for hit in prom_with_tss_bo:

        # Column 4 names the promoter, column 10 the overlapped TSS.
        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        if gene_id_prom == gn_id_tss:
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # Keep the first candidate, then any strictly closer one.
        if tx_id_prom not in tx_with_divergent or dist < dist_to_divergent[tx_id_prom]:
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        selected = gtf.select_by_key("transcript_id",
                                     ",".join(list(tx_with_divergent.keys())))
        selected.write(outputfile, gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """
    Convert a GTF to tabulated format.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object.
    :param key: The keys to extract; "all" or "*" for every attribute.
    :param no_unset: Skip lines in which a field is ".".
    :param unique: Write each distinct line only once.
    :param no_basic: With "all"/"*", request the attribute list with \
    add_basic=False.
    :param accept_undef: Keep lines in which a field is "?" (they are \
    skipped by default).
    :param select_gene_ids: Shortcut for key="gene_id".
    :param select_gene_names: Shortcut for key="gene_name".
    :param select_transcript_ids: Shortcut for key="transcript_id".
    :param select_exon_ids: Shortcut for key="exon_id".
    :param separator: The column separator.
    :param no_header: Do not write the column names.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # Gene names live in 'gene_name'; using 'gene_id' here would make
        # this flag a duplicate of select_gene_ids.
        key = "gene_name"
    elif select_exon_ids:
        key = "exon_id"

    # Field values that disqualify a whole line.
    rejected = ()
    if no_unset:
        rejected += (".",)
    if not accept_undef:
        rejected += ("?",)

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames),
                       outputfile)

    message("Writing")

    try:
        printed = set()

        for row in tab:

            if unique:
                row_key = tuple(row)
                if row_key in printed:
                    continue

            if any(field in rejected for field in row.fields):
                continue

            row.write(outputfile, separator)

            if unique:
                # Only lines actually written are remembered.
                printed.add(row_key)

    except (BrokenPipeError, IOError):
        # The reader went away (e.g. 'gtftk tabulate | head'). Further
        # writes and flushes on stdout are turned into no-ops.
        def _void_f(*args, **kwargs):
            pass

        message("Received a boken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """
    Create a new key by discretizing a numeric key.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param src_key: The key holding the numeric values. Values that can \
    not be converted to float are ignored.
    :param dest_key: The name of the new key.
    :param nb_levels: The number of classes (at least 2).
    :param percentiles: Use percentiles as breaks instead of equal-width bins.
    :param percentiles_of_uniq: Compute percentiles on distinct values.
    :param precision: Number of decimals of the generated class names \
    (0 turns the limits into integers).
    :param log: Discretize log2(value + 1).
    :param labels: Optional comma-separated class names; there must be \
    nb_levels of them, all different.
    """

    if nb_levels < 2:
        message("--nb-levels has to be greater than or equal to 2.",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Check labels and nb_levels
    # -------------------------------------------------------------------------

    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message("The number of labels should be the same as the number of levels.",
                    type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Load GTF. Retrieve values for src-key
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    if all(x in ['.', '?'] for x in src_values):
        message('The key was not found in this GTF.',
                type="ERROR")

    # Numeric values and the position each one came from.
    dest_values = []
    dest_pos = []

    for pos, value in enumerate(src_values):
        try:
            as_float = float(value)
        except ValueError:
            continue
        dest_values.append(as_float)
        dest_pos.append(pos)

    min_val = min(dest_values) if dest_values else None
    max_val = max(dest_values) if dest_values else None

    if min_val is None:
        message("Did not find numeric values in the source key.",
                type="ERROR")

    if min_val == max_val:
        message("The minimum and maximum values found in the source key are the same.",
                type="ERROR")

    if log:
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING",
                    force=True)
            message("Adding a pseudocount (+1).",
                    type="WARNING",
                    force=True)

        # NOTE(review): indentation was lost in the reviewed source; the
        # transformation is taken to apply whenever 'log' is set (there is
        # no alternative branch) - confirm.
        pseudo_count = 1
        dest_values = list(np.log2([x + pseudo_count for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Lower the minimum slightly so that the smallest value falls inside
    # the first (left-open) interval.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    # Compute percentiles if required
    # -------------------------------------------------------------------------

    if percentiles:
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values

        n = nb_levels

        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(0, n)]
        q = q + [np.percentile(dest_values_tmp, 100)]

        if len(q) != len(set(q)):
            message("No ties are accepted in percentiles.",
                    type="WARNING",
                    force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    # Create a factor
    # -------------------------------------------------------------------------

    (breaks, cat_label) = pandas.cut(dest_values,
                                     bins=q if percentiles else nb_levels,
                                     labels=labels,
                                     retbins=True)

    if labels is None:
        # The first bin edge returned by pandas lies below the data range:
        # show the actual minimum instead.
        cat_label[0] = min(dest_values)
        cat_label = [round(x, precision) for x in cat_label]
        if precision == 0:
            cat_label = [int(x) for x in cat_label]

        # Turn consecutive edges into names such as '[1.0_2.5]', '(2.5_4.0]'.
        cat_label = [str(x) for x in list(zip(cat_label[:-1], cat_label[1:]))]
        cat_label[0] = cat_label[0].replace("(", "[")
        cat_label = [x.replace(")", "]") for x in cat_label]
        cat_label = [str(x).replace(", ", "_") for x in cat_label]

        # Assigning to 'categories' is no longer supported by recent pandas
        # releases; rename_categories() is the portable equivalent.
        breaks = breaks.rename_categories(cat_label)

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    # Write to disk
    # -------------------------------------------------------------------------

    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    with tmp_file as tp_file:
        for pos, level in zip(dest_pos, breaks):
            tp_file.write(str(pos) + "\t" + str(level) + '\n')

    gtf.add_attr_to_pos(tmp_file,
                        new_key=dest_key).write(outputfile,
                                                gc_off=True)

    close_properly(outputfile, inputfile)
def overlapping(inputfile=None,
                outputfile=None,
                key_name=None,
                upstream=1500,
                downstream=1500,
                chrom_info=None,
                feature_type='transcript',
                same_strandedness=False,
                diff_strandedness=False,
                annotate_gtf=False,
                bool=False,
                annotate_all=False,
                invert_match=False):
    """
    Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    :param inputfile: The GTF input, handed as is to the GTF constructor.
    :param outputfile: A writable file object receiving the resulting GTF.
    :param key_name: The name of the key added with annotate_gtf. Built \
    from feature_type, upstream and downstream when None.
    :param upstream: Forwarded to slop() as 'l'.
    :param downstream: Forwarded to slop() as 'r'.
    :param chrom_info: An open chromosome-size file; only its name is used.
    :param feature_type: "transcript", "promoter" or "tts".
    :param same_strandedness: Forwarded to intersect() as 's'.
    :param diff_strandedness: Forwarded to intersect() as 'S'.
    :param annotate_gtf: Annotate transcripts instead of selecting them.
    :param bool: Store "0"/"1" rather than the overlapping transcripts.
    :param annotate_all: Start with an (empty) entry for every transcript.
    :param invert_match: Select transcripts that are not in the result.
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for tx in overlapping_tx:
            overlapping_tx[tx] = []
    else:
        overlapping_tx = defaultdict(list)

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    def _slop(bed):
        # Extend the regions and keep the first six columns.
        return bed.slop(s=True,
                        l=upstream,
                        r=downstream,
                        g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    if feature_type == "transcript":
        bed_obj = _slop(tx_bed)
    elif feature_type == "promoter":
        bed_obj = _slop(tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                        sep="||"))
    elif feature_type == "tts":
        bed_obj = _slop(tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                        sep="||"))
    else:
        message("Not implemented yet",
                type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for hit in overlap_regions:

        # Column 4 names the query region, column 10 the overlapped
        # transcript (both as 'tx_id||gene_id').
        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")

        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for tx in overlapping_tx:
            overlapping_tx[tx] = "1" if len(overlapping_tx[tx]) else "0"

    if invert_match:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile,
                                              gc_off=True)
    elif annotate_gtf:
        if len(overlapping_tx):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        value = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile,
                                       gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)