def convert(inputfile=None, outputfile=None, format="bed",
            names="gene_id,transcript_id", separator="|", more_names=''):
    """
    Convert a GTF to various formats.

    :param inputfile: Input GTF file object.
    :param outputfile: Output file object.
    :param format: Target format ('bed3', 'bed' or 'bed6').
    :param names: Comma-separated attribute keys used to build the BED name field.
    :param separator: Separator joining the name components.
    :param more_names: Extra text appended to the name field.
    """
    if format == "bed3":
        gtf = GTF(inputfile, check_ensembl_format=False)
        for fields in gtf.extract_data("seqid,start,end",
                                       as_list_of_list=True,
                                       hide_undef=False,
                                       no_na=False):
            # GTF coordinates are 1-based; BED starts are 0-based.
            fields[1] = str(int(fields[1]) - 1)
            outputfile.write("\t".join(fields) + "\n")
    elif format in ["bed", "bed6"]:
        # write_bed streams directly to outputfile; its return value was
        # previously bound to an unused variable.
        GTF(inputfile, check_ensembl_format=False).write_bed(outputfile=outputfile,
                                                             name=names,
                                                             sep=separator,
                                                             more_name=more_names)
    gc.disable()
    close_properly(outputfile, inputfile)
def count(inputfile=None, outputfile=None, header=None, additional_text=None):
    """
    Count the number of features (lines per feature type) in the GTF file.

    :param header: Optional comma-separated header written first.
    :param additional_text: Optional text appended to each output line.
    """
    if header is not None:
        header = header.split(",")

    gtf = GTF(inputfile, check_ensembl_format=False)

    # Preserve first-seen order of feature types.
    feat_nb = OrderedDict()
    for rec in gtf.extract_data("feature"):
        feat = rec[0]
        feat_nb[feat] = feat_nb.get(feat, 0) + 1

    if header is not None:
        outputfile.write("\t".join(header) + "\n")

    for feat, nb in feat_nb.items():
        if additional_text is None:
            outputfile.write(feat + "\t" + str(nb) + "\n")
        else:
            outputfile.write(feat + "\t" + str(nb) + "\t" + additional_text + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def intronic(inputfile=None, outputfile=None, names='transcript_id',
             separator="_", intron_nb_in_name=False, no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions from a GTF file.

    :param by_transcript: Report introns per transcript rather than merged.
    :param names: Attribute keys used to name per-transcript introns.
    """
    message("Searching for intronic regions.")

    # Need to load if the gtf comes from <stdin>.
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
    else:
        introns_bo = gtf.get_introns()

    for intron in introns_bo:
        write_properly(chomp(str(intron)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
def get_attr_value_list(inputfile=None, outputfile=None, key_name="gene_id",
                        print_key_name=False, separator="\n", count=False):
    """
    Print the values observed for one or several attribute keys.

    :param print_key_name: Prefix each value with its key name.
    :param count: Also print the number of occurrences of each value.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    if count:
        # A newline separator makes no sense for key/count pairs.
        if separator == "\n":
            separator = "\t"
        for akey in key_name.split(","):
            for item in gtf.get_attr_value_list(akey, count=True):
                if print_key_name:
                    outputfile.write(akey + separator + item[0] + separator + item[1] + "\n")
                else:
                    outputfile.write(item[0] + separator + item[1] + "\n")
    else:
        for akey in key_name.split(","):
            for value in gtf.get_attr_value_list(akey):
                if print_key_name:
                    outputfile.write(akey + separator + value + "\n")
                else:
                    outputfile.write(value + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def midpoints(inputfile=None, outputfile=None, ft_type="transcript",
              names="transcript_id", separator="|"):
    """ Get the midpoint coordinates for the requested feature. """
    message("Loading input file...")
    # Input from stdin is assumed to be GTF; otherwise sniff the file type
    # with BedTool (gff -> treat as GTF, anything else -> BED-like regions).
    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")
        if region_bo.file_type == 'gff':
            is_gtf = True
        else:
            is_gtf = False

    if is_gtf:
        # GTF path: delegate midpoint computation to the GTF object.
        gtf = GTF(inputfile.name, check_ensembl_format=False)
        bed_obj = gtf.select_by_key("feature",
                                    ft_type).get_midpoints(name=names.split(","),
                                                           sep=separator)
        for line in bed_obj:
            write_properly(chomp(str(line)), outputfile)
    else:
        # BED path: compute the midpoint interval in place for each region.
        # NOTE: the assignment order below matters — each branch derives one
        # coordinate from the other, freshly assigned one.
        for line in region_bo:
            diff = line.end - line.start
            if diff % 2 != 0:
                # Odd length: a single central base exists.
                # e.g 10-13 (zero based) -> 11-13 one based
                # mipoint is 12 (one-based) -> 11-12 (zero based)
                # e.g 949-1100 (zero based) -> 950-1100 one based
                # mipoint is 1025 (one-based) -> 1024-1025 (zero based)
                # floored division (python 2)...
                line.end = line.start + int(diff // 2) + 1
                line.start = line.end - 1
            else:
                # Even length: no single central base — keep both central bases.
                # e.g 10-14 (zero based) -> 11-14 one based
                # mipoint is 12-13 (one-based) -> 11-13 (zero based)
                # e.g 9-5100 (zero based) -> 10-5100 one based
                # mipoint is 2555-2555 (one-based) -> 2554-2555 (zero based)
                # floored division (python 2)...
                # No real center. Take both
                line.start = line.start + int(diff // 2) - 1
                line.end = line.start + 2
            outputfile.write(str(line))

    gc.disable()
    close_properly(outputfile, inputfile)
def del_attr(inputfile=None, outputfile=None, key="transcript_id",
             reg_exp=False, invert_match=False):
    """
    Delete extended attributes in the target GTF file.

    :param key: Comma-separated list of attribute keys, or a regular
        expression when reg_exp is True.
    :param reg_exp: Interpret key as a regular expression.
    :param invert_match: Delete every attribute EXCEPT the selected ones.
    """
    # ----------------------------------------------------------------------
    # Read the GTF and get the list of attributes
    # ----------------------------------------------------------------------
    gtf = GTF(inputfile, check_ensembl_format=False)
    attr_list = gtf.attr_extended

    # ----------------------------------------------------------------------
    # If regExp, select the corresponding keys
    # ----------------------------------------------------------------------
    if reg_exp:
        try:
            rgxp = re.compile(key)
        except re.error:
            # Narrowed from a bare except: only catch bad patterns.
            message("Check the regular expression please.", type="ERROR")
        key_list = [attr for attr in attr_list if rgxp.search(attr)]
    else:
        key_list = key.split(",")

    # ----------------------------------------------------------------------
    # If invert-match select all but the selected
    # ----------------------------------------------------------------------
    if invert_match:
        key_to_del = [attr for attr in attr_list if attr not in key_list]
    else:
        key_to_del = key_list

    # ----------------------------------------------------------------------
    # Delete the keys
    # ----------------------------------------------------------------------
    # BUGFIX: key_to_del (not key_list) must be deleted; the previous code
    # always deleted key_list, making invert_match a no-op.
    gtf.del_attr(feat="*",
                 keys=",".join(key_to_del),
                 force=True).write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def convert_ensembl(inputfile=None, outputfile=None, no_check_gene_chr=False):
    """
    Convert the GTF file to ensembl format.

    :param no_check_gene_chr: Skip the gene/chromosome consistency check.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    ensembl_gtf = gtf.convert_to_ensembl(check_gene_chr=not no_check_gene_chr)
    ensembl_gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def seqid_list(inputfile=None, outputfile=None, separator=""):
    """Print the non-redundant seqid/chromosome names found in the GTF."""
    gtf = GTF(inputfile, check_ensembl_format=False)
    for chrom in gtf.get_chroms(nr=True):
        outputfile.write(str(chrom) + separator)
    gc.disable()
    close_properly(outputfile, inputfile)
def add_exon_nb(inputfile=None, outputfile=None, exon_numbering_key=None):
    """
    Add the exon number to each exon (based on 5' to 3' orientation).

    :param exon_numbering_key: Name of the attribute key receiving the number.
    """
    message("Calling nb_exons.", type="DEBUG")
    GTF(inputfile.name,
        check_ensembl_format=False).add_exon_number(exon_numbering_key).write(outputfile,
                                                                              gc_off=True)
    # Argument order made consistent with the other commands
    # (was close_properly(inputfile, outputfile)).
    close_properly(outputfile, inputfile)
def select_by_max_exon_nb(inputfile=None, outputfile=None):
    """For each gene, select the transcript with the highest number of exons."""
    message("Selecting transcript with the highest number of exon for each gene.")
    result = GTF(inputfile, check_ensembl_format=False).select_by_max_exon_nb()
    result.write(outputfile, gc_off=True)
def get_feature_list(inputfile=None, outputfile=None, separator=""):
    """Print the non-redundant list of features enclosed in the GTF."""
    gtf = GTF(inputfile, check_ensembl_format=False)
    for feat in gtf.get_feature_list(nr=True):
        outputfile.write(str(feat) + separator)
    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_numeric_value(inputfile=None, outputfile=None, test=None, na_omit=None):
    """
    Select lines from a GTF file based on a boolean test on numeric values.

    :param test: The boolean test to evaluate.
    :param na_omit: Values to treat as NA.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    gtf.eval_numeric(test, na_omit=na_omit).write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_by_regexp(inputfile=None, outputfile=None, key=None, regexp=None,
                     invert_match=False):
    """
    Select lines from a GTF file based on attributes and associated values.

    :param key: The attribute key to test.
    :param regexp: The regular expression applied to the key's values.
    :param invert_match: Keep non-matching lines instead.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    selection = gtf.select_by_regexp(key, regexp, invert_match)
    selection.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_most_5p_tx(inputfile=None, outputfile=None, keep_gene_lines=False):
    """
    Select the most 5' transcript of each gene.

    :param keep_gene_lines: Also keep the gene feature lines.
    """
    message("Selecting the most 5' transcript of each gene.")
    result = GTF(inputfile).select_5p_transcript()
    if not keep_gene_lines:
        # Discard gene lines (invert selection on feature == gene).
        result = result.select_by_key("feature", "gene", 1)
    result.write(outputfile, gc_off=True)
def add_prefix(inputfile=None, outputfile=None, key="transcript_id", text=None,
               target_feature="*", suffix=False):
    """
    Add a prefix (or suffix) to the values of a target key.

    :param text: The text to prepend/append.
    :param suffix: Append instead of prepend.
    """
    result = GTF(inputfile, check_ensembl_format=False).add_prefix(target_feature,
                                                                   key,
                                                                   text,
                                                                   suffix)
    result.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def merge_attr(inputfile=None, outputfile=None, src_key="gene_id,transcript_id",
               separator="|", target_feature="*", dest_key="gene_tx_ids"):
    """
    Merge a set of attributes into a destination attribute.

    :param src_key: Comma-separated source attribute keys.
    :param dest_key: Name of the destination attribute.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    merged = gtf.merge_attr(target_feature, src_key, dest_key, separator)
    merged.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def random_list(inputfile=None, outputfile=None, number=None, ft_type=None, seed_value=None):
    """
    Select a random list of genes or transcripts.

    :param number: The number of features to select.
    :param ft_type: 'gene' or 'transcript'.
    :param seed_value: Optional seed for reproducible sampling.
    """
    message("loading the GTF.")
    gtf = GTF(inputfile)

    message("Getting ID list.")
    # Same extraction for both feature types; only the key differs.
    id_key = "gene_id" if ft_type == 'gene' else "transcript_id"
    id_list = gtf.extract_data(id_key,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    if number > len(id_list):
        # Fixed warning wording (was: "To much feature.").
        message("Too many features requested. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        # version=1 keeps sampling reproducible across Python releases.
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")
    my_id = ft_type + "_id"
    gtf.select_by_key(my_id, ",".join(id_list)).write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_by_nb_exon(inputfile=None, outputfile=None,
                      min_exon_number=None, max_exon_number=None):
    """
    Select transcripts based on their number of exons.

    :param min_exon_number: Lower bound (inclusive).
    :param max_exon_number: Upper bound (inclusive).
    """
    message("Selecting transcript by exon number (range: [{m},{M}])".format(
        m=str(min_exon_number), M=str(max_exon_number)))
    result = GTF(inputfile,
                 check_ensembl_format=False).select_by_number_of_exons(min_exon_number,
                                                                       max_exon_number)
    result.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_by_tx_size(inputfile=None, outputfile=None, min_size=None, max_size=None):
    """
    Select mature/spliced transcripts by size.

    :param min_size: Lower bound (inclusive).
    :param max_size: Upper bound (inclusive).
    """
    message("Selecting 'mature/spliced transcript by size (range: [{m},{M}]).".format(
        m=str(min_size), M=str(max_size)))
    gtf = GTF(inputfile)
    gtf.select_by_transcript_size(min_size, max_size).write(outputfile, gc_off=True)
def intron_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of intron sizes.

    :param key_name: Name of the new attribute key.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)
    strands = gtf.select_by_key("feature",
                                "transcript").extract_data("transcript_id,strand",
                                                           as_dict_of_values=True,
                                                           no_na=True,
                                                           nr=True,
                                                           hide_undef=True)

    # Collect intron sizes per transcript (transcripts without introns keep []).
    size_by_tx = {tx: [] for tx in all_tx_ids}
    for intron in intron_bo:
        size_by_tx[intron.name].append(str(intron.end - intron.start))

    # Serialize: reverse on the minus strand so sizes read 5' -> 3'.
    for tx_id, sizes in size_by_tx.items():
        if sizes:
            if strands[tx_id] == "-":
                size_by_tx[tx_id] = ",".join(reversed(sizes))
            else:
                size_by_tx[tx_id] = ",".join(sizes)
        else:
            size_by_tx[tx_id] = "0"

    if len(size_by_tx):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=size_by_tx,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of exon sizes.

    :param key_name: Name of the new attribute key.
    """
    gtf = GTF(inputfile)
    all_tx_ids = gtf.get_tx_ids(nr=True)

    exons_starts = gtf.select_by_key("feature",
                                     "exon").extract_data("transcript_id,start",
                                                          as_dict_of_merged_list=True,
                                                          no_na=True,
                                                          nr=False)
    if not len(exons_starts):
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature",
                                   "exon").extract_data("transcript_id,end",
                                                        as_dict_of_merged_list=True,
                                                        no_na=True,
                                                        nr=False)
    strands = gtf.select_by_key("feature",
                                "transcript").extract_data("transcript_id,strand",
                                                           as_dict_of_values=True,
                                                           no_na=True,
                                                           nr=True,
                                                           hide_undef=True)

    tx_to_size_list = {}
    for tx_id in all_tx_ids:
        # Exon coordinates are 1-based and inclusive, hence the +1.
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(exons_starts[tx_id], exons_ends[tx_id])]
        # Reverse on the minus strand so sizes read 5' -> 3'.
        if strands[tx_id] == "-":
            sizes.reverse()
        tx_to_size_list[tx_id] = ",".join(sizes)

    if len(tx_to_size_list):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_by_go(inputfile=None, outputfile=None, go_id=None,
                 https_proxy=None, http_proxy=None, list_datasets=None,
                 species=None, invert_match=False):
    """
    Select lines from a GTF file using a Gene Ontology ID (e.g. GO:0050789).

    :param go_id: The GO identifier (with or without the 'GO:' prefix).
    :param list_datasets: Only print the available Biomart datasets and exit.
    :param species: The Biomart species dataset prefix.
    :param invert_match: Keep genes NOT associated with the GO term.
    """
    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    is_associated = OrderedDict()

    bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)
    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        # Just print the available datasets and leave.
        for dataset in sorted(bm.datasets):
            write_properly(dataset.replace("_gene_ensembl", ""), outputfile)
        sys.exit()
    else:
        if species + "_gene_ensembl" not in bm.datasets:
            # Fixed typo in user-facing message (was "Unknow").
            message("Unknown dataset/species.", type="ERROR")
        bm.query({'query': XML.format(species=species, go=go_id)})
        for line in bm.response.content.decode().split("\n"):
            line = line.rstrip("\n")
            if line != '':
                is_associated[line] = 1

    gtf = GTF(inputfile)
    gtf_associated = gtf.select_by_key("gene_id",
                                       ",".join(list(is_associated.keys())),
                                       invert_match)
    gtf_associated.write(outputfile, gc_off=True)
def short_long(inputfile=None, outputfile=None, longs=None, keep_gene_lines=False):
    """
    Select the shortest transcript of each gene, or the longest if the
    -l argument is used.

    :param longs: Select the longest transcripts instead of the shortest.
    :param keep_gene_lines: Also keep the gene feature lines.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    # Pick the selection method depending on the requested direction.
    selector = gtf.select_longuest_transcripts if longs else gtf.select_shortest_transcripts
    result = selector()

    if not keep_gene_lines:
        result = result.select_by_key("feature", "gene", 1)

    result.write(outputfile, gc_off=True)
def random_tx(inputfile=None, outputfile=None, max_transcript=None, seed_value=None):
    """
    Select randomly up to max_transcript transcripts for each gene.

    :param max_transcript: Maximum number of transcripts kept per gene.
    :param seed_value: Optional seed for reproducible sampling.
    """
    message("loading the GTF.")
    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")
    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")
    if seed_value is not None:
        random.seed(seed_value, version=1)

    # Build the list of transcripts to discard: for each gene keep a random
    # subset of positions and delete the rest.
    tx_to_delete = []
    for tx_list in gene2tx.values():
        nb_to_keep = min(max_transcript, len(tx_list))
        kept_pos = random.sample(list(range(len(tx_list))), nb_to_keep)
        tx_to_delete.extend(tx for pos, tx in enumerate(tx_list)
                            if pos not in kept_pos)

    message("Printing results")
    message("Selecting transcript.")
    gtf.select_by_key("transcript_id",
                      ",".join(tx_to_delete),
                      invert_match=True).write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def join_multi_file(inputfile=None, outputfile=None, target_feature=None,
                    key_to_join=None, matrix_files=()):
    """
    Join attributes from a set of tabulated files.

    :param target_feature: Comma-separated target features ('*' for all).
    :param key_to_join: The attribute key used for joining.
    :param matrix_files: Iterable of file objects to join.
    """
    # -----------------------------------------------------------
    # Load the GTF.
    # -----------------------------------------------------------
    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    # Check that requested target features exist in the file.
    # -----------------------------------------------------------
    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(feat_list)
    else:
        for feat in target_feature.split(","):
            if feat not in feat_list + ["*"]:
                message("Feature " + feat + " not found.", type="ERROR")

    # -----------------------------------------------------------
    # Join each matrix file in turn.
    # -----------------------------------------------------------
    for join_file in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)

    gtf.write(outputfile, gc_off=True)
    gc.disable()
    close_properly(outputfile, inputfile)
def get_attr_list(inputfile=None, outputfile=None, separator="\n"):
    """
    Print the list of attribute keys found in a GTF file.

    :param separator: Separator written between attribute names (none after
        the last one).
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    # join() places the separator between items only, matching the original
    # "no trailing separator" behavior.
    outputfile.write(separator.join(gtf.get_attr_list()))
    gc.disable()
    close_properly(outputfile, inputfile)
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Extract intergenic regions.

    :param chrom_info: Chromosome size information required to bound regions.
    """
    message("Searching for intergenic regions.")
    gtf = GTF(inputfile)
    # Name regions region_1, region_2, ... in order of appearance.
    for num, region in enumerate(gtf.get_intergenic(chrom_info), start=1):
        region.name = "region_" + str(num)
        write_properly(chomp(str(region)), outputfile)
    gc.disable()
    close_properly(outputfile, inputfile)
def count_key_values(inputfile=None, outputfile=None,
                     keys="gene_id,transcript_id", uniq=True,
                     additional_text=None):
    """
    Count the number of values for a set of attribute keys.

    :param keys: Comma-separated attribute keys ('*' for all).
    :param uniq: Count distinct values only.
    :param additional_text: Optional text appended to each output line.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    # Sets for distinct counts, lists for raw counts.
    collector = defaultdict(set) if uniq else defaultdict(list)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for values in gtf.extract_data(keys, as_list_of_list=True):
        for a_key, a_val in zip(key_list, values):
            # '.' and '?' denote undefined/NA values.
            if a_val in ['.', '?']:
                continue
            if uniq:
                collector[a_key].add(a_val)
            else:
                collector[a_key].append(a_val)

    for a_key in key_list:
        if additional_text is None:
            outputfile.write(a_key + "\t" + str(len(collector[a_key])) + "\n")
        else:
            outputfile.write(a_key + "\t" + str(len(collector[a_key])) +
                             "\t" + additional_text + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def nb_exons(inputfile=None, outputfile=None, key_name=None, text_format=False):
    """
    Count the number of exons of each transcript in the GTF file.

    :param key_name: Attribute key receiving the count (GTF output mode).
    :param text_format: Print a tabulated report instead of a GTF.
    """
    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # Computing number of exon for each transcript in input GTF file
    # -------------------------------------------------------------------------
    message("Computing number of exons for each transcript in input GTF file.")
    exon_lines = gtf.select_by_key("feature", "exon")
    for rec in exon_lines.extract_data("transcript_id"):
        n_exons[rec[0]] += 1

    if text_format:
        for tx_id, nb in n_exons.items():
            outputfile.write(tx_id + "\t" + str(nb) + "\ttranscript\n")
    else:
        if len(n_exons):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def nb_transcripts(inputfile=None, outputfile=None, text_format=False, key_name=""):
    """
    Compute the number of transcripts per gene.

    :param text_format: Print a tabulated report instead of a GTF.
    :param key_name: Attribute key receiving the count (GTF output mode).
    """
    gtf = GTF(inputfile)
    message("Computing the number of transcript per gene in input GTF file.")

    # Computation of transcript number is performed on exon lines
    # just in case some transcript lines would be lacking (but they should
    # not...).
    n_tx = gtf.get_gn_to_tx()

    if text_format:
        for gene_id in n_tx:
            outputfile.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")
    else:
        # Stage the counts in a temporary file, then join them back as a
        # gene attribute.
        tmp_file = make_tmp_file(prefix="nb_tx", suffix=".txt")
        for gene_id in n_tx:
            tmp_file.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")
        tmp_file.close()
        gtf.add_attr_from_file(feat="gene",
                               key="gene_id",
                               new_key=key_name,
                               inputfile=tmp_file.name).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)