def get_attr_list(
        inputfile=None,
        outputfile=None,
        separator="\n"):
    """
    Write the attribute names found in a GTF file to ``outputfile``.

    The names come from ``GTF.get_attr_list()`` and are written in the order
    returned, with ``separator`` between consecutive names and nothing after
    the final one.

    :param inputfile: The GTF input, handed to the ``GTF`` constructor.
    :param outputfile: A writable file-like object.
    :param separator: The string placed between two consecutive names.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)
    attr_list = gtf.get_attr_list()

    # Only the last element is written without a trailing separator.
    last_pos = len(attr_list) - 1

    for pos, attr_name in enumerate(attr_list):
        if pos == last_pos:
            outputfile.write(attr_name)
        else:
            outputfile.write(attr_name + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
    Count the number of values observed for a set of keys.

    One tab-separated line is written per key: the key, the number of values
    collected for it and, when provided, ``additional_text`` as a third
    column. Values equal to '.' or '?' are not counted.

    :param inputfile: The GTF input, handed to the ``GTF`` constructor.
    :param outputfile: A writable file-like object.
    :param keys: Comma-separated key names, or "*" to use every attribute \
    returned by ``GTF.get_attr_list()``.
    :param uniq: If True count distinct values, otherwise count every \
    occurrence.
    :param additional_text: Optional text appended as an extra column.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    # Select the container type and its insertion method once, so that the
    # inner loop does not have to branch on 'uniq' for every value.
    if uniq:
        collected = defaultdict(set)
        record = set.add
    else:
        collected = defaultdict(list)
        record = list.append

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for row in gtf.extract_data(keys, as_list_of_list=True):
        for key_name, value in zip(key_list, row):
            if value in ('.', '?'):
                continue
            record(collected[key_name], value)

    for key_name in key_list:
        columns = [key_name, str(len(collected[key_name]))]
        if additional_text is not None:
            columns.append(additional_text)
        outputfile.write("\t".join(columns) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def get_attr_value_list(inputfile=None,
                        outputfile=None,
                        key_name="gene_id",
                        print_key_name=False,
                        separator="\n",
                        count=False):
    """
    Get the list of values observed for one or several attributes.

    Without ``count``, each value is written on its own line, preceded by the
    key name and ``separator`` when ``print_key_name`` is set. With ``count``,
    the two leading elements of each entry returned by
    ``GTF.get_attr_value_list(key, count=True)`` are written joined by
    ``separator`` (a tab is used in place of the default newline).

    :param inputfile: The GTF input, handed to the ``GTF`` constructor.
    :param outputfile: A writable file-like object.
    :param key_name: Comma-separated key names, or '*' for every attribute \
    returned by ``GTF.get_attr_list(add_basic=True)``.
    :param print_key_name: If True, prefix each line with the key name.
    :param separator: The string placed between the fields of a line.
    :param count: If True, use the counting variant of the GTF query.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    if key_name == '*':
        key_name = ",".join(gtf.get_attr_list(add_basic=True))

    requested_keys = key_name.split(",")

    if count:
        # A newline between a value and its count would split the record
        # across two lines, so fall back to a tab.
        if separator == "\n":
            separator = "\t"

        for attr in requested_keys:
            for entry in gtf.get_attr_value_list(attr, count=True):
                line = entry[0] + separator + entry[1] + "\n"
                if print_key_name:
                    line = attr + separator + line
                outputfile.write(line)
    else:
        for attr in requested_keys:
            for value in gtf.get_attr_value_list(attr):
                prefix = attr + separator if print_key_name else ""
                outputfile.write(prefix + value + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """Convert a GTF to tabulated format.

    :param inputfile: The GTF input, handed to the ``GTF`` constructor.
    :param outputfile: A writable file-like object.
    :param key: The key(s) to extract; "all" or "*" selects every attribute \
    returned by ``GTF.get_attr_list()``.
    :param no_unset: If True, skip rows in which a field equals ".".
    :param unique: If True, write each distinct row only once.
    :param no_basic: With key "all"/"*", call ``get_attr_list`` with \
    ``add_basic=False`` instead of ``add_basic=True``.
    :param accept_undef: Unless True, skip rows in which a field equals "?".
    :param select_gene_ids: Shortcut that sets key to "gene_id".
    :param select_gene_names: Shortcut that sets key to "gene_name".
    :param select_transcript_ids: Shortcut that sets key to "transcript_id".
    :param select_exon_ids: Shortcut that sets key to "exon_id".
    :param separator: The field separator used for the header and the rows.
    :param no_header: If True, do not write the header line.

    When several ``select_*`` shortcuts are set, the first one in the order
    transcript ids, gene ids, gene names, exon ids wins.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # This shortcut used to select "gene_id", which made it an exact
        # duplicate of select_gene_ids.
        key = "gene_name"
    elif select_exon_ids:
        key = "exon_id"

    # Placeholder values that cause a row to be skipped. A tuple keeps the
    # equality-based membership test of the original implementation.
    skipped_values = []
    if no_unset:
        skipped_values.append(".")
    if not accept_undef:
        skipped_values.append("?")
    skipped_values = tuple(skipped_values)

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames),
                       outputfile)

    message("Writing")

    try:
        # Rows already written; only consulted when 'unique' is requested.
        printed = set()

        for row in tab:
            if skipped_values and any(x in skipped_values
                                      for x in row.fields):
                continue

            if unique:
                row_key = tuple(row)
                if row_key in printed:
                    continue
                row.write(outputfile, separator)
                # Recorded only once the write succeeded, as before.
                printed.add(row_key)
            else:
                row.write(outputfile, separator)

    except (BrokenPipeError, IOError):
        def _void_f(*args, **kwargs):
            pass

        message("Received a boken pipe signal", type="WARNING")
        # The reader went away: silence any further write to stdout.
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)