示例#1
0
def get_attr_list(
        inputfile=None,
        outputfile=None,
        separator="\n"):
    """
    Get the list of attributes from a GTF file.

    The names returned by ``GTF.get_attr_list()`` are written to
    *outputfile* joined by *separator*. No separator (and no newline) is
    written after the last name.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Object exposing ``write()`` that receives the names.
    :param separator: String inserted between two consecutive names.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    # str.join puts the separator strictly *between* items, which is exactly
    # what the former manual "is this the last element?" counter emulated.
    # It also turns many tiny writes into a single one.
    outputfile.write(separator.join(gtf.get_attr_list()))

    # NOTE(review): the garbage collector is switched off right before the
    # handles are closed -- presumably to speed up interpreter shutdown;
    # confirm with the project conventions.
    gc.disable()
    close_properly(outputfile, inputfile)
示例#2
0
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
    Count the number of values observed for each key of a set of keys.

    One line per key is written to *outputfile*:
    ``<key> TAB <count>`` optionally followed by ``TAB <additional_text>``.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Object exposing ``write()`` that receives the counts.
    :param keys: Comma-separated key names, or "*" to use every name
        returned by ``GTF.get_attr_list()``.
    :param uniq: If true, count distinct values; otherwise count every
        occurrence.
    :param additional_text: Optional text appended as a third column.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    # A set collapses duplicates (uniq mode), a list keeps every occurrence.
    values_by_key = defaultdict(set if uniq else list)

    if keys != "*":
        key_list = keys.split(",")
    else:
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)

    for row in gtf.extract_data(keys, as_list_of_list=True):
        for name, value in zip(key_list, row):
            # '.' and '?' are placeholder values and are never counted.
            if value in ('.', '?'):
                continue
            if uniq:
                values_by_key[name].add(value)
            else:
                values_by_key[name].append(value)

    for name in key_list:
        # Keys that never received a value report 0 thanks to the defaultdict.
        line = name + "\t" + str(len(values_by_key[name]))
        if additional_text is not None:
            line = line + "\t" + additional_text
        outputfile.write(line + "\n")

    # NOTE(review): GC is disabled just before closing -- presumably to make
    # process exit faster; confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
示例#3
0
def get_attr_value_list(inputfile=None,
                        outputfile=None,
                        key_name="gene_id",
                        print_key_name=False,
                        separator="\n",
                        count=False):
    """
    Get the list of values observed for one or several attributes.

    Without *count*, one value is written per line. With *count*, each line
    holds the first and second element of every item returned by
    ``GTF.get_attr_value_list(key, count=True)``, joined by *separator*.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Object exposing ``write()`` that receives the lines.
    :param key_name: Comma-separated attribute names, or '*' to use every
        name returned by ``GTF.get_attr_list(add_basic=True)``.
    :param print_key_name: If true, prefix each line with the key name
        followed by *separator*.
    :param separator: Field separator; in count mode a "\\n" separator is
        replaced by a tabulation.
    :param count: If true, request counts alongside the values.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key_name == '*':
        key_name = ",".join(gtf.get_attr_list(add_basic=True))

    requested_keys = key_name.split(",")

    if count:
        # A newline cannot separate two columns of the same row.
        if separator == "\n":
            separator = "\t"

        for attr in requested_keys:
            for item in gtf.get_attr_value_list(attr, count=True):
                line = item[0] + separator + item[1] + "\n"
                if print_key_name:
                    line = attr + separator + line
                outputfile.write(line)
    else:
        for attr in requested_keys:
            for value in gtf.get_attr_value_list(attr):
                line = value + "\n"
                if print_key_name:
                    line = attr + separator + line
                outputfile.write(line)

    # Shared epilogue for both modes.
    # NOTE(review): GC is disabled just before closing -- presumably to make
    # process exit faster; confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
示例#4
0
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """Convert a GTF to tabulated format.

    The rows produced by ``GTF.extract_data`` for the requested key(s) are
    written to *outputfile*, optionally preceded by a header line made of
    the column names joined by *separator*.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Destination handed to the header/row writers.
    :param key: Key(s) to extract; "all" or "*" selects every attribute
        returned by ``GTF.get_attr_list``.
    :param no_unset: Skip rows in which any field equals ".".
    :param unique: Write each distinct row only once.
    :param no_basic: With key "all"/"*", call ``get_attr_list`` with
        ``add_basic=False`` instead of ``add_basic=True``.
    :param accept_undef: Keep rows in which a field equals "?" (such rows
        are skipped by default).
    :param select_gene_ids: Shortcut for ``key="gene_id"``.
    :param select_gene_names: Shortcut for ``key="gene_name"``.
    :param select_transcript_ids: Shortcut for ``key="transcript_id"``.
    :param select_exon_ids: Shortcut for ``key="exon_id"``.
    :param separator: Field separator used for the header and the rows.
    :param no_header: If true, do not write the header line.

    The ``select_*`` shortcuts override *key*; when several are set the
    precedence is transcript ids, gene ids, gene names, exon ids.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # Fix: this shortcut used to select "gene_id", making it an exact
        # duplicate of select_gene_ids.
        key = "gene_name"
    elif select_exon_ids:
        key = "exon_id"

    # Placeholder field values that cause a row to be dropped.
    rejected = []
    if no_unset:
        rejected.append(".")
    if not accept_undef:
        rejected.append("?")

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames),
                       outputfile)

    message("Writing")

    try:
        # Rows already written; only consulted in unique mode.
        seen = set()

        for row in tab:
            if unique:
                row_key = tuple(row)
                if row_key in seen:
                    continue

            # Rows are filtered *before* being recorded as seen, so a
            # rejected row never enters the set.
            if rejected and any(field in rejected for field in row.fields):
                continue

            row.write(outputfile, separator)

            if unique:
                seen.add(row_key)

    except (BrokenPipeError, IOError):
        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        # Silence any further attempt to write to or flush stdout so that
        # the downstream reader going away does not trigger more errors.
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    # NOTE(review): GC is disabled just before closing -- presumably to make
    # process exit faster; confirm.
    gc.disable()
    close_properly(outputfile, inputfile)