Пример #1
0
def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions from a GTF.

    The GTF is loaded from `inputfile`, introns are requested through
    `GTF.get_introns()` and the string form of each returned record is
    handed to `write_properly()` together with `outputfile`.

    When `by_transcript` is true, `get_introns()` is called with
    `by_transcript=True` and the naming options: `names` (comma-separated
    string, split into a list), `separator`, `intron_nb_in_name` and
    `feat_name` (the negation of `no_feature_name`). Otherwise
    `get_introns()` is called without arguments and the naming options
    are ignored.
    """

    message("Searching for intronic regions.")

    # Need to load if the gtf comes from
    # <stdin>
    annotation = GTF(inputfile, check_ensembl_format=False)

    # Naming options only apply to the per-transcript mode.
    if by_transcript:
        intron_options = dict(by_transcript=True,
                              name=names.split(","),
                              sep=separator,
                              intron_nb_in_name=intron_nb_in_name,
                              feat_name=not no_feature_name)
    else:
        intron_options = {}

    for intron in annotation.get_introns(**intron_options):
        write_properly(chomp(str(intron)), outputfile)

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter shutdown -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
Пример #2
0
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Convert a BED file into GTF-like lines.

    Each BED record becomes one tab-separated line made of: chromosome
    (prefixed with "chr" when `pygtftk.utils.ADD_CHR` equals 1), `source`,
    `ft_type`, start + 1, end, score, strand, "." and an attribute string.
    Empty strand, name and score are replaced by ".", "feature_<n>" (n being
    the 1-based rank of the record) and "0" respectively.

    The attribute string always holds `gene_id`; `transcript_id` is added
    unless `ft_type` is "gene", and `exon_id` is added when `ft_type` is
    "exon". All of them carry the record name.

    When the input comes from <stdin>, its content is first copied to a
    temporary file so that `BedTool` can be built from a file name.
    """

    message("Converting the bed file into GTF file.")

    if inputfile.name != '<stdin>':
        bed_path = inputfile.name
    else:
        # BedTool is built from a path: spool the stream to disk first.
        spool = make_tmp_file(prefix="input_bed", suffix=".bed")
        for record in inputfile:
            write_properly(chomp(str(record)), spool)

        spool.close()
        inputfile.close()
        bed_path = spool.name

    bed_obj = BedTool(bed_path)

    for rank, feat in enumerate(bed_obj, start=1):

        # Fill in the fields that were left empty in the BED record.
        if feat.strand == "":
            feat.strand = "."
        if feat.name == "":
            feat.name = "feature_" + str(rank)
        if feat.score == "":
            feat.score = "0"

        attributes = ['gene_id "' + feat.name + '";']
        if ft_type != "gene":
            attributes.append('transcript_id "' + feat.name + '";')
        if ft_type == "exon":
            attributes.append('exon_id "' + feat.name + '";')
        key_value = " ".join(attributes)

        chrom_out = feat.chrom
        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + chrom_out

        gtf_fields = [
            chrom_out,
            source,
            ft_type,
            str(feat.start + 1),
            str(feat.end),
            str(feat.score),
            feat.strand,
            ".",
            key_value,
        ]

        write_properly("\t".join(gtf_fields), outputfile)

    gc.disable()
    close_properly(outputfile)
Пример #3
0
 def __call__(self, parser, namespace, values, option_string=None):
     """
     Print a report about pygtftk and its environment, then exit.

     The report lists the pygtftk version, installation path, config and
     plugin directories, the python version and prefix, the versions of
     pandas, bedtools (as printed by `bedtools --version`), pybedtools and
     pyBigWig, and the result of `os.uname()`. The call ends with
     `sys.exit()`; the arguments are not used.
     """
     from pandas import __version__ as pandas_ver
     from pybedtools import __version__ as pybedtools_ver
     from pyBigWig import __version__ as bigwig_ver
     from pygtftk import __path__ as pygtftk_path
     from pygtftk.cmd_manager import CmdManager
     import subprocess
     from pygtftk.utils import chomp

     plugin_dir = os.path.join(CmdManager.config_dir, 'plugins')

     report = [
         '\n- pygtftk version : ' + __version__,
         '- pygtftk installation path : ' + pygtftk_path[0],
         '- pygtftk config directory : ' + CmdManager.config_dir,
         '- pygtftk personal plugins : ' + plugin_dir,
         '- python version : ' + str(sys.version_info),
         '- python path : ' + str(sys.prefix),
         '- pandas version : ' + pandas_ver,
     ]

     # The bedtools version is whatever the command prints on stdout.
     bedtools_proc = subprocess.Popen("bedtools --version",
                                      shell=True,
                                      stdout=subprocess.PIPE)
     bedtools_ver = chomp(bedtools_proc.stdout.read().decode())

     report.extend([
         '- Bedtools version : ' + bedtools_ver,
         '- pybedtools version : ' + pybedtools_ver,
         '- pyBigWig version : ' + bigwig_ver,
         '- uname : ' + str(os.uname()),
     ])

     print("\n".join(report))
     sys.exit()
Пример #4
0
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Get the midpoint coordinates for the requested feature.

    Input read from <stdin> is always handled as a GTF. Otherwise the file
    is opened with `BedTool` and handled as a GTF only when its `file_type`
    is 'gff'.

    * GTF input: features of type `ft_type` are selected and their midpoints
      are obtained from `get_midpoints()`, named after `names`
      (comma-separated keys) joined with `separator`.
    * Other input: each interval is shrunk in place around its centre. An
      odd-length interval yields a single position (length 1), an
      even-length interval has no real centre and yields the two central
      positions (length 2). `ft_type`, `names` and `separator` are unused
      in this case.
    """

    message("Loading input file...")

    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        is_gtf = region_bo.file_type == 'gff'

    if not is_gtf:

        for feat in region_bo:

            span = feat.end - feat.start
            half = span // 2

            if span % 2:
                # Odd length, a single central position exists.
                # e.g 10-13 (zero based) -> 11-13 one based
                # midpoint is 12 (one-based) -> 11-12 (zero based)
                # e.g 949-1100 (zero based) -> 950-1100 one based
                # midpoint is 1025 (one-based) -> 1024-1025 (zero based)
                feat.end = feat.start + half + 1
                feat.start = feat.end - 1
            else:
                # Even length. No real center. Take both central positions.
                # e.g 10-14 (zero based) -> 11-14 one based
                # midpoint is 12-13 (one-based) -> 11-13 (zero based)
                feat.start = feat.start + half - 1
                feat.end = feat.start + 2

            outputfile.write(str(feat))

    else:

        gtf = GTF(inputfile.name, check_ensembl_format=False)

        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.get_midpoints(name=names.split(","),
                                         sep=separator)

        for record in bed_obj:
            write_properly(chomp(str(record)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Пример #5
0
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Extract intergenic regions.

    The GTF is loaded from `inputfile` and the regions are obtained from
    `GTF.get_intergenic(chrom_info)`. Each region is renamed "region_<n>",
    n being its 1-based rank in the returned collection, before its string
    form is handed to `write_properly()` together with `outputfile`.
    """

    message("Searching for intergenic regions.")

    annotation = GTF(inputfile)

    regions = annotation.get_intergenic(chrom_info)

    for rank, region in enumerate(regions, start=1):
        region.name = "region_" + str(rank)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Пример #6
0
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size of the features enclosed in the GTF.

    `ft_type` must be one of the feature types found in the GTF,
    'mature_rna' or "*"; anything else is reported as an error.

    * With `bed` set, records are written in BED form with the size stored
      in the score column. The name is built from `names` (comma-separated
      keys) joined with `separator`.
    * Otherwise a GTF is written with `key_name` as a new key holding the
      size. For regular feature types, lines whose feature differs from
      `ft_type` receive "?" (no line is excluded when `ft_type` is "*").

    For 'mature_rna' the sizes come from `GTF.get_transcript_size()` and
    are attached to transcript features. For any other type the size is
    computed from the start/end coordinates.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    known_features = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type != "*" and ft_type not in known_features:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if not bed:

            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

        else:
            # transcript_id is put first in the name so that it can be
            # used to look the size up, then it is stripped again.
            transcripts = gtf.select_by_key("feature", 'transcript')
            bed_obj = transcripts.to_bed(['transcript_id'] + names,
                                         add_feature_type=False,
                                         sep=separator,
                                         more_name=['mature_rna'])

            for record in bed_obj:
                name_parts = record.name.split(separator)
                tx_id = name_parts.pop(0)
                record.score = tx_size[tx_id]
                record.name = separator.join(name_parts)
                write_properly(chomp(str(record)), outputfile)

    elif bed:

        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.to_bed(name=names,
                                  sep=separator,
                                  add_feature_type=True)

        for record in bed_obj:
            record.score = str(record.end - record.start)
            write_properly(chomp(str(record)), outputfile)

    else:

        # One value per GTF line is written to a temporary file which is
        # then added to the GTF as a new attribute column.
        tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        for row in rows:
            if ft_type == "*" or row[0] == ft_type:
                value = str(int(row[2]) - int(row[1]) + 1)
            else:
                value = "?"
            tmp_file.write(value + "\n")

        tmp_file.close()

        gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                      gc_off=True)

    close_properly(outputfile, inputfile)
Пример #7
0
def col_from_tab(inputfile=None,
                 outputfile=None,
                 columns=None,
                 invert_match=False,
                 no_header=False,
                 unique=False,
                 more_col=None,
                 output_separator="\t",
                 separator="\t"):
    """
    Select columns from a tabulated file based on their names.

    The first line of `inputfile` is taken as the header. `columns` is a
    comma-separated list of header names: they are the columns to keep, in
    the requested order, or the columns to drop when `invert_match` is set.
    A name missing from the header is reported as an error.

    `more_col`, when given, has the form "name:value" and appends a constant
    column to every line. The header is written unless `no_header` is set.
    With `unique`, an output line is written only the first time it is
    produced. Fields are split on `separator` and joined with
    `output_separator`.
    """

    already_written = set()

    if re.search(",", columns):
        columns = columns.split(",")
    else:
        columns = [columns]

    more_col_name = more_col_value = None
    if more_col:
        more_col_name, more_col_value = more_col.split(":")

    for line_nb, raw_line in enumerate(inputfile):

        fields = chomp(raw_line).split(separator)

        if line_nb > 0:
            # Data line: pick the positions computed from the header.
            selected = [fields[k] for k in pos_list]
            if more_col:
                selected.append(more_col_value)
            out = output_separator.join(selected)

            if unique:
                if out in already_written:
                    continue
                already_written.add(out)

            write_properly(out, outputfile)
            continue

        # Header line: translate the column names into positions.
        pos_list = list(range(len(fields))) if invert_match else []

        for col_name in columns:
            if col_name not in fields:
                message("Column " + col_name + " not found",
                        type="ERROR")
            elif invert_match:
                pos_list.remove(fields.index(col_name))
            else:
                pos_list.append(fields.index(col_name))

        if not no_header:
            header_fields = [fields[k] for k in pos_list]
            if more_col:
                header_fields.append(more_col_name)
            write_properly(output_separator.join(header_fields), outputfile)
Пример #8
0
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    """
    Build a control gene list whose signal matches a reference gene list.

    For every reference gene found in the signal file, the not yet selected
    non-reference gene with the closest value is added to the control list.
    Both lists are written as tab-separated files in `out_dir`
    ("reference_list.txt" and "control_list.txt") and a diagnostic violin
    plot of the two sets is saved.

    :param in_file: Open tab-separated file with exactly two columns per \
    line, the second one being numeric. It is first iterated for validation, \
    then read again by name with pandas.
    :param out_dir: Output directory handed to `make_outdir_and_file()`.
    :param reference_gene_file: Open tab-separated file without header; its \
    first column holds the reference gene names (duplicates are dropped).
    :param log2: Whether values (after adding `pseudo_count`) are log2 \
    transformed. Zero or negative values are then reported as an error.
    :param page_width: Width handed to the plot `save()` method.
    :param page_height: Height handed to the plot `save()` method.
    :param user_img_file: Optional open file replacing the default diagram \
    file; its name must end with `page_format`.
    :param page_format: Extension of the diagram file.
    :param pseudo_count: Value added to every expression value.
    :param set_colors: Comma-separated string holding exactly two colors \
    (Reference, Control). Each must be a key of `mcolors.cnames` or be \
    accepted by `is_hex_color()`.
    :param dpi: Resolution handed to the plot `save()` method.
    :param rug: Add a rug layer to the plot.
    :param jitter: Add a jitter layer to the plot.
    :param skip_first: Skip the first line of `in_file` (e.g. a header).
    """
    # -------------------------------------------------------------------------
    #
    # Check in_file content
    #
    # -------------------------------------------------------------------------

    for p, line in enumerate(in_file):

        line = chomp(line)
        line = line.split("\t")

        # Exactly two columns are expected. Lines with a single column are
        # rejected here too, otherwise line[1] below raises an IndexError.
        if len(line) != 2:
            message("Need a two columns file.",
                    type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of input file"
            msg += " contains non numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Can not log transform negative/zero values. Add a pseudo-count.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check colors
    #
    # -------------------------------------------------------------------------

    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Preparing output files
    #
    # -------------------------------------------------------------------------

    # Preparing pdf file name
    file_out_list = make_outdir_and_file(out_dir, ["control_list.txt",
                                                   "reference_list.txt",
                                                   "diagnostic_diagrams." + page_format],
                                         force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:

        # The user-supplied file replaces the default diagram file.
        os.unlink(img_file.name)
        img_file = user_img_file

        if not img_file.name.endswith(page_format):
            msg = "Image format should be: {f}. Please fix.".format(f=page_format)
            message(msg, type="ERROR")

        test_path = os.path.abspath(img_file.name)
        test_path = os.path.dirname(test_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

    # -------------------------------------------------------------------------
    #
    # Read the reference list
    #
    # -------------------------------------------------------------------------

    try:
        reference_genes = pd.read_csv(reference_gene_file.name, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(columns={reference_genes.columns.values[0]: 'gene'}, inplace=True)

    # -------------------------------------------------------------------------
    #
    # Delete duplicates
    #
    # -------------------------------------------------------------------------

    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    #
    # Read expression data and add the pseudo_count
    #
    # -------------------------------------------------------------------------

    if skip_first:
        exp_data = pd.read_csv(in_file.name, sep="\t",
                               header=None, index_col=None,
                               skiprows=[0], names=['exprs'])
    else:

        exp_data = pd.read_csv(in_file.name, sep="\t", names=['exprs'], index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    #
    # log transformation
    #
    # -------------------------------------------------------------------------

    ylabel = 'Expression'

    if log2:
        # Negative values must be rejected as well as zeros: np.log2 would
        # silently turn them into NaN.
        if (exp_data.exprs.values <= 0).any():
            message("Can't use log transformation on zero or negative values. Use -p.",
                    type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    #
    # Are reference gene found in control list
    #
    # -------------------------------------------------------------------------

    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    # Reference genes that are present in the expression data index
    reference_genes_found = [x for x in reference_genes['gene'] if x in exp_data.index]

    msg = "Found %d genes of the reference in the provided signal file" % len(reference_genes_found)
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from reference file where not found in signal file (n=%d)." % len(not_found), type="ERROR")
        else:
            message("List of reference genes not found :%s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    #
    # Search for genes with matched signal
    #
    # -------------------------------------------------------------------------

    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough element to perform selection. Exiting", type="ERROR")

    for i in reference_genes_found:
        # Neither reference genes nor already selected controls can be picked.
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        # Keep the remaining gene whose value is the closest to gene i.
        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(diff.loc[np.setdiff1d(diff.index, not_candidates)].idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    #
    # Prepare a dataframe for plotting
    #
    # -------------------------------------------------------------------------

    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()], index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    #
    # Diagnostic plots
    #
    # -------------------------------------------------------------------------

    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))

    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))

    p += geom_violin(color=None)

    p += xlab('Gene sets') + ylab(ylabel)

    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()

    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warning off. Both pandas and plotnine use warnings for deprecated
    # functions. They are silenced while the diagram is saved, although this
    # is not a fully satisfying solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    #
    # Saving
    #
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()
        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name, width=page_width, height=page_height, dpi=dpi, limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # write results
    #
    # -------------------------------------------------------------------------

    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(control_file.name, sep="\t")
Пример #9
0
def get_5p_3p_coords(inputfile=None,
                     outputfile=None,
                     ft_type="transcript",
                     names="transcript_id",
                     separator="|",
                     more_names='',
                     transpose=0,
                     invert=False,
                     explicit=False):
    """
    Get the 5p or 3p coordinate for each feature (e.g TSS or TTS for a transcript).

    The 5' end is computed unless `invert` is set, in which case the 3' end
    is computed. `names` is a comma-separated list of keys used to name the
    records ("*" selects every attribute key returned for transcript
    features), joined with `separator`. `more_names` is a comma-separated
    string, split and handed over as `more_name`.

    When `transpose` is not zero, the coordinates are shifted by `transpose`
    for '+' features and by `-transpose` for '-' features before being
    written as six tab-separated columns.
    """

    more_names = [] if more_names is None else more_names.split(',')

    side = "3'" if invert else "5'"
    message("Computing " + side + " coordinates of '" + ft_type + "'.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    if names == "*":
        transcripts = gtf.select_by_key("feature", "transcript")
        nms = transcripts.get_attr_list(add_basic=False)
    else:
        nms = names.split(",")

    get_end = gtf.get_3p_end if invert else gtf.get_5p_end

    bed_obj = get_end(feat_type=ft_type,
                      name=nms,
                      sep=separator,
                      more_name=more_names,
                      explicit=explicit)

    if not len(bed_obj):
        message("Requested feature could not be found. Use convert_ensembl maybe.",
                type="ERROR")

    if transpose == 0:
        for record in bed_obj:
            write_properly(chomp(str(record)), outputfile)
    else:
        for record in bed_obj:
            # The shift follows the orientation of the feature.
            if record.strand == "+":
                shift = transpose
            elif record.strand == "-":
                shift = -transpose
            else:
                shift = None

            if shift is None:
                # NOTE(review): a feature whose strand is neither '+' nor
                # '-' produces an empty output line -- confirm this is
                # intended.
                fields = []
            else:
                fields = [record.chrom,
                          str(record.start + shift),
                          str(record.end + shift),
                          record.name,
                          record.score,
                          record.strand]

            outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)