示例#1
0
        m1_matrix, m2_matrix, similarity_matrix_sub = subset_matrix(
            similarity_matrix, motif_names_1, motif_names_2)

        col_linkage = linkage(ssd.squareform(m1_matrix)) if (
            len(motif_names_1) > 1 and len(motif_names_2) > 1) else None
        row_linkage = linkage(ssd.squareform(m2_matrix)) if (
            len(motif_names_1) > 1 and len(motif_names_2) > 1) else None

        #Plot similarity heatmap between file1 and file2
        plot_heatmap(similarity_matrix_sub, pdf_out, col_linkage, row_linkage,
                     args.dpi, x_label, y_label, args.color, args.ncc,
                     args.nrc, args.zscore)

    # ClusterMotifs finished
    logger.end()


#--------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":

    # Build the ClusterMotifs command-line parser and read the arguments
    parser = add_motifclust_arguments(argparse.ArgumentParser())
    args = parser.parse_args()

    # Called without any command-line arguments: show the help text and stop
    if not sys.argv[1:]:
        parser.print_help()
        sys.exit()

    run_motifclust(args)
示例#2
0
def run_motifclust(args):
    """Cluster the motifs from all input files and write the result files.

    Steps performed, in order:
      1. Check the input arguments/files and create the output folder for
         consensus logos.
      2. Verify that 'gimmemotifs' can be imported and is compatible with
         the chosen distance method.
      3. Read the motifs from every file in ``args.motifs``.
      4. Write motif statistics, cluster the motifs, and write the cluster
         overview, the similarity matrix and a dendrogram.
      5. Build one consensus motif per cluster and write motif file + logos.
      6. Plot a similarity heatmap for all motifs and one heatmap for each
         pair of input files.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed ClusterMotifs arguments. Attributes read here: ``motifs``,
        ``outdir``, ``prefix``, ``verbosity``, ``dist_method``,
        ``threshold``, ``clust_method``, ``type``, ``dpi``, ``cons_format``
        and ``color``. NOTE: ``args.nrc``, ``args.ncc`` and ``args.zscore``
        are overwritten by this function before plotting.

    Output files (all prefixed with ``<outdir>/<prefix>``)
    ------------------------------------------------------
    ``_stats_motifs.txt``, ``_clusters.yml``, ``_matrix.txt``,
    ``_dendrogram.<type>``, ``_consensus_motifs.<cons_format>``,
    ``_heatmap_all.<type>``, ``_heatmap<i>.<type>`` and the logo images in
    ``<outdir>/consensus_motifs_img``.

    Raises
    ------
    SystemExit
        With status 1 if 'gimmemotifs' is missing, fails to import for a
        known reason, or does not support the requested ``dist_method``.
    ImportError
        Re-raised if 'gimmemotifs' fails to import for an unrecognised reason.
    """

    ###### Check input arguments ######
    check_required(args, ["motifs"])  #Check input arguments
    check_files([args.motifs])  #Check if files exist
    out_cons_img = os.path.join(args.outdir, "consensus_motifs_img")
    make_directory(out_cons_img)
    out_prefix = os.path.join(args.outdir, args.prefix)

    ###### Create logger and write argument overview ######
    logger = TobiasLogger("ClusterMotifs", args.verbosity)
    logger.begin()

    parser = add_motifclust_arguments(argparse.ArgumentParser())
    logger.arguments_overview(parser, args)
    #logger.output_files([])

    #----------------------------------------- Check for gimmemotifs ----------------------------------------#

    try:
        # The imported names are not used directly; the imports only verify
        # that gimmemotifs is installed and can be loaded
        from gimmemotifs.motif import Motif
        from gimmemotifs.comparison import MotifComparer
        sns.set_style(
            "ticks"
        )  #set style back to ticks, as this is set globally during gimmemotifs import

    except ModuleNotFoundError:
        logger.error(
            "MotifClust requires the python package 'gimmemotifs'. You can install it using 'pip install gimmemotifs' or 'conda install gimmemotifs'."
        )
        sys.exit(1)

    except ImportError as e:  #gimmemotifs was installed, but there was an error during import

        pandas_version = pd.__version__
        python_version = platform.python_version()

        if e.name == "collections" and (
                version.parse(python_version) >= version.parse("3.10.0")
        ):  #collections error from norns=0.1.5 and from other packages on python=3.10
            logger.error(
                "Due to package dependency errors, 'TOBIAS ClusterMotifs' is not available for python>=3.10. Current python version is '{0}'. Please downgrade python in order to use this tool."
                .format(python_version))
            sys.exit(1)

        elif e.name == "pandas.core.indexing" and (
                version.parse(pandas_version) >= version.parse("1.3.0")):
            logger.error(
                "Package 'gimmemotifs' version < 0.17.0 requires 'pandas' version < 1.3.0. Current pandas version is {0}."
                .format(pandas_version))
            sys.exit(1)

        else:  #other import error
            logger.error(
                "Tried to import package 'gimmemotifs' but failed with error: '{0}'"
                .format(repr(e)))
            logger.error("Traceback:")
            raise  #re-raise the active exception with its original traceback

    except Exception as e:
        logger.error(
            "Tried to import package 'gimmemotifs' but failed with error: '{0}'"
            .format(repr(e)))
        logger.error(
            "Please check that 'gimmemotifs' was successfully installed.")
        sys.exit(1)

    #Check gimmemotifs version vs. metric given
    import gimmemotifs
    gimme_version = gimmemotifs.__version__
    if gimme_version == "0.17.0" and args.dist_method in ["pcc", "akl"]:
        logger.warning(
            "The dist_method given ('{0}') is invalid for gimmemotifs version 0.17.0. Please choose another --dist_method. See also: https://github.com/vanheeringen-lab/gimmemotifs/issues/243"
            .format(args.dist_method))
        sys.exit(1)

    #---------------------------------------- Reading motifs from file(s) -----------------------------------#
    logger.info("Reading input file(s)")

    motif_list = MotifList()  #list containing OneMotif objects
    motif_dict = {}  #dictionary containing separate motif lists per file

    if sys.version_info < (
            3, 7, 0):  # workaround for deepcopy of compiled regex with python version < 3.7
        copy._deepcopy_dispatch[type(re.compile(''))] = lambda r, _: r

    for f in args.motifs:
        logger.debug("Reading {0}".format(f))

        #Read the content within a context manager so the file handle is closed again
        with open(f) as motif_handle:
            motif_format = get_motif_format(motif_handle.read())
        sub_motif_list = MotifList().from_file(f)  #MotifList object

        logger.stats("- Read {0} motifs from {1} (format: {2})".format(
            len(sub_motif_list), f, motif_format))

        motif_list.extend(sub_motif_list)
        motif_dict[f] = sub_motif_list

    #Check whether ids are unique
    #TODO

    #---------------------------------------- Motif stats ---------------------------------------------------#
    logger.info("Creating matrix statistics")

    gimmemotifs_list = [
        motif.get_gimmemotif().gimme_obj for motif in motif_list
    ]

    #Stats for all motifs
    full_motifs_out = out_prefix + "_stats_motifs.txt"
    motifs_stats = get_motif_stats(gimmemotifs_list)
    write_motif_stats(motifs_stats, full_motifs_out)

    #---------------------------------------- Motif clustering ----------------------------------------------#
    logger.info("Clustering motifs")

    clusters = motif_list.cluster(threshold=args.threshold,
                                  metric=args.dist_method,
                                  clust_method=args.clust_method)
    logger.stats("- Identified {0} clusters".format(len(clusters)))

    #Write out overview of clusters (cluster id -> ids of the member motifs)
    cluster_dict = {
        cluster_id: [
            motif.get_gimmemotif().gimme_obj.id
            for motif in clusters[cluster_id]
        ]
        for cluster_id in clusters
    }
    cluster_f = out_prefix + "_" + "clusters.yml"
    logger.info("- Writing clustering to {0}".format(cluster_f))
    write_yaml(cluster_dict, cluster_f)

    # Save similarity matrix to file
    matrix_out = out_prefix + "_matrix.txt"
    logger.info("- Saving similarity matrix to the file: " + str(matrix_out))
    motif_list.similarity_matrix.to_csv(matrix_out, sep='\t')

    #Plot dendrogram
    logger.info("Plotting clustering dendrogram")
    dendrogram_f = out_prefix + "_" + "dendrogram." + args.type  #plot format pdf/png
    plot_dendrogram(motif_list.similarity_matrix.columns,
                    motif_list.linkage_mat, 12, dendrogram_f, "Clustering",
                    args.threshold, args.dpi)

    #---------------------------------------- Consensus motif -----------------------------------------------#
    logger.info("Building consensus motif for each cluster")

    consensus_motifs = MotifList()
    for cluster_id in clusters:
        consensus = clusters[cluster_id].create_consensus(
            metric=args.dist_method
        )  #MotifList object with create_consensus method
        consensus.id = cluster_id if len(
            clusters[cluster_id]) > 1 else clusters[cluster_id][
                0].id  #set original motif id if cluster length = 1

        consensus_motifs.append(consensus)

    #Write out consensus motif file
    out_f = out_prefix + "_consensus_motifs." + args.cons_format
    logger.info("- Writing consensus motifs to: {0}".format(out_f))
    consensus_motifs.to_file(out_f, args.cons_format)

    #Create logo plots (out_cons_img was created at the top of the function)
    logger.info(
        "- Making logo plots for consensus motifs (output folder: {0})".format(
            out_cons_img))
    for motif in consensus_motifs:
        filename = os.path.join(out_cons_img,
                                motif.id + "_consensus." + args.type)
        motif.logo_to_file(filename)

    #---------------------------------------- Plot heatmap --------------------------------------------------#

    logger.info("Plotting similarity heatmap")
    logger.info(
        "Note: Can take a while for --type=pdf. Try \"--type png\" for speed up."
    )
    #Fixed plotting options; these overwrite any values already set on args
    args.nrc = False
    args.ncc = False
    args.zscore = "None"
    clust_linkage = motif_list.linkage_mat
    similarity_matrix = motif_list.similarity_matrix

    pdf_out = out_prefix + "_heatmap_all." + args.type
    x_label = "All motifs"
    y_label = "All motifs"
    plot_heatmap(similarity_matrix, pdf_out, clust_linkage, clust_linkage,
                 args.dpi, x_label, y_label, args.color, args.ncc, args.nrc,
                 args.zscore)

    # Plot heatmaps for each combination of motif files
    comparisons = itertools.combinations(args.motifs, 2)
    for i, (motif_file_1, motif_file_2) in enumerate(comparisons):

        pdf_out = out_prefix + "_heatmap" + str(i) + "." + args.type
        logger.info("Plotting comparison of {0} and {1} motifs to the file ".
                    format(motif_file_1, motif_file_2) + str(pdf_out))

        x_label, y_label = motif_file_1, motif_file_2

        #Create subset of matrices for row/col clustering
        motif_names_1 = [
            motif.get_gimmemotif().gimme_obj.id
            for motif in motif_dict[motif_file_1]
        ]
        motif_names_2 = [
            motif.get_gimmemotif().gimme_obj.id
            for motif in motif_dict[motif_file_2]
        ]

        m1_matrix, m2_matrix, similarity_matrix_sub = subset_matrix(
            similarity_matrix, motif_names_1, motif_names_2)

        #Linkage is only calculated if both files contain more than one motif
        can_cluster = len(motif_names_1) > 1 and len(motif_names_2) > 1
        col_linkage = linkage(
            ssd.squareform(m1_matrix)) if can_cluster else None
        row_linkage = linkage(
            ssd.squareform(m2_matrix)) if can_cluster else None

        #Plot similarity heatmap between file1 and file2
        plot_heatmap(similarity_matrix_sub, pdf_out, col_linkage, row_linkage,
                     args.dpi, x_label, y_label, args.color, args.ncc,
                     args.nrc, args.zscore)

    # ClusterMotifs finished
    logger.end()
示例#3
0
def run_motifclust(args):
    """Cluster the motifs from all input files and write the result files.

    Steps performed, in order:
      1. Check the input arguments/files and create the output folder for
         consensus logos.
      2. Verify that 'gimmemotifs' can be imported.
      3. Read the motifs from every file in ``args.motifs``.
      4. Write motif statistics, cluster the motifs, and write the cluster
         overview, the similarity matrix and a dendrogram.
      5. Build one consensus motif per cluster and write motif file + logos.
      6. Plot a similarity heatmap for all motifs and one heatmap for each
         pair of input files.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed ClusterMotifs arguments. Attributes read here: ``motifs``,
        ``outdir``, ``prefix``, ``verbosity``, ``dist_method``,
        ``threshold``, ``clust_method``, ``type``, ``dpi``, ``cons_format``
        and ``color``. NOTE: ``args.nrc``, ``args.ncc`` and ``args.zscore``
        are overwritten by this function before plotting.

    Output files (all prefixed with ``<outdir>/<prefix>``)
    ------------------------------------------------------
    ``_stats_motifs.txt``, ``_clusters.yml``, ``_matrix.txt``,
    ``_dendrogram.<type>``, ``_consensus_motifs.<cons_format>``,
    ``_heatmap_all.<type>``, ``_heatmap<i>.<type>`` and the logo images in
    ``<outdir>/consensus_motifs_img``.

    Raises
    ------
    SystemExit
        With status 1 if 'gimmemotifs' cannot be imported.
    """

    ###### Check input arguments ######
    check_required(args, ["motifs"])  #Check input arguments
    check_files([args.motifs])  #Check if files exist
    out_cons_img = os.path.join(args.outdir, "consensus_motifs_img")
    make_directory(out_cons_img)
    out_prefix = os.path.join(args.outdir, args.prefix)

    ###### Create logger and write argument overview ######
    logger = TobiasLogger("ClusterMotifs", args.verbosity)
    logger.begin()

    parser = add_motifclust_arguments(argparse.ArgumentParser())
    logger.arguments_overview(parser, args)
    #logger.output_files([])

    #----------------------------------------- Check for gimmemotifs ----------------------------------------#

    try:
        # The imported names are not used directly; the imports only verify
        # that gimmemotifs is installed and can be loaded
        from gimmemotifs.motif import Motif
        from gimmemotifs.comparison import MotifComparer
        sns.set_style(
            "ticks"
        )  #set style back to ticks, as this is set globally during gimmemotifs import
    except Exception as e:
        # 'Exception' (not a bare except) so that KeyboardInterrupt/SystemExit
        # are not swallowed and misreported as a missing package
        logger.error(
            "MotifClust requires the python package 'gimmemotifs'. You can install it using 'pip install gimmemotifs' or 'conda install gimmemotifs'."
        )
        logger.error("Import failed with error: '{0}'".format(repr(e)))
        sys.exit(1)  #non-zero exit status to signal the failure

    #---------------------------------------- Reading motifs from file(s) -----------------------------------#
    logger.info("Reading input file(s)")

    motif_list = MotifList()  #list containing OneMotif objects
    motif_dict = {}  #dictionary containing separate motif lists per file

    if sys.version_info < (
            3, 7, 0):  # workaround for deepcopy of compiled regex with python version < 3.7
        copy._deepcopy_dispatch[type(re.compile(''))] = lambda r, _: r

    for f in args.motifs:
        logger.debug("Reading {0}".format(f))

        #Read the content within a context manager so the file handle is closed again
        with open(f) as motif_handle:
            motif_format = get_motif_format(motif_handle.read())
        sub_motif_list = MotifList().from_file(f)  #MotifList object

        logger.stats("- Read {0} motifs from {1} (format: {2})".format(
            len(sub_motif_list), f, motif_format))

        motif_list.extend(sub_motif_list)
        motif_dict[f] = sub_motif_list

    #Check whether ids are unique
    #TODO

    #---------------------------------------- Motif stats ---------------------------------------------------#
    logger.info("Creating matrix statistics")

    gimmemotifs_list = [
        motif.get_gimmemotif().gimme_obj for motif in motif_list
    ]

    #Stats for all motifs
    full_motifs_out = out_prefix + "_stats_motifs.txt"
    motifs_stats = get_motif_stats(gimmemotifs_list)
    write_motif_stats(motifs_stats, full_motifs_out)

    #---------------------------------------- Motif clustering ----------------------------------------------#
    logger.info("Clustering motifs")

    clusters = motif_list.cluster(threshold=args.threshold,
                                  metric=args.dist_method,
                                  clust_method=args.clust_method)
    logger.stats("- Identified {0} clusters".format(len(clusters)))

    #Write out overview of clusters (cluster id -> ids of the member motifs)
    cluster_dict = {
        cluster_id: [
            motif.get_gimmemotif().gimme_obj.id
            for motif in clusters[cluster_id]
        ]
        for cluster_id in clusters
    }
    cluster_f = out_prefix + "_" + "clusters.yml"
    logger.info("- Writing clustering to {0}".format(cluster_f))
    write_yaml(cluster_dict, cluster_f)

    # Save similarity matrix to file
    matrix_out = out_prefix + "_matrix.txt"
    logger.info("- Saving similarity matrix to the file: " + str(matrix_out))
    motif_list.similarity_matrix.to_csv(matrix_out, sep='\t')

    #Plot dendrogram
    logger.info("Plotting clustering dendrogram")
    dendrogram_f = out_prefix + "_" + "dendrogram." + args.type  #plot format pdf/png
    plot_dendrogram(motif_list.similarity_matrix.columns,
                    motif_list.linkage_mat, 12, dendrogram_f, "Clustering",
                    args.threshold, args.dpi)

    #---------------------------------------- Consensus motif -----------------------------------------------#
    logger.info("Building consensus motif for each cluster")

    consensus_motifs = MotifList()
    for cluster_id in clusters:
        consensus = clusters[cluster_id].create_consensus(
        )  #MotifList object with create_consensus method
        consensus.id = cluster_id if len(
            clusters[cluster_id]) > 1 else clusters[cluster_id][
                0].id  #set original motif id if cluster length = 1

        consensus_motifs.append(consensus)

    #Write out consensus motif file
    out_f = out_prefix + "_consensus_motifs." + args.cons_format
    logger.info("- Writing consensus motifs to: {0}".format(out_f))
    consensus_motifs.to_file(out_f, args.cons_format)

    #Create logo plots (out_cons_img was created at the top of the function)
    logger.info(
        "- Making logo plots for consensus motifs (output folder: {0})".format(
            out_cons_img))
    for motif in consensus_motifs:
        filename = os.path.join(out_cons_img,
                                motif.id + "_consensus." + args.type)
        motif.logo_to_file(filename)

    #---------------------------------------- Plot heatmap --------------------------------------------------#

    logger.info("Plotting similarity heatmap")
    logger.info(
        "Note: Can take a while for --type=pdf. Try \"--type png\" for speed up."
    )
    #Fixed plotting options; these overwrite any values already set on args
    args.nrc = False
    args.ncc = False
    args.zscore = "None"
    clust_linkage = motif_list.linkage_mat
    similarity_matrix = motif_list.similarity_matrix

    pdf_out = out_prefix + "_heatmap_all." + args.type
    x_label = "All motifs"
    y_label = "All motifs"
    plot_heatmap(similarity_matrix, pdf_out, clust_linkage, clust_linkage,
                 args.dpi, x_label, y_label, args.color, args.ncc, args.nrc,
                 args.zscore)

    # Plot heatmaps for each combination of motif files
    comparisons = itertools.combinations(args.motifs, 2)
    for i, (motif_file_1, motif_file_2) in enumerate(comparisons):

        pdf_out = out_prefix + "_heatmap" + str(i) + "." + args.type
        logger.info("Plotting comparison of {0} and {1} motifs to the file ".
                    format(motif_file_1, motif_file_2) + str(pdf_out))

        x_label, y_label = motif_file_1, motif_file_2

        #Create subset of matrices for row/col clustering
        motif_names_1 = [
            motif.get_gimmemotif().gimme_obj.id
            for motif in motif_dict[motif_file_1]
        ]
        motif_names_2 = [
            motif.get_gimmemotif().gimme_obj.id
            for motif in motif_dict[motif_file_2]
        ]

        m1_matrix, m2_matrix, similarity_matrix_sub = subset_matrix(
            similarity_matrix, motif_names_1, motif_names_2)

        #Linkage is only calculated if both files contain more than one motif
        can_cluster = len(motif_names_1) > 1 and len(motif_names_2) > 1
        col_linkage = linkage(
            ssd.squareform(m1_matrix)) if can_cluster else None
        row_linkage = linkage(
            ssd.squareform(m2_matrix)) if can_cluster else None

        #Plot similarity heatmap between file1 and file2
        plot_heatmap(similarity_matrix_sub, pdf_out, col_linkage, row_linkage,
                     args.dpi, x_label, y_label, args.color, args.ncc,
                     args.nrc, args.zscore)

    # ClusterMotifs finished
    logger.end()