def combine_replicates(TaXon_table_xlsx, suffix_list, path_to_outdirs):
    """Merge the technical replicates of each sample in a TaXon table.

    Every sample column is assumed to be named "<base>_<suffix>"; for each
    base name the read counts of its replicate columns (base + suffix for
    every suffix in suffix_list) are summed into a single "<base>_comb"
    column and the replicate columns are dropped.  The merged table is
    written to <path_to_outdirs>/TaXon_tables/<stem>_derep.xlsx and the
    action is logged.

    Args:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        suffix_list: replicate suffixes, e.g. ["a", "b"].
        path_to_outdirs: project output directory.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)

    # create the output file next to the other TaXon tables of this project
    output_file = Path(path_to_outdirs) / "TaXon_tables" / (
        TaXon_table_file.stem + "_derep.xlsx")
    df = pd.read_excel(pd.ExcelFile(TaXon_table_xlsx), 'TaXon table', header=0)

    # the first 10 columns are taxonomy/metadata; the rest are sample columns
    sample_names = df.columns[10:]

    # strip the trailing "_<suffix>" part to obtain each sample's base name
    unique_sample_names_list = [
        "_".join(sample.split("_")[0:-1]) for sample in sample_names
    ]
    unique_sample_names_set = sorted(set(unique_sample_names_list))

    for sample in unique_sample_names_set:
        # expected replicate column names for this sample
        replicate_names_list = [sample + "_" + suffix for suffix in suffix_list]
        combined = sample + "_comb"

        try:
            df[combined] = df[replicate_names_list].sum(axis=1)
            df = df.drop(replicate_names_list, axis=1)
        except KeyError:
            # at least one replicate column does not exist for this sample
            print("Warning! No replicates found for:    " + sample)

    df.to_excel(output_file, index=False, sheet_name='TaXon table')
    closing_text = "Taxon table is found under:\n" + '/'.join(
        str(output_file).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("replicate merging", "processing", TaXon_table_file.name,
            output_file.name, "nan", path_to_outdirs)
def convert_to_presence_absence(TaXon_table_xlsx, path_to_outdirs):
    """Convert the read counts of a TaXon table to presence/absence (1/0).

    The first 10 columns (taxonomy/metadata) are kept unchanged; every
    read count != 0 becomes 1, 0 stays 0.  The converted table is written
    to <path_to_outdirs>/TaXon_tables/<stem>_pa.xlsx and the action is
    logged.

    Args:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        path_to_outdirs: project output directory.
    """

    from pathlib import Path
    import PySimpleGUI as sg
    import pandas as pd

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0)

    # create presence/absence table: keep metadata, binarize the read counts
    presence_absence_list = [
        row[0:10] + [1 if reads != 0 else 0 for reads in row[10:]]
        for row in TaXon_table_df.values.tolist()
    ]
    df_pa = pd.DataFrame(presence_absence_list,
                         columns=TaXon_table_df.columns.tolist())

    output_file = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem + "_pa.xlsx")
    df_pa.to_excel(output_file, index=False, sheet_name='TaXon table')

    # grammar fix in the user-facing message ("tables is" -> "table is")
    closing_text = "Presence absence table is found in: " + str(path_to_outdirs) + "/TaXon_tables/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("presence absence conversion", "processing", TaXon_table_xlsx.name, output_file.name, "nan", path_to_outdirs)
# (removed extraction artifact: stray snippet separator "示例#3" / "0" — not program code)
def venn_diagram(file_a, file_b, file_c, venn_diagram_name, path_to_outdirs,
                 clustering_unit):
    """Compare the taxa of two or three TaXon tables with Venn diagrams.

    For each taxonomic level (Phylum ... Species plus the clustering unit,
    e.g. OTUs) the unique, non-"nan" taxa of the input tables are
    intersected.  A venn2 (when file_c is False) or venn3 diagram is saved
    as a PDF per level, and the underlying taxon sets are written to one
    Excel file (Venn_comparison_results.xlsx), all under
    <path_to_outdirs>/Venn_diagrams/<venn_diagram_name>/.

    Args:
        file_a, file_b: paths to TaXon tables (.xlsx).
        file_c: third TaXon table path, False for a two-way comparison,
            or '' if a third file was expected but not selected.
        venn_diagram_name: name of the output sub-folder.
        path_to_outdirs: project output directory.
        clustering_unit: label of the sequence clustering unit
            (e.g. "OTUs"), used as the last comparison level.

    Raises:
        RuntimeError: when the user cancels the progress bar window or
            (three-way mode) no third file was provided.
    """

    import os
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2
    from matplotlib_venn import venn3
    from matplotlib.pyplot import plot, ion, show
    from pathlib import Path

    file_a = Path(file_a)
    file_b = Path(file_b)

    # font size for the set and subset labels of the Venn diagrams
    venn_font = 20

    # NOTE: "== False" (not simple falsiness) is deliberate — an empty
    # string means a third file was expected but missing (else branch).
    if file_c == False:
        ############################################################################
        # use venn2

        count = 0

        G = "G_" + clustering_unit
        # letter prefixes keep the per-level output files ordered by rank
        allowed_taxa = [
            "A_Phylum", "B_Class", "C_Order", "D_Family", "E_Genus",
            "F_Species", G
        ]

        venn_dict = {}

        ############################################################################
        ## create the progress bar window
        layout = [[sg.Text('Progress bar')],
                  [
                      sg.ProgressBar(1000,
                                     orientation='h',
                                     size=(20, 20),
                                     key='progressbar')
                  ], [sg.Cancel()]]
        window_progress_bar = sg.Window('Progress bar', layout)
        progress_bar = window_progress_bar['progressbar']
        # bar max is 1000; each of the 7 levels advances it by 167
        progress_update = 167 * 2
        ############################################################################

        for taxon in allowed_taxa:

            output_name = taxon
            # strip the ordering prefix ("A_", "B_", ...) to get the column name
            taxon = taxon[2:]
            col_name = taxon

            # the clustering unit lives in the "ID" column of the table
            if taxon in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
                col_name = taxon
                taxon = "ID"

            data_file_a = pd.read_excel(file_a, 'TaXon table', header=0)
            data_file_b = pd.read_excel(file_b, 'TaXon table', header=0)

            file_name_a = file_a.stem
            file_name_b = file_b.stem

            taxa_file_a = data_file_a[taxon].values.tolist()
            taxa_file_b = data_file_b[taxon].values.tolist()

            # dict.fromkeys keeps first-seen order while dropping duplicates
            taxa_unique_a = list(dict.fromkeys(taxa_file_a))
            taxa_unique_b = list(dict.fromkeys(taxa_file_b))

            taxa_labels_a = []
            taxa_labels_b = []
            taxa_sizes_a = []
            taxa_sizes_b = []

            # keep only assigned taxa (skip "nan") and count their occurrences
            for taxon_name in taxa_unique_a:
                if "nan" != str(taxon_name):
                    taxa_labels_a.append(str(taxon_name))
                    taxa_sizes_a.append(taxa_file_a.count(taxon_name))

            for taxon_name in taxa_unique_b:
                if "nan" != str(taxon_name):
                    taxa_labels_b.append(str(taxon_name))
                    taxa_sizes_b.append(taxa_file_b.count(taxon_name))

            taxa_labels_a = sorted(taxa_labels_a)
            taxa_labels_b = sorted(taxa_labels_b)

            # partition the taxa into exclusive and shared sets
            a_only = set(taxa_labels_a) - set(taxa_labels_b)
            len_a_only = len(a_only)
            b_only = set(taxa_labels_b) - set(taxa_labels_a)
            len_b_only = len(b_only)
            shared = set(taxa_labels_a) & set(taxa_labels_b)
            len_shared = len(shared)

            venn_dict[col_name + "_a_only"] = a_only
            venn_dict[col_name + "_shared"] = shared
            venn_dict[col_name + "_b_only"] = b_only

            plt.figure(figsize=(20, 10))
            # venn2 subset order: (A only, B only, A∩B)
            out = venn2(subsets=(len_a_only, len_b_only, len_shared),
                        set_labels=(file_name_a, file_name_b))
            for text in out.set_labels:
                text.set_fontsize(venn_font)
            for x in range(len(out.subset_labels)):
                if out.subset_labels[x] is not None:
                    out.subset_labels[x].set_fontsize(venn_font)

            dirName = Path(
                str(path_to_outdirs) + "/Venn_diagrams/" + venn_diagram_name)
            if not os.path.exists(dirName):
                os.mkdir(dirName)

            output_pdf = Path(str(dirName) + "/" + output_name + ".pdf")
            plt.title(output_name[2:])
            plt.savefig(output_pdf, bbox_inches='tight')

            # Species is the last named rank — offer to display that plot
            if taxon == "Species":
                answer = sg.PopupYesNo('Show last plot?', keep_on_top=True)
                if answer == "Yes":
                    plt.show(block=False)
                    sg.Popup("Close")

            plt.close()

            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += 167
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        # one column per set and level; rows padded with NaN by transpose
        output_xlsx = Path(str(dirName) + "/" + "Venn_comparison_results.xlsx")
        df = pd.DataFrame.from_dict(venn_dict, orient='index').transpose()
        df.to_excel(output_xlsx, index=False)

        sg.Popup("Venn diagrams are found in",
                 path_to_outdirs,
                 "Venn_diagrams/",
                 title="Finished",
                 keep_on_top=True)

        # log the comparison once per input file
        from taxontabletools.create_log import ttt_log
        ttt_log("venn diagram", "analysis", file_a.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)
        ttt_log("venn diagram", "analysis", file_b.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)

    else:
        ############################################################################
        # use venn3

        # three-way mode was requested but no third file selected
        if file_c == '':
            sg.PopupError("Please provide a file", keep_on_top=True)
            raise RuntimeError()

        file_c = Path(file_c)

        count = 0

        G = "G_" + clustering_unit
        # letter prefixes keep the per-level output files ordered by rank
        allowed_taxa = [
            "A_Phylum", "B_Class", "C_Order", "D_Family", "E_Genus",
            "F_Species", G
        ]

        venn_dict = {}

        ############################################################################
        ## create the progress bar window
        layout = [[sg.Text('Progress bar')],
                  [
                      sg.ProgressBar(1000,
                                     orientation='h',
                                     size=(20, 20),
                                     key='progressbar')
                  ], [sg.Cancel()]]
        window_progress_bar = sg.Window('Progress bar', layout)
        progress_bar = window_progress_bar['progressbar']
        # bar max is 1000; each of the 7 levels advances it by 167
        progress_update = 167 * 2
        ############################################################################

        for taxon in allowed_taxa:

            output_name = taxon
            # strip the ordering prefix ("A_", "B_", ...) to get the column name
            taxon = taxon[2:]
            col_name = taxon

            # the clustering unit lives in the "ID" column of the table
            if taxon in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
                col_name = taxon
                taxon = "ID"

            data_file_a = pd.read_excel(file_a, 'TaXon table', header=0)
            data_file_b = pd.read_excel(file_b, 'TaXon table', header=0)
            data_file_c = pd.read_excel(file_c, 'TaXon table', header=0)

            file_name_a = file_a.stem
            file_name_b = file_b.stem
            file_name_c = file_c.stem

            taxa_file_a = data_file_a[taxon].values.tolist()
            taxa_file_b = data_file_b[taxon].values.tolist()
            taxa_file_c = data_file_c[taxon].values.tolist()

            # dict.fromkeys keeps first-seen order while dropping duplicates
            taxa_unique_a = list(dict.fromkeys(taxa_file_a))
            taxa_unique_b = list(dict.fromkeys(taxa_file_b))
            taxa_unique_c = list(dict.fromkeys(taxa_file_c))

            taxa_labels_a = []
            taxa_labels_b = []
            taxa_labels_c = []
            taxa_sizes_a = []
            taxa_sizes_b = []
            taxa_sizes_c = []

            # keep only assigned taxa (skip "nan") and count their occurrences
            for taxon_name in taxa_unique_a:
                if "nan" != str(taxon_name):
                    taxa_labels_a.append(str(taxon_name))
                    taxa_sizes_a.append(taxa_file_a.count(taxon_name))

            for taxon_name in taxa_unique_b:
                if "nan" != str(taxon_name):
                    taxa_labels_b.append(str(taxon_name))
                    taxa_sizes_b.append(taxa_file_b.count(taxon_name))

            for taxon_name in taxa_unique_c:
                if "nan" != str(taxon_name):
                    taxa_labels_c.append(str(taxon_name))
                    taxa_sizes_c.append(taxa_file_c.count(taxon_name))

            taxa_labels_a = sorted(taxa_labels_a)
            taxa_labels_b = sorted(taxa_labels_b)
            taxa_labels_c = sorted(taxa_labels_c)

            # partition the taxa into exclusive, pairwise and triple sets
            a_only = set(taxa_labels_a) - set(taxa_labels_b) - set(
                taxa_labels_c)
            len_a_only = len(a_only)
            b_only = set(taxa_labels_b) - set(taxa_labels_a) - set(
                taxa_labels_c)
            len_b_only = len(b_only)
            c_only = set(taxa_labels_c) - set(taxa_labels_a) - set(
                taxa_labels_b)
            len_c_only = len(c_only)

            shared_all = set(taxa_labels_a) & set(taxa_labels_b) & set(
                taxa_labels_c)
            len_shared_all = len(shared_all)
            shared_a_b = set(
                taxa_labels_a) & set(taxa_labels_b) - set(taxa_labels_c)
            len_shared_a_b = len(shared_a_b)
            shared_a_c = set(
                taxa_labels_a) & set(taxa_labels_c) - set(taxa_labels_b)
            len_shared_a_c = len(shared_a_c)
            shared_b_c = set(
                taxa_labels_b) & set(taxa_labels_c) - set(taxa_labels_a)
            len_shared_b_c = len(shared_b_c)

            venn_dict[col_name + "_a_only"] = a_only
            venn_dict[col_name + "_b_only"] = b_only
            venn_dict[col_name + "_c_only"] = c_only
            venn_dict[col_name + "_shared_all"] = shared_all
            venn_dict[col_name + "_shared_a_b"] = shared_a_b
            venn_dict[col_name + "_shared_a_c"] = shared_a_c
            venn_dict[col_name + "_shared_b_c"] = shared_b_c

            plt.figure(figsize=(20, 10))
            # venn3 subset order: (Abc, aBc, ABc, abC, AbC, aBC, ABC)
            out = venn3(subsets=(len_a_only, len_b_only, len_shared_a_b,
                                 len_c_only, len_shared_a_c, len_shared_b_c,
                                 len_shared_all),
                        set_labels=(file_name_a, file_name_b, file_name_c))
            for text in out.set_labels:
                text.set_fontsize(venn_font)
            for x in range(len(out.subset_labels)):
                if out.subset_labels[x] is not None:
                    out.subset_labels[x].set_fontsize(venn_font)

            dirName = Path(
                str(path_to_outdirs) + "/Venn_diagrams/" + venn_diagram_name)
            if not os.path.exists(dirName):
                os.mkdir(dirName)

            output_pdf = Path(str(dirName) + "/" + output_name + ".pdf")
            plt.title(output_name[2:])
            plt.savefig(output_pdf, bbox_inches='tight')

            # Species is the last named rank — offer to display that plot
            if taxon == "Species":
                answer = sg.PopupYesNo('Show last plot?', keep_on_top=True)
                if answer == "Yes":
                    plt.show(block=False)
                    sg.Popup("Close")

            plt.close()

            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += 167
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        # one column per set and level; rows padded with NaN by transpose
        output_xlsx = Path(str(dirName) + "/" + "Venn_comparison_results.xlsx")
        df = pd.DataFrame.from_dict(venn_dict, orient='index').transpose()
        df.to_excel(output_xlsx, index=False)

        sg.Popup("Venn diagrams are found in",
                 path_to_outdirs,
                 "Venn_diagrams/",
                 title="Finished",
                 keep_on_top=True)

        # log the comparison once per input file
        from taxontabletools.create_log import ttt_log
        ttt_log("venn diagram", "analysis", file_a.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)
        ttt_log("venn diagram", "analysis", file_b.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)
        ttt_log("venn diagram", "analysis", file_c.name, output_xlsx.name,
                venn_diagram_name, path_to_outdirs)
def replicate_analysis(TaXon_table_xlsx, height, width, suffix_list,
                       path_to_outdirs, template, theme, font_size,
                       custom_colors, clustering_unit):
    """Analyse the agreement between the technical replicates of each sample.

    Every sample column is assumed to be named "<base>_<suffix>"; for each
    base name the replicate columns (base + suffix for every suffix in
    suffix_list) are compared:

      * figure 1 — percentage of OTUs shared by all replicates, per sample,
      * figure 2 — percentage of reads kept if only shared OTUs were
        retained, per sample,
      * figure 3 — shared vs. non-shared OTUs per read-abundance bin.

    All figures are written as PDF and HTML, together with a statistics
    text file, to <path_to_outdirs>/Replicate_analysis/<table name>/.

    Args:
        TaXon_table_xlsx: path to the input TaXon table (.xlsx).
        height, width: plot dimensions in pixels.
        suffix_list: replicate suffixes, e.g. ["a", "b"].
        path_to_outdirs: project output directory.
        template, theme, font_size: plotly styling options; theme is
            (bar color, bar outline color, opacity).
        custom_colors: colors for the shared/non-shared bar traces.
        clustering_unit: label of the sequence clustering unit (e.g. "OTUs").

    Raises:
        RuntimeError: when the user cancels the progress bar window.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from statistics import mean
    from pathlib import Path
    import os, webbrowser
    import plotly.express as px
    import plotly.graph_objects as go

    ## unpack the plot theme
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    height = int(height)
    width = int(width)

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)

    ## the first 10 columns are taxonomy/metadata; the rest are sample columns
    sample_names = TaXon_table_df.columns[10:].tolist()

    ## strip the trailing "_<suffix>" part to obtain each sample's base name
    unique_sample_names_list = [
        "_".join(sample.split("_")[0:-1]) for sample in sample_names
    ]
    unique_sample_names_set = sorted(set(unique_sample_names_list))

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [
                  sg.ProgressBar(1000,
                                 orientation='h',
                                 size=(20, 20),
                                 key='progressbar')
              ], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(unique_sample_names_set) + 1
    ############################################################################

    replicate_perc_shared_dict = {}  # sample -> % OTUs shared by all replicates
    fig_main_dict = {}  # (rank, sample, color, label) -> [% of total reads]
    reads_dict = {}  # sample -> [% reads kept, % reads discarded]

    ## create an output folder
    replicate_analysis_name = Path(TaXon_table_xlsx).name.replace(".xlsx", "")
    dirName = Path(
        str(path_to_outdirs) + "/Replicate_analysis/" +
        replicate_analysis_name)
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    for sample in unique_sample_names_set:
        ## expected replicate column names for this sample
        replicate_names_list = [
            sample + "_" + suffix for suffix in suffix_list
        ]

        try:
            ## OTUs with reads in every replicate
            shared_OTUs_list = [
                row for row in
                TaXon_table_df[replicate_names_list].values.tolist()
                if 0 not in row
            ]
            ## OTUs with reads in at least one replicate
            present_OTUs_list = [
                row for row in
                TaXon_table_df[replicate_names_list].values.tolist()
                if row != [0] * len(replicate_names_list)
            ]
            perc_shared = round(
                len(shared_OTUs_list) / len(present_OTUs_list) * 100, 2)
            replicate_perc_shared_dict[sample] = perc_shared

            ## calculate the percentage of reads that is discarded and kept
            reads_total = sum([
                sum(row) for row in
                TaXon_table_df[replicate_names_list].values.tolist()
            ])
            reads_kept_perc = round(
                sum([sum(row)
                     for row in shared_OTUs_list]) / reads_total * 100, 2)
            reads_discarded_perc = round(100 - reads_kept_perc, 2)
            reads_dict[sample] = [reads_kept_perc, reads_discarded_perc]

            ## collect every present OTU with its relative read abundance,
            ## tagged as shared (Blue) or non-shared (Red) for figure 3
            for i, OTU in enumerate(present_OTUs_list):
                if 0 not in OTU:
                    fig_main_dict[i + 1, sample, "Blue",
                                  "shared"] = [sum(OTU) / reads_total * 100]
                else:
                    fig_main_dict[i + 1, sample, "Red", "non-shared"] = [
                        sum(OTU) / reads_total * 100
                    ]
        except (KeyError, ZeroDivisionError):
            ## KeyError: at least one replicate column is missing;
            ## ZeroDivisionError: no OTU/read present in any replicate
            print("Warning! No replicates found for:    " + sample)

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ########################################################################################################################
    ## figure 1: shared OTUs per sample
    samples = list(replicate_perc_shared_dict.keys())
    shared_otus = list(replicate_perc_shared_dict.values())
    y_title = "shared " + clustering_unit
    fig = px.bar(x=samples,
                 y=shared_otus,
                 labels={
                     "y": y_title,
                     "x": "Sample",
                     "text": y_title
                 },
                 text=shared_otus)
    y_title = 'shared ' + clustering_unit + ' (%)'
    fig.update_yaxes(title=y_title, range=[0, 100], dtick=10, autorange=False)
    fig.update_xaxes(title='', tickmode='linear')
    fig.update_xaxes(tickangle=-90)
    fig.update_layout(width=int(width),
                      height=int(height),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)
    fig.update_traces(marker_color=color1,
                      marker_line_color=color2,
                      marker_line_width=1.5,
                      opacity=opacity_value)

    ## write files
    output_pdf = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_" +
        clustering_unit + ".pdf")
    output_html = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_" +
        clustering_unit + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ########################################################################################################################
    ## figure 2: percentage of reads kept per sample
    samples = list(reads_dict.keys())
    shared_reads = [reads[0] for reads in list(reads_dict.values())]
    fig = px.bar(x=samples,
                 y=shared_reads,
                 labels={
                     "y": "shared reads (%)",
                     "x": "Sample",
                     "text": "shared reads (%)"
                 },
                 text=shared_reads)
    fig.update_yaxes(title='shared reads (%)',
                     range=[0, 100],
                     dtick=10,
                     autorange=False)
    fig.update_xaxes(title='', tickmode='linear')
    fig.update_xaxes(tickangle=-90)
    fig.update_layout(width=int(width),
                      height=int(height),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)
    fig.update_traces(marker_color=color1,
                      marker_line_color=color2,
                      marker_line_width=1.5,
                      opacity=opacity_value)

    ## write files
    output_pdf2 = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared" +
        clustering_unit + "_reads.pdf")
    output_html2 = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared" +
        clustering_unit + "_reads.html")
    fig.write_image(str(output_pdf2))
    fig.write_html(str(output_html2))

    ########################################################################################################################
    ## figure 3: shared vs. non-shared OTUs per read-abundance bin
    ## sort the OTU dict by relative read abundance (descending)
    fig_main_dict_sorted = dict(
        sorted(fig_main_dict.items(), key=lambda item: item[1], reverse=True))

    ## split the relative abundances into shared (y1) and non-shared (y2) OTUs
    y1, y2 = [], []
    for key, value in fig_main_dict_sorted.items():
        if key[3] == 'shared':
            y1 = y1 + value
        else:
            y2 = y2 + value

    ## read-abundance bins in percent as (upper, lower); boundary values are
    ## excluded by the strict comparisons below (kept from the original)
    categories = [[100, 10], [10, 1], [1, 0.1], [0.1, 0]]
    n_OTUs_shared, n_OTUs_nonshared, names = [], [], []

    fig = go.Figure()

    for category in categories:
        upper = category[0]
        lower = category[1]
        shared = len([y for y in y1 if (y > lower and y < upper)])
        nonshared = len([y for y in y2 if (y > lower and y < upper)])
        n_OTUs = shared + nonshared
        ## skip empty bins to avoid a division by zero
        if n_OTUs == 0:
            continue
        n_OTUs_shared.append(shared / n_OTUs * 100)
        n_OTUs_nonshared.append(nonshared / n_OTUs * 100)
        ## bin label; the last, open-ended bin gets a "<" label
        if category != [0.1, 0]:
            text = str(category[0]) + "%-" + str(category[1]) + "%"
        else:
            text = "<0.1%"
        names.append(text)
        ## annotate each bin with its total OTU count
        fig.add_annotation(x=text,
                           y=100,
                           text="n=" + str(n_OTUs),
                           font=dict(size=font_size - 2),
                           showarrow=False,
                           yshift=10)

    fig.add_trace(
        go.Bar(x=names,
               y=n_OTUs_shared,
               name='shared',
               marker_color=custom_colors[0]))
    fig.add_trace(
        go.Bar(x=names,
               y=n_OTUs_nonshared,
               name='non-shared',
               marker_color=custom_colors[1]))

    fig.update_layout(width=int(width),
                      height=int(height),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)
    y_title = clustering_unit + ' per bin (%)'
    fig.update_yaxes(title=y_title)
    fig.update_xaxes(title='read abundance')

    ## write files
    output_pdf3 = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_nonshared.pdf")
    output_html3 = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_shared_nonshared.html")
    fig.write_image(str(output_pdf3))
    fig.write_html(str(output_html3))

    ########################################################################################################################

    ## write statistics file
    output_txt = Path(
        str(dirName) + "/" + TaXon_table_xlsx.stem + "_stats.txt")
    f = open(output_txt, "w")
    avg_shared_otus = round(mean(shared_otus), 2)
    avg_shared_reads = round(mean(shared_reads), 2)
    n_samples = len(samples)
    text = "Average shared " + clustering_unit + ": " + str(
        avg_shared_otus) + "%\n" + "Average shared reads: " + str(
            avg_shared_reads) + "%\n" + "Number of samples: " + str(n_samples)
    f.write(text)
    f.close()

    ## ask to show file
    answer = sg.PopupYesNo(text + '\n\nShow all three plots?',
                           keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html3))
        webbrowser.open('file://' + str(output_html2))
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "The three plots are found under:\n" + "Projects/Replicate_analysis/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("replicate analysis", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, "nan", path_to_outdirs)
# (removed extraction artifact: stray snippet separator "示例#5" / "0" — not program code)
def rarefaction_curve_taxa(TaXon_table_xlsx, repetitions, path_to_outdirs, template, font_size, taxonomic_level_1, taxonomic_level_2, color_discrete_sequence):
    """Draw sample-based rarefaction curves, one curve per taxon.

    For each taxon of *taxonomic_level_2*, samples are repeatedly drawn in
    random order (without replacement) and the cumulative number of distinct
    *taxonomic_level_1* entries is recorded per draw. Mean and standard
    deviation over *repetitions* replicates are plotted with error bars.
    The figure is written as PDF and HTML to the "Rarefaction_curves" folder,
    the user is optionally shown the HTML plot, and the run is logged.
    """

    import random
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    import plotly.express as px
    from pathlib import Path
    import webbrowser

    ## load the TaXon table
    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    df = df.replace(np.nan,"nan")

    ## y-axis label defaults to the lower-cased taxonomic level
    taxon_title = taxonomic_level_1.lower()

    ## OTU-like levels are stored in the "ID" column of a TaXon table
    if taxonomic_level_1 in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level_1
        taxonomic_level_1 = "ID"

    ## sample columns start at column 10 of a TaXon table
    available_samples = df.columns.tolist()[10:]
    sample_dict_clean = {}

    ## collect available taxa, ignoring unassigned ("nan") entries
    available_taxa = [taxon for taxon in set(df[taxonomic_level_2].values.tolist()) if taxon != "nan"]

    ## repeat the color sequence so it is long enough for all taxa,
    ## then map each taxon to a color
    color_discrete_sequence = color_discrete_sequence * len(available_taxa)
    color_dict = {}
    for i, taxon in enumerate(available_taxa):
        color_dict[taxon] = color_discrete_sequence[i]

    ## per-taxon list of mean richness values per draw
    increase_dict = {}

    ## create an empty figure
    fig = go.Figure()

    for taxon in sorted(available_taxa):
        df_filtered = df.loc[df[taxonomic_level_2] == taxon]

        # collect, per sample, the set of taxa present (reads != 0, not "nan")
        for sample in available_samples:
            sample_OTU_list = df_filtered[[sample, taxonomic_level_1]].values.tolist()
            sample_species_list = list(set([OTU[1] for OTU in sample_OTU_list if (OTU[0] != 0 and OTU[1] != "nan")]))
            sample_dict_clean[sample] = sample_species_list

        # each sample is drawn exactly once per replicate
        number_of_draws = len(sample_dict_clean.keys())

        # draw index -> list of richness values (one entry per replicate)
        draw_dictionary = {}

        for n_reps in range(0, repetitions):
            # work on a copy: samples are removed from it as they are drawn,
            # so every replicate needs a fresh dictionary to draw from
            sample_dict_to_draw = dict(sample_dict_clean)

            species_list = []
            species_set = []

            for i in range(0, number_of_draws):
                # choose a random, not yet drawn sample
                random_choice = random.choice(list(sample_dict_to_draw.keys()))
                # accumulate the taxa of the drawn sample
                species_list = species_list + sample_dict_clean[random_choice]
                # deduplicate and count
                species_set = set(species_list)
                n_species = len(species_set)
                # record the richness of draw i for this replicate
                if i not in draw_dictionary.keys():
                    draw_dictionary[i] = [n_species]
                else:
                    add_species_list = draw_dictionary[i]
                    add_species_list.append(n_species)
                    draw_dictionary[i] = add_species_list

                # remove the sample so it is drawn only once
                sample_dict_to_draw.pop(random_choice)

        # average richness and standard deviation per draw
        rarefaction_dict_average, rarefaction_dict_stdef = {}, {}

        def average(lst):
            return sum(lst) / len(lst)

        for key, value in draw_dictionary.items():
            average_species = average(draw_dictionary[key])
            stdef_species = np.std(draw_dictionary[key], dtype=np.float64)
            rarefaction_dict_average[key] = average_species
            rarefaction_dict_stdef[key] = stdef_species

        ## add one trace per taxon with std-dev error bars
        draws = [i+1 for i in rarefaction_dict_average.keys()]
        n_species = list(rarefaction_dict_average.values())
        increase_dict[taxon] = n_species
        error_bar = list(rarefaction_dict_stdef.values())
        fig.add_trace(go.Scatter(x=draws, y=n_species, name=taxon, marker_color=color_dict[taxon], error_y=dict(type='data', array=error_bar, thickness=0.5, width=3, visible=True)))

    ## update figure
    ## NOTE: a redundant intermediate update_layout call was removed here —
    ## every value it set was overwritten by the call below
    y_axis_title = "# " + taxon_title
    fig.update_layout(title_text="repetitions = " + str(n_reps+1), yaxis_title=y_axis_title, xaxis_title="# samples")
    fig.update_xaxes(rangemode="tozero")
    fig.update_yaxes(rangemode="tozero")
    fig.update_layout(height=800, width=1200, template=template, showlegend=True, font_size=font_size, title_font_size=font_size)

    ## write files
    ## NOTE(review): .name keeps the ".xlsx" suffix in the output file name;
    ## sibling functions use .stem — kept as-is to not change output paths
    out_name = taxonomic_level_1.lower() + "_" + taxonomic_level_2.lower()
    output_pdf = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_" + out_name + ".pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_" + out_name + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text (folder name fixed to match the actual output
    ## directory "Rarefaction_curves")
    closing_text = "Rarefaction curves are found in: " + str(path_to_outdirs) + "/Rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve per taxon", "analysis", TaXon_table_file.name, output_pdf.name, "nan", path_to_outdirs)
## Example 6
def create_krona_chart_multi(TaXon_table_xlsx, path_to_outdirs):
    """Create a multi-sample Krona chart from a TaXon table.

    Writes one Krona-formatted TSV per sample into a per-table subfolder of
    "Krona_charts", then combines them with the external "ktImportText" tool
    into a single HTML chart. Requires Krona tools on the PATH; raises
    RuntimeError (after a popup) if they are missing.
    """

    import subprocess, os, webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path

    ## probe for Krona tools; abort early if they are not installed
    try:
        subprocess.call(["ktImportText"], stdout=open(os.devnull, 'wb'))
    except Exception:
        ## narrowed from a bare except so Ctrl-C / SystemExit still propagate
        sg.PopupError(
            "Krona tools must be manually installed first!" + "\n" * 2 +
            "Note: Krona tools is currently not supported on Windows!" + "\n",
            title="Error")
        raise RuntimeError("Krona tools needs to be installed")

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    TaXon_table_df = TaXon_table_df.replace(np.nan, '__', regex=True)

    ## sample columns start at column 10; the first 10 hold the taxonomy
    samples = TaXon_table_df.columns.tolist()[10:]
    columns = TaXon_table_df.columns.tolist()[:10]

    # detect presence/absence data: every cell is either 0 or 1
    pa_test = set([
        val for sublist in TaXon_table_df[TaXon_table_samples].values.tolist()
        for val in sublist
    ])
    if pa_test == {1, 0}:
        pa_data = True
    else:
        pa_data = False

    ## create an output folder named after the input table
    krona_chart_name = Path(TaXon_table_xlsx).name.replace(".xlsx", "")
    dirName = Path(str(path_to_outdirs) + "/Krona_charts/" + krona_chart_name)
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    ## store the names of the sample tsv files
    sample_tsv_path = []

    ## write a separate tsv file for each sample in the TaXon table
    for sample in samples:
        row1 = ["sample-ID", "", "", "", "", "", ""]
        row2 = [
            "count", "phylum", "class", "order", "family", "genus", "species"
        ]
        krona_taxonomy_list = []
        krona_taxonomy_list.append(row1)
        krona_taxonomy_list.append(row2)

        for OTU in TaXon_table_df[columns + [sample]].values.tolist():
            # columns 1-6 of the row are the six taxonomy ranks
            taxonomy = OTU[1:7]
            # index 10 is the single sample column appended above
            reads = sum(OTU[10:])
            if reads != 0:
                if pa_data == True:
                    # presence/absence data: count every hit as 1
                    krona_taxonomy_list.append([1] + taxonomy)
                else:
                    krona_taxonomy_list.append([reads] + taxonomy)

        ## store the data in df
        krona_taxonomy_df = pd.DataFrame(krona_taxonomy_list)
        krona_table_tsv = Path(
            str(dirName) + "/" + sample.replace(" ", "_") + "_krona_table.tsv")
        sample_tsv_path.append(str(krona_table_tsv))
        # write krona table to tsv
        krona_taxonomy_df.to_csv(krona_table_tsv,
                                 sep="\t",
                                 header=False,
                                 index=False)

    krona_chart_html = Path(str(dirName) + "_krona_multi.html")
    ## call ktImportText with an argument list instead of os.system:
    ## robust against spaces and shell metacharacters in paths
    subprocess.call(["ktImportText"] + sample_tsv_path +
                    ["-o", str(krona_chart_html)])

    # finish script
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(krona_chart_html))

    closing_text = "Krona chart is found under:\n" + '/'.join(
        str(krona_chart_html).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("krona chart", "analysis", TaXon_table_xlsx.name,
            krona_chart_html.name, "nan", path_to_outdirs)
## Example 7
def beta_diversity(TaXon_table_xlsx, width, heigth, cmap, meta_data_to_test,
                   taxonomic_level, path_to_outdirs, template, font_size,
                   diss_metric):
    """Compute and plot a beta-diversity distance matrix with an ANOSIM test.

    Loads the TaXon table and its matching metadata table, drops samples whose
    chosen metadata value is empty, aggregates reads per taxonomic level,
    computes pairwise dissimilarities (diss_metric, via scikit-bio), runs
    ANOSIM against meta_data_to_test, and writes the heatmap (PDF/HTML) plus
    the distance matrix (XLSX) to the "Beta_diversity" folder.
    Raises RuntimeError when the metadata is unique per sample or identical
    for all samples; shows a popup error when the sample sets do not match.
    """

    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from skbio.stats.distance import anosim
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    # the metadata table is expected next to the project at a fixed location
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx,
                                   header=0).fillna("unidentified")
    # sample columns start at column 10 of a TaXon table
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx,
                                       header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    # column position of the chosen metadata category
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [
        i[0] for i in Meta_data_table_df.values.tolist()
        if i[metadata_loc] == "nan"
    ]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs (rows that became all-zero after dropping)
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # rebuild the metadata table without the dropped samples
        Meta_data_table_df = pd.DataFrame(
            [
                i for i in Meta_data_table_df.values.tolist()
                if i[0] not in drop_samples
            ],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    # refresh the metadata values after the potential drop above
    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level

    ## adjust taxonomic level if neccessary (OTU-like levels live in "ID")
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # check if the meta data differs: unique per sample makes ANOSIM useless
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(
            Meta_data_table_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    # check if the meta data differs: a single category cannot be compared
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup(
            "The meta data is similar for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

        ## collect samples for plot
        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hit of one taxonimic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe
        df_new = TaXon_table_df.groupby(
            TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in df_new.index:
            df_new = df_new.drop('unidentified')

        ## collect reads (one row per sample, as beta_diversity expects)
        data = df_new[samples].transpose().values.tolist()
        ## calculate dissimilarity distances
        dissimilarity_dm = beta_diversity(diss_metric, data, samples)

        # ANOSIM: metadata_list is ordered like `samples`, so grouping
        # labels line up with the distance-matrix rows
        anosim_results = anosim(dissimilarity_dm,
                                metadata_list,
                                permutations=999)
        anosim_r = round(anosim_results['test statistic'], 5)
        anosim_p = anosim_results['p-value']
        textbox = "Anosim (" + meta_data_to_test + ", " + taxon_title + ")<br>" + "R = " + str(
            anosim_r) + "<br>" + "p = " + str(anosim_p)

        # square distance matrix, labeled by sample for the XLSX export
        matrix = dissimilarity_dm.data
        matrix_df = pd.DataFrame(matrix)
        matrix_df.columns = samples
        matrix_df.index = samples

        # create plot
        color_label = diss_metric + " distance"
        fig = px.imshow(matrix,
                        x=samples,
                        y=samples,
                        color_continuous_scale=cmap,
                        labels=dict(color=color_label))
        fig.update_layout(height=int(heigth),
                          width=int(width),
                          template=template,
                          showlegend=True,
                          title=textbox,
                          font_size=font_size,
                          title_font_size=font_size)

        # finish script
        output_pdf = Path(
            str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
            TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" +
            taxon_title + "_" + diss_metric + ".pdf")
        output_html = Path(
            str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
            TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" +
            taxon_title + "_" + diss_metric + ".html")
        output_xlsx = Path(
            str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
            TaXon_table_xlsx.stem + "_" + meta_data_to_test + "_" +
            taxon_title + "_" + diss_metric + ".xlsx")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))
        matrix_df.to_excel(output_xlsx)

        ## ask to show plot
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## write to log file
        sg.Popup("Beta diversity estimate are found in",
                 path_to_outdirs,
                 "/Beta_diversity/",
                 title="Finished",
                 keep_on_top=True)
        from taxontabletools.create_log import ttt_log
        ttt_log("beta diversity", "analysis", TaXon_table_xlsx.name,
                output_pdf.name, meta_data_to_test, path_to_outdirs)

    else:
        sg.PopupError(
            "Error: The samples between the taxon table and meta table do not match!",
            keep_on_top=True)
def site_occupancy_heatmap(TaXon_table_xlsx, path_to_outdirs, template, height, width, meta_data_to_test, taxonomic_level, font_size, color_discrete_sequence, add_categories_sum):
    """Plot per-sample presence/absence heatmaps grouped by metadata category.

    Loads the TaXon table and its metadata table, drops samples whose chosen
    metadata value is empty, aggregates reads per taxonomic level, and draws
    one heatmap subplot per metadata category (plus an optional per-category
    summary row when add_categories_sum is True). Output PDF/HTML files are
    written to a per-table subfolder of "Site_occupancy_plots".
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    from pathlib import Path
    import webbrowser, os

    TaXon_table_xlsx =  Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0).fillna("unidentified")
    ## sample columns start at column 10 of a TaXon table
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## drop samples with metadata called nan (= empty)
    ## fixed: inspect the selected metadata column instead of hard-coded
    ## column 1 (matches the sibling beta_diversity implementation)
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove OTUs that became all-zero after dropping samples
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        Meta_data_table_df = pd.DataFrame([i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples], columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level

    ## adjust taxonomic level if neccessary (OTU-like levels live in "ID")
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    if len(set(metadata_list)) == 1:
        sg.PopupError("Please choose more than one meta data category.")
    else:

        if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

            ## define variables
            samples = TaXon_table_samples
            OTU_abundances_dict = {}
            samples_metadata_list = []

            ## extract the relevant data
            TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
            ## define an aggregation function to combine multiple hit of one taxonimic level
            aggregation_functions = {}
            ## define samples functions
            for sample in samples:
                ## 'sum' will calculate the sum of p/a data
                aggregation_functions[sample] = 'sum'
            ## define taxon level function
            aggregation_functions[taxonomic_level] = 'first'
            ## create condensed dataframe
            TaXon_table_df = TaXon_table_df.groupby(TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
            if 'unidentified' in TaXon_table_df.index:
                TaXon_table_df = TaXon_table_df.drop('unidentified')

            ## create a list of samples for each category
            category_dict = {}
            for sample, category in zip(Meta_data_table_samples, metadata_list):
                if category not in category_dict.keys():
                    category_dict[category] = [sample]
                else:
                    category_dict[category] = category_dict[category] + [sample]

            ## collect all available taxa
            taxa = TaXon_table_df[taxonomic_level].values.tolist()

            ## check if the respective species are present in the collections
            taxon_presence_dict = {}
            n_rows, row_heights = [], []

            ## repeat the color sequence so it covers all categories
            color_discrete_sequence = color_discrete_sequence * len(category_dict.keys())

            ## italicize genus and species names on the x axis
            if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
                x_values = ["<i>" + taxon + "</i>" for taxon in taxa]
            else:
                x_values = taxa

            ## one subplot row per category; optionally one extra summary row
            if add_categories_sum == True:
                for samples in category_dict.values():
                    row_heights.append(len(samples))
                row_heights.append(len(set(metadata_list)))
                fig = make_subplots(rows=len(set(metadata_list)) + 1, cols=1, shared_xaxes=True, vertical_spacing=0.05, row_heights=row_heights)
            else:
                for samples in category_dict.values():
                    row_heights.append(len(samples))
                fig = make_subplots(rows=len(set(metadata_list)), cols=1, shared_xaxes=True, vertical_spacing=0.05, row_heights=row_heights)

            row = 1
            for metadata, samples in category_dict.items():
                ## fixed: was `type(samples) == "str"`, which compares a type
                ## object to a string and is always False
                if isinstance(samples, str):
                    samples = [samples]
                z_values = []
                for sample in samples:
                    reads = TaXon_table_df[sample].values.tolist()
                    ## binarize reads to presence (1) / absence (0)
                    z_values = z_values + [[1 if x > 0 else 0 for x in reads]]
                y_values = samples
                fig.add_trace(go.Heatmap(z=z_values, x=x_values, y=y_values, showscale=False, xgap=1, ygap=1, hoverongaps = False, colorscale=[[0, "White"], [1, color_discrete_sequence[row-1]]]), row=row, col=1)
                row += 1

            ## optional summary row: one grey heatmap line per category
            if add_categories_sum == True:
                z_values, y_values = [], []
                for metadata, samples in category_dict.items():
                    reads = [sum(reads) for reads in TaXon_table_df[samples].values.tolist()]
                    z_values = z_values + [[1 if x > 0 else 0 for x in reads]]
                    y_values.append(metadata)
                fig.add_trace(go.Heatmap(z=z_values[::-1], x=x_values, y=y_values[::-1], showscale=False, xgap=1, ygap=1, hoverongaps = False, colorscale=[[0, "White"], [1, "Grey"]]), row=row, col=1)
                row += 1

            ## fixed: the template parameter was ignored (hard-coded "seaborn")
            fig.update_layout(width=int(width), height=int(height), template=template, font_size=font_size, yaxis_nticks=5, title_font_size=font_size)
            fig.update_xaxes(tickmode='linear')
            fig.update_yaxes(tickmode='linear')
            fig.update_xaxes(tickangle=-90)

            occupancy_plot_directory = Path(str(path_to_outdirs) + "/" + "Site_occupancy_plots" + "/" + TaXon_table_xlsx.stem)
            if not os.path.exists(occupancy_plot_directory):
                os.mkdir(occupancy_plot_directory)

            ## define output files
            output_pdf = Path(str(occupancy_plot_directory) + "/" + taxonomic_level + "_" + meta_data_to_test + "_heatmap.pdf")
            output_html = Path(str(occupancy_plot_directory) + "/" + taxonomic_level + "_" + meta_data_to_test + "_heatmap.html")

            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))

            ## ask to show file
            answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
            if answer == "Yes":
                webbrowser.open('file://' + str(output_html))

            ## print closing text
            closing_text = "Site occupancy heatmaps are found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
            sg.Popup(closing_text, title="Finished", keep_on_top=True)

            ## write to log (unused local "placeholder" removed)
            from taxontabletools.create_log import ttt_log
            ttt_log("site occupancy", "analysis", TaXon_table_xlsx.name, "", meta_data_to_test, path_to_outdirs)


        else:
            sg.Popup("The metdata table and taXon table are not matching!")
def replicate_consistency_filter(TaXon_table_xlsx, suffix_list, path_to_outdirs, consistency):
    """Merge sample replicates, optionally enforcing replicate consistency.

    Replicate columns are recognized as "<sample>_<suffix>" for each suffix
    in suffix_list. With consistency=True, an OTU's reads are zeroed for a
    sample unless it was detected in ALL replicates, then replicates are
    summed and all-zero OTUs are removed ("_cons.xlsx" output). With
    consistency=False, replicates are simply summed ("_merged.xlsx" output).
    A PySimpleGUI progress bar is shown; cancelling raises RuntimeError.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)

    ## sample columns start at column 10 of a TaXon table
    sample_names = TaXon_table_df.columns[10:].tolist()

    unique_sample_names_list = []
    replicates_dict = {}

    ## strip the trailing "_<suffix>" part to get the base sample name
    for sample in sample_names:
        sample_name = sample.split("_")[0:-1]
        unique_sample_names_list.append("_".join(sample_name))

    unique_sample_names_set = sorted(set(unique_sample_names_list))

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    ## NOTE(review): due to precedence this is (1000 / n) + 1, which slightly
    ## overshoots the bar maximum by design ("so that bar eventually reaches
    ## the maximum") — kept as-is
    progress_increase = 1000 / len(unique_sample_names_set) + 1
    ############################################################################

    ## merge and replicate consistency version
    if consistency == True:
        no_replicates_list = []
        for sample in unique_sample_names_set:

            ## expected replicate column names for this sample
            for i, suffix in enumerate(suffix_list):
                replicates_dict["rep_" + str(i)] = sample + "_" + str(suffix_list[i])

            replicate_names_list = list(replicates_dict.values())

            try:
                new_df = TaXon_table_df[replicate_names_list]
                header = new_df.columns.tolist()
                processed_reads = []

                ## consistency filter: if an OTU is missing from any
                ## replicate, zero it for all replicates of this sample
                for n_reads in new_df.values.tolist():
                    if 0 in n_reads:
                        if len(set(n_reads)) > 1:
                            n_reads = len(n_reads) * [0]
                    processed_reads.append(n_reads)

                df_out = pd.DataFrame(processed_reads)
                df_out.columns = header
                TaXon_table_df = TaXon_table_df.drop(replicate_names_list, axis=1)
                TaXon_table_df[sample] = df_out.sum(axis=1)

            except KeyError:
                ## narrowed from a bare except: missing replicate columns
                ## raise KeyError on the dataframe lookup above
                no_replicates_list.append(sample)

            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel'  or event is None:
                print('Cancel')
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        if len(no_replicates_list) == len(unique_sample_names_set):
            sg.PopupError("No replicates found. Please check your replicate suffixes.")
        else:
            dropped_OTUs_list = []
            # filter for 0 hit OTUs (can happen after consistency filtering)
            columns = TaXon_table_df.columns.tolist()
            TaXon_table_list = TaXon_table_df.values.tolist()
            TaXon_table_list_final = []
            for entry in TaXon_table_list:
                if sum(entry[10:]) != 0:
                    TaXon_table_list_final.append(entry)
                else:
                    print("Dropped:", entry[0], "(0 reads)")
                    dropped_OTUs_list.append(entry[0])

            taxon_tables_directory = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem)
            output_xlsx = Path(str(taxon_tables_directory) + "_cons.xlsx")

            TaXon_table_df = pd.DataFrame(TaXon_table_list_final, columns=columns)
            TaXon_table_df.to_excel(output_xlsx, sheet_name='TaXon table', index=False)

            closing_text = "Taxon table is found under:\n" + '/'.join(str(output_xlsx).split("/")[-4:]) + "\n\n" + str(len(dropped_OTUs_list)) + " OTUs were removed."
            sg.Popup(closing_text, title="Finished", keep_on_top=True)

            from taxontabletools.create_log import ttt_log
            ttt_log("replicate consistency", "processing", TaXon_table_xlsx.name, output_xlsx.name, "consistency merged", path_to_outdirs)

    ## merge only version
    else:
        no_replicates_list = []
        for sample in unique_sample_names_set:

            ## expected replicate column names for this sample
            for i, suffix in enumerate(suffix_list):
                replicates_dict["rep_" + str(i)] = sample + "_" + str(suffix_list[i])

            replicate_names_list = list(replicates_dict.values())

            try:
                new_df = TaXon_table_df[replicate_names_list]
                TaXon_table_df = TaXon_table_df.drop(replicate_names_list, axis=1)
                TaXon_table_df[sample] = new_df.sum(axis=1)
            except KeyError:
                ## narrowed from a bare except: missing replicate columns
                ## raise KeyError on the dataframe lookup above
                no_replicates_list.append(sample)

            ############################################################################
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel'  or event is None:
                print('Cancel')
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        if len(no_replicates_list) == len(unique_sample_names_set):
            sg.PopupError("No replicates found. Please check your replicate suffixes.")

        else:
            taxon_tables_directory = Path(str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + TaXon_table_xlsx.stem)
            output_xlsx = Path(str(taxon_tables_directory) + "_merged.xlsx")

            TaXon_table_df.to_excel(output_xlsx, sheet_name='TaXon table', index=False)

            closing_text = "Taxon table is found under:\n" + '/'.join(str(output_xlsx).split("/")[-4:])
            sg.Popup(closing_text, title="Finished", keep_on_top=True)

            from taxontabletools.create_log import ttt_log
            ttt_log("replicate merging", "processing", TaXon_table_xlsx.name, output_xlsx.name, "merged", path_to_outdirs)
## Example 10
def taxon_table_converter_qiime2(read_table_tsv, taxonomy_results_xlsx,
                                 TaXon_table_name, sheet_name,
                                 path_to_outdirs):
    """Combine a Qiime2 read table (TSV) and a taxonomy table (XLSX) into a
    TaXon table.

    Both inputs must list the same OTU IDs in the same order; otherwise an
    error popup is shown and no output is written. On success the merged
    table is saved as "<TaXon_table_name>.xlsx" in the project's
    "TaXon_tables" folder and the run is logged via ttt_log.

    Parameters
    ----------
    read_table_tsv : str or Path
        Qiime2 read table; its first data row is a Qiime2 comment row and
        is dropped before merging.
    taxonomy_results_xlsx : str or Path
        Taxonomy table; for the "BOLDigger hit" sheet the 'Flags' column
        is removed.
    TaXon_table_name : str
        Basename (without extension) of the output file.
    sheet_name : str
        Sheet of the taxonomy workbook to read.
    path_to_outdirs : Path
        Project output directory root.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path

    taxonomy_results_xlsx = Path(taxonomy_results_xlsx)
    read_table_tsv = Path(read_table_tsv)

    # create filename and path for output file
    Output_name = TaXon_table_name + ".xlsx"
    Output_file = path_to_outdirs / "TaXon_tables" / Output_name

    # create dataframes for both files
    taxonomy_df = pd.read_excel(taxonomy_results_xlsx, sheet_name, header=0)
    if sheet_name == "BOLDigger hit":
        taxonomy_df = taxonomy_df.drop(columns=['Flags'])

    # read_table_tsv is already a Path; no need to wrap it again
    read_table_df = pd.read_csv(read_table_tsv, sep="\t")

    # drop the first row (Qiime2 comment line) and re-index
    read_table_df = read_table_df.iloc[1:]
    read_table_df = read_table_df.reset_index(drop=True)

    ## the taxonomy table is the base of the new TaXon table
    ## (note: TaXon_table_df aliases taxonomy_df until the concat below)
    TaXon_table_df = taxonomy_df

    # check if all OTUs are correctly sorted and present in both files
    if taxonomy_df["ID"].to_list() == read_table_df["id"].to_list():

        ## append the sequences to the TaXon table
        TaXon_table_df["seq"] = read_table_df["Sequence"].values.tolist()

        ## remove the sequence and ID columns from the read table so only
        ## the per-sample read counts remain
        read_table_df.drop('Sequence', axis='columns', inplace=True)
        read_table_df.drop('id', axis='columns', inplace=True)

        ## add samples to the dataframe
        TaXon_table_df = pd.concat([TaXon_table_df, read_table_df], axis=1)

        ## check if species are present as "Genus" + "Epithet";
        ## prepend the genus when the epithet does not already contain it
        new_species_column = []
        for OTU in TaXon_table_df[["Genus",
                                   "Species"]].fillna("nan").values.tolist():
            if (OTU != ["nan", "nan"] and OTU[1] != 'nan'):
                if OTU[0] not in OTU[1]:
                    new_species_column.append(OTU[0] + " " + OTU[1])
                else:
                    new_species_column.append(OTU[1])
            else:
                new_species_column.append("")

        ## add new species column to the dataframe
        TaXon_table_df["Species"] = new_species_column

        ## save the newly created TaXon table in TaXon format as excel file
        TaXon_table_df.to_excel(Output_file,
                                sheet_name='TaXon table',
                                index=False)

        closing_text = "Taxon table is found under:\n" + '/'.join(
            str(Output_file).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        ## renamed from "input": do not shadow the builtin
        log_input = taxonomy_results_xlsx.name + " + " + read_table_tsv.name
        ttt_log("taXon table converter", "processing", log_input,
                Output_file.name, "qiime2", path_to_outdirs)

    else:
        sg.PopupError(
            "Error: The IDs of the read table and taxonomy table do not match!"
        )
def site_occupancy_barchart(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, path_to_outdirs, x_site_occ, y_site_occ, template, theme, font_size):
    """Plot the site occupancy (%) of each taxon, one bar chart per site.

    For every site (category of *meta_data_to_test* in the project's
    metadata table) the share of that site's samples in which each taxon
    occurs is computed and written as PDF/HTML plots plus an XLSX table
    under "Site_occupancy_plots/<table stem>/". The run is logged via
    ttt_log. Shows an error popup (and plots nothing) if the TaXon table
    samples and metadata samples do not match, or if the metadata column
    is unique per sample.

    Parameters
    ----------
    TaXon_table_xlsx : str or Path
        TaXon table to analyse; a matching "<stem>_metadata.xlsx" must
        exist in the project's "Meta_data_table" folder.
    meta_data_to_test : str
        Metadata column whose categories define the sites.
    taxonomic_level : str
        Level to analyse; "ASVs"/"ESVs"/"OTUs"/"zOTUs" map to column "ID".
    path_to_outdirs : Path
        Project output directory root.
    x_site_occ, y_site_occ : int
        Plot width / height in pixels.
    template : str
        Plotly template name.
    theme : sequence
        (bar color, bar outline color, opacity) triple.
    font_size : int
        Plot font size.
    """

    import os, webbrowser
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path
    import plotly.graph_objects as go
    import PySimpleGUI as sg

    # unpack the (fill color, outline color, opacity) theme triple
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    ## adjust taxonomic level if necessary: OTU-like levels live in the "ID" column
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # load the TaXon table and its companion metadata table;
    # sample columns start at column 10 by TaXon table convention
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header = 0)
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header = 0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs (rows whose remaining read counts are all 0)
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        # rebuild the metadata table without the dropped samples
        Meta_data_table_df = pd.DataFrame([i for i in Meta_data_table_df.values.tolist() if i[0] not in drop_samples], columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    TaXon_table_n_samples = len(TaXon_table_samples)
    n_sites = len(set(Meta_data_table_df[meta_data_to_test].tolist()))

    # NOTE(review): output_message is set here but never used below
    answer = "Ask"
    output_message = "No"

    # proceed only if samples match and the metadata column is not unique per
    # sample (occupancy over a single-sample "site" would be meaningless)
    if (sorted(TaXon_table_samples) == sorted(Meta_data_table_samples) and TaXon_table_n_samples != n_sites):

        site_occupancy_dict = {}

        sites = set(Meta_data_table_df[meta_data_to_test].tolist())

        for site in sites:
            # this can either be a species name or the above specified taxonomic level
            present_OTU_list = []

            # extract samples that belong to the site from the metadata file
            # NOTE(review): this matches rows where ANY metadata cell equals
            # the site value, not only the tested column — could misfire if
            # another column shares the value; confirm against callers
            included_samples_list = Meta_data_table_df[Meta_data_table_df.values  == site]['Samples'].values.tolist()

            # count the number of samples per site to calculate the site occupancy
            n_samples = len(included_samples_list)

            # create a list of all species (or the specified taxonomic level)
            # NOTE(review): dead branch — OTU-like levels were already mapped
            # to "ID" at the top of the function, so this never fires
            if taxonomic_level == "OTUs":
                taxonomic_level = "ID"
            # NOTE(review): this taxon set is loop-invariant; recomputed per site
            overall_included_species_list = TaXon_table_df[taxonomic_level].values.tolist()
            # make the list unique
            overall_included_species_set = set(overall_included_species_list)
            # remove potential 'nan's from the list
            overall_included_species_set = [x for x in overall_included_species_set if str(x) != 'nan']

            # create a set of species that is present at the sites
            for sample in included_samples_list:

                OTUs_per_species_list = []

                # check the read abundaces for each sample
                read_abundace_list = TaXon_table_df[sample].values.tolist()

                # enumerate the read abundaces for each sample and collect all lines that have more than one read
                for i, read_abundance in enumerate(read_abundace_list):
                    species = TaXon_table_df[taxonomic_level][i]
                    # if reads are present, collect the species name (or the specified taxonomic level) from the TaXon table
                    if read_abundance != 0:
                        OTUs_per_species_list.append(species)

                # remove all nans
                OTUs_per_species_list = [x for x in OTUs_per_species_list if str(x) != 'nan']
                # make list unique
                OTUs_per_species_list = list(set(OTUs_per_species_list))
                # append to list of species for the current site
                present_OTU_list.append(OTUs_per_species_list)

            # flatten the list of present species per site
            present_OTU_list_flattened = [val for sublist in present_OTU_list for val in sublist]

            # store occupancy of each species in a dict, will be accessed by position in list
            occupancy_dict = {}

            # count the number of occurences for each species and calculate the occpancy based on the number of samples
            for species in overall_included_species_set:
                count = present_OTU_list_flattened.count(species)
                occupancy = count / n_samples * 100
                occupancy_dict[species] = occupancy

            # sort taxa by ascending occupancy for plotting
            occupancy_dict = {k: v for k, v in sorted(occupancy_dict.items(), key=lambda item: item[1])}
            occupancy_list = list(occupancy_dict.values())
            species_list = list(occupancy_dict.keys())

            # italicize genus/species names on the x axis (plotly HTML tags)
            if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
                x_values = ["<i>" + taxon + "</i>" for taxon in species_list]
            else:
                x_values = species_list

            occupancy_plot_directory = Path(str(path_to_outdirs) + "/" + "Site_occupancy_plots" + "/" + TaXon_table_xlsx.stem)
            if not os.path.exists(occupancy_plot_directory):
                os.mkdir(occupancy_plot_directory)

            fig = go.Figure(data=[go.Bar(x=x_values, y=occupancy_list)])
            fig.update_traces(marker_color=color1, marker_line_color=color2,marker_line_width=0.6, opacity=opacity_value)
            fig.update_layout(title_text=site + " (" + taxonomic_level + ")", yaxis_title="occupancy (%)")
            fig.update_layout(height=int(y_site_occ), width=int(x_site_occ), template=template, font_size=font_size, title_font_size=font_size)
            fig.update_yaxes(range=[0,100])
            fig.update_xaxes(tickmode='linear')
            fig.update_xaxes(tickangle=-90)

            # one PDF/HTML/XLSX trio per site
            output_pdf = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".pdf")
            output_html = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".html")
            occupancy_table = Path(str(occupancy_plot_directory) + "/" + site + "_" + taxonomic_level + ".xlsx")
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            occupancy_df = pd.DataFrame(occupancy_list, species_list)
            occupancy_df.columns = ["Occupancy"]
            occupancy_df.index.name = "Taxon"
            occupancy_df = occupancy_df.sort_values("Occupancy")
            # sort the table numerical if OTUs were chosen
            # (assumes IDs look like "<prefix>_<number>" — TODO confirm)
            if taxonomic_level == "ID":
                sort_list = []
                for OTU in occupancy_df.index.tolist():
                    sort_list.append(int(OTU.split("_")[1]))
                occupancy_df["sort"] = sort_list
                occupancy_df = occupancy_df.sort_values("sort")
                occupancy_df = occupancy_df.drop("sort", axis=1)
            occupancy_df.to_excel(occupancy_table)

        ## ask to show file
        ## (output_html/output_pdf hold the paths from the LAST site iterated)
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## print closing text
        closing_text = "Site occupancy plots are found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        ## write to log
        from taxontabletools.create_log import ttt_log
        placeholder = TaXon_table_xlsx.name + " (multiple site occupancy plots)"
        ttt_log("site occupancy", "analysis", TaXon_table_xlsx.name, placeholder, meta_data_to_test, path_to_outdirs)

    else:
        sg.PopupError("Please check your Metadata file and Taxon table file: The samples do not match or the metadata is unique for all samples!", keep_on_top=True)
# Example #12
def read_proportions_pie(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size, color_discrete_sequence):
    """Create read-proportion pie charts: one per sample plus an overall chart.

    Read counts are converted to per-sample proportions at *taxonomic_level*
    ("ASVs"/"ESVs"/"OTUs"/"zOTUs" map to the "ID" column) and written as
    PDF + HTML pie charts under "Read_proportions_plots/<table stem>/".
    The run is logged via ttt_log.

    Raises
    ------
    RuntimeError
        If the table contains presence/absence (0/1) data or the user
        cancels the progress bar.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import os, webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    ## sample columns start at column 10 by TaXon table convention
    samples_list = TaXon_table_df.columns.tolist()[10:]

    ## check for presence absence data, otherwise abort with an error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1,0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError

    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo  => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best (lowest) available hit
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## human readable range => e.g. from 5 to 0 for species level
                for test in range(value_taxonomic_level-1,-1,-1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
                else:
                    ## fully unidentified OTU: keep the placeholder so the
                    ## column assignment below cannot fail on a length mismatch
                    best_hit_list.append("unidentified")
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################

    ## accumulates one proportion column per sample; None = not started yet
    ## (the previous sentinel compared a DataFrame with `is ""`, which relies
    ## on undefined string-literal identity and warns since Python 3.8)
    TaXon_table_df_2 = None

    for sample in samples_list:
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        ## per-OTU read proportion within this sample
        df_2 = df[[sample]]/df[[sample]].sum()
        df = df.assign(perc=df_2.values)
        ## sum the proportions per taxon at the chosen level
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel'  or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## create dataframe for plot
    plot_df = TaXon_table_df_2[samples_list]
    plot_df.index = TaXon_table_df_2[taxonomic_level]

    ##############################################################################
    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    dirName_samples = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/samples")
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    if not os.path.exists(dirName_samples):
        os.mkdir(dirName_samples)

    ## read abundance pie chart per sample (only taxa with reads > 0)
    for sample in samples_list:
        sample_df = plot_df.loc[plot_df[sample] > 0.0, [sample]]
        labels = sample_df.index.tolist()
        values = sample_df[sample].values.tolist()
        fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
        fig.update_layout(title=sample, annotations=[dict(text=taxonomic_level, x=0.5, y=0.5, showarrow=False)])
        fig.update_traces(textposition='inside')
        fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)

        output_pdf = Path(str(dirName_samples) + "/" + sample + "_" + taxon_title + "_pie.pdf")
        output_html = Path(str(dirName_samples) + "/" + sample + "_" + taxon_title + "_pie.html")

        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))

    ## main read abundance pie chart (reads summed over all samples)
    main_df = pd.DataFrame(TaXon_table_df[taxonomic_level].values.tolist(), list(TaXon_table_df[samples_list].sum(axis=1)), columns=["Taxon"])
    main_df["Reads"] = main_df.index
    df_2 = main_df["Reads"]/main_df["Reads"].sum()
    main_df = main_df.assign(perc=df_2.values*100)

    fig = go.Figure(data=[go.Pie(labels=main_df["Taxon"], values=main_df["perc"], marker_colors=color_discrete_sequence, hole=.3)])
    fig.update_traces(textposition='inside')
    fig.update_layout(annotations=[dict(text=taxon_title, x=0.5, y=0.5, showarrow=False)])
    fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)

    ## write files
    output_pdf = Path(str(dirName) + "/" + taxonomic_level + "_pie.pdf")
    output_html = Path(str(dirName) + "/" + taxonomic_level + "_pie.html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Read proportion plot is found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions pie chart", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
# Example #13
def read_proportions_bar(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size, color_discrete_sequence):
    """Create a stacked bar chart of read proportions (%) per sample.

    Read counts are converted to per-sample percentages at *taxonomic_level*
    ("ASVs"/"ESVs"/"OTUs"/"zOTUs" map to the "ID" column) and plotted as one
    stacked bar per sample (PDF + HTML) under
    "Read_proportions_plots/<table stem>/". The run is logged via ttt_log.

    Raises
    ------
    RuntimeError
        If the table contains presence/absence (0/1) data or the user
        cancels the progress bar.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.express as px
    from pathlib import Path
    import os, webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    ## sample columns start at column 10 by TaXon table convention
    samples_list = TaXon_table_df.columns.tolist()[10:]

    ## check for presence absence data, otherwise abort with an error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1,0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError

    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level.lower()
        # ask how to handle missing taxonomy
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo  => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best (lowest) available hit
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## human readable range => e.g. from 5 to 0 for species level
                for test in range(value_taxonomic_level-1,-1,-1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
                else:
                    ## fully unidentified OTU: keep the placeholder so the
                    ## column assignment below cannot fail on a length mismatch
                    best_hit_list.append("unidentified")
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ##############################################################################
    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    output_pdf = Path(str(dirName) + "/" + taxon_title + "_bar.pdf")
    output_html = Path(str(dirName) + "/" + taxon_title + "_bar.html")

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################

    ## accumulates one percentage column per sample; None = not started yet
    ## (the previous sentinel compared a DataFrame with `is ""`, which relies
    ## on undefined string-literal identity and warns since Python 3.8)
    TaXon_table_df_2 = None

    for sample in samples_list:
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        ## per-OTU read percentage within this sample
        df_2 = df[[sample]]/df[[sample]].sum()
        df = df.assign(perc=df_2.values*100)
        ## sum the percentages per taxon at the chosen level
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel'  or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## create long-format dataframe (Sample, Taxon, Reads) for plotly express;
    ## DataFrame.append is deprecated (removed in pandas 2.0) -> concat once
    frames = []
    for sample in samples_list:
        frames.append(pd.DataFrame([[sample] + entry for entry in TaXon_table_df_2[[taxonomic_level, sample]].values.tolist()], columns=["Sample", "Taxon", "Reads"]))
    plot_df = pd.concat(frames)

    n_taxa = len(TaXon_table_df_2[taxonomic_level].values.tolist())
    plot_df["Color"] = list(np.linspace(0,100,n_taxa)) * len(samples_list)

    fig = px.bar(plot_df, x="Sample", y="Reads", color="Taxon", color_discrete_sequence=color_discrete_sequence, labels={"Color": "Taxon"})
    fig.update_layout(barmode='stack', width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size)
    fig.update_yaxes(title_text="reads (%)")
    fig.update_xaxes(title_text="")

    ## write files
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Read proportion plot is found under:\n" + '/'.join(str(output_pdf).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions bar plot", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
# Example #14
def read_proportions_heatmap(TaXon_table_xlsx, taxonomic_level, path_to_outdirs, width_value, height_value, template, font_size):
    """Create a heatmap of read proportions (%) per sample and taxon.

    Read counts are converted to per-sample percentages at *taxonomic_level*
    ("ASVs"/"ESVs"/"OTUs"/"zOTUs" map to the "ID" column) and rendered as a
    discrete blue-scale heatmap (PDF + HTML) under
    "Read_proportions_plots/<table stem>/". The run is logged via ttt_log.

    Raises
    ------
    RuntimeError
        If the table contains presence/absence (0/1) data or the user
        cancels the progress bar.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.express as px
    import plotly.graph_objects as go
    from pathlib import Path
    import os, webbrowser

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("unidentified")
    ## sample columns start at column 10 by TaXon table convention
    samples_list = TaXon_table_df.columns.tolist()[10:]

    ## check for presence absence data, otherwise abort with an error message
    pa_test = set([val for sublist in TaXon_table_df[samples_list].values.tolist() for val in sublist])
    if pa_test == {1,0}:
        sg.Popup("Please do not use presence absence data!", title=("Error"))
        raise RuntimeError

    ## check for the taxonomic level to analyse
    if taxonomic_level not in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        ## create a y axis title text
        taxon_title = taxonomic_level.lower()
        # ask how to handle missing taxonomy
        answer = sg.PopupYesNo("Shall missing taxonomy be replaced by the best hit?\n\nYes => Replace missing taxonomy with the best available hit.\nNo  => Display missing taxonomy as \'unidentified\'.", title="Plotting strategy")
        if answer == "Yes":
            ## replace 'unidentified' with the best (lowest) available hit
            taxon_levels_dict = {"Phylum": 1, "Class": 2, "Order": 3, "Family": 4, "Genus": 5, "Species": 6}
            value_taxonomic_level = taxon_levels_dict[taxonomic_level]
            best_hit_list = []
            for taxon in TaXon_table_df[list(taxon_levels_dict.keys())].values.tolist():
                ## human readable range => e.g. from 5 to 0 for species level
                for test in range(value_taxonomic_level-1,-1,-1):
                    if taxon[test] != "unidentified":
                        best_hit_list.append(taxon[test])
                        break
                else:
                    ## fully unidentified OTU: keep the placeholder so the
                    ## column assignment below cannot fail on a length mismatch
                    best_hit_list.append("unidentified")
            TaXon_table_df[taxonomic_level] = best_hit_list
    else:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"


    ##############################################################################
    ## create a subfolder for better sorting and overview
    dirName = Path(str(path_to_outdirs) + "/" + "Read_proportions_plots" + "/" + TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    output_pdf = Path(str(dirName) + "/" + taxonomic_level + "_heatmap.pdf")
    output_html = Path(str(dirName) + "/" + taxonomic_level + "_heatmap.html")

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################

    ## accumulates one percentage column per sample; None = not started yet
    ## (the previous sentinel compared a DataFrame with `is ""`, which relies
    ## on undefined string-literal identity and warns since Python 3.8)
    TaXon_table_df_2 = None

    for sample in samples_list:
        df = TaXon_table_df[['ID', "Phylum", "Class", "Order", "Family", "Genus", "Species", sample]]
        ## per-OTU read percentage within this sample
        df_2 = df[[sample]]/df[[sample]].sum()
        df = df.assign(perc=df_2.values * 100)
        ## sum the percentages per taxon at the chosen level
        df["perc"] = df.groupby([taxonomic_level])['perc'].transform('sum')
        df_3 = df.drop_duplicates(subset=[taxonomic_level, 'perc'])
        df_3 = df_3.drop([sample], axis=1)
        df_3 = df_3.rename(columns={"perc": sample})
        if TaXon_table_df_2 is None:
            TaXon_table_df_2 = df_3
        else:
            TaXon_table_df_2 = TaXon_table_df_2.join(df_3[[sample]])

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel'  or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## create plot: samples on x, taxa on y
    plot_df = TaXon_table_df_2[samples_list]
    plot_df.index = TaXon_table_df_2[taxonomic_level]

    ## custom colorscale: grey for exact zero, then discrete 5% blue steps
    cs=[
    [0, "rgb(220,220,220)"],

    [0.00001, "rgb(255,255,255)"],
    [0.05, "rgb(255,255,255)"],

    [0.05, "rgb(242,242,255)"],
    [0.1, "rgb(242,242,255)"],

    [0.1, "rgb(229,229,255)"],
    [0.15, "rgb(229,229,255)"],

    [0.15, "rgb(216,216,255)"],
    [0.2, "rgb(216,216,255)"],

    [0.2, "rgb(203,203,255)"],
    [0.25, "rgb(203,203,255)"],

    [0.25, "rgb(190,190,255)"],
    [0.3, "rgb(190,190,255)"],

    [0.3, "rgb(177,177,255)"],
    [0.35, "rgb(177,177,255)"],

    [0.35, "rgb(164,164,255)"],
    [0.4, "rgb(164,164,255)"],

    [0.4, "rgb(155,155,255)"],
    [0.45, "rgb(155,155,255)"],

    [0.45, "rgb(138,138,255)"],
    [0.5, "rgb(138,138,255)"],

    [0.5,"rgb(125,125,255)"],
    [0.55,"rgb(125,125,255)"],

    [0.55, "rgb(112,112,255)"],
    [0.6, "rgb(112,112,255)"],

    [0.6, "rgb(99,99,255)"],
    [0.65, "rgb(99,99,255)"],

    [0.65, "rgb(86,86,255)"],
    [0.7, "rgb(86,86,255)"],

    [0.7, "rgb(73,73,255)"],
    [0.75, "rgb(73,73,255)"],

    [0.75, "rgb(60,60,255)"],
    [0.8, "rgb(60,60,255)"],

    [0.8, "rgb(47,47,255)"],
    [0.85, "rgb(47,47,255)"],

    [0.85, "rgb(34,34,255)"],
    [0.9, "rgb(34,34,255)"],

    [0.9, "rgb(21,21,255)"],
    [0.95, "rgb(21,21,255)"],

    [0.95, "rgb(8,8,255)"],
    [1, "rgb(8,8,255)"],

    ]

    ## italicize genus/species names on the y axis (plotly HTML tags)
    if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
        y_values = ["<i>" + taxon + "</i>" for taxon in plot_df.index.tolist()[::-1]]
    else:
        y_values = plot_df.index.tolist()[::-1]

    ## v2 heatmap (rows reversed so the first taxon appears on top)
    fig = go.Figure(data=go.Heatmap(
        z=plot_df.values.tolist()[::-1],
        x=plot_df.columns.tolist(),
        y=y_values,
        colorscale=cs))

    fig.update_layout(width=int(width_value), height=int(height_value), template=template, font_size=font_size, title_font_size=font_size, yaxis_nticks=len(plot_df.index.tolist()), xaxis_nticks=len(plot_df.index.tolist()), legend_title_text='reads (%)')

    ## write files
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text (directory name fixed: plots are written to
    ## "Read_proportions_plots", not "Read_proportion_plots")
    closing_text = "Read proportion plots are found in: " + str(path_to_outdirs) + "/Read_proportions_plots/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("read proportions heatmap", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def gbif_check_taxonomy(TaXon_table_xlsx, path_to_outdirs):
    """Check the taxonomy of a TaXon table against GBIF and apply corrections.

    Each OTU's taxonomy (Phylum..Species, truncated at the first empty
    level) is verified via gbif_parent_check (project helper). When GBIF
    returns a different taxonomy, the affected rows are replaced in the
    output table "TaXon_tables/<stem>_gbif.xlsx"; every replacement is
    also written to a change log "GBIF/<stem>_gbif_log.xlsx".

    Raises RuntimeError when the user cancels the progress bar window.
    """

    import requests_html, json
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    taxon_levels = ["Phylum", "Class", "Order", "Family", "Genus", "Species"]
    OTUs_list = TaXon_table_df["ID"].values.tolist()

    ## maps the original comma-joined taxonomy -> corrected taxonomy (6 levels)
    taxonomy_check_dict = {}

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [
                  sg.ProgressBar(1000,
                                 orientation='h',
                                 size=(20, 20),
                                 key='progressbar')
              ], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(OTUs_list)
    ############################################################################

    for OTU in TaXon_table_df[[
            "Phylum", "Class", "Order", "Family", "Genus", "Species"
    ]].fillna("").values.tolist():
        for i, taxonomy in enumerate(OTU):
            if taxonomy == "":
                ## taxonomy ends at level i: check only the known levels
                phylum_name = OTU[0]
                ## NOTE(review): for i == 0 this reads OTU[-1] (the Species
                ## slot) — looks accidental, confirm against gbif_parent_check
                taxon_name = OTU[i - 1]
                taxonomy_check = taxon_levels[0:i]
                result = gbif_parent_check(phylum_name, taxon_name,
                                           taxonomy_check)
                query = OTU[0:i]
                if (query != result and result != "ERROR"):
                    ## pad both sides to the full six levels before storing
                    if len(query) != 6:
                        query = query + [''] * (6 - len(query))
                    if len(result) != 6:
                        result = result + [''] * (6 - len(result))
                    taxonomy_check_dict[",".join(query)] = result
                break

            elif i == 5:
                ## fully identified OTU: verify all six levels
                phylum_name = OTU[0]
                taxon_name = OTU[5]
                result = gbif_parent_check(phylum_name, taxon_name,
                                           taxon_levels)
                if (OTU != result and result != "ERROR"):
                    taxonomy_check_dict[",".join(OTU)] = result

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## apply the collected corrections to the full table
    ## (columns 1..6 hold the taxonomy, column 0 the ID)
    TaXon_table_list = []
    for OTU in TaXon_table_df.fillna("").values.tolist():
        search_key = ','.join(OTU[1:7])
        replacement_taxonomy = taxonomy_check_dict.get(search_key)
        if replacement_taxonomy and replacement_taxonomy != [''] * 6:
            TaXon_table_list.append([OTU[0]] + replacement_taxonomy + OTU[7:])
        else:
            TaXon_table_list.append(OTU)

    file_name = TaXon_table_xlsx.stem
    output_name = Path(
        str(path_to_outdirs) + "/TaXon_tables/" + file_name + "_gbif" +
        ".xlsx")
    df_new = pd.DataFrame(TaXon_table_list,
                          columns=(TaXon_table_df.columns.values.tolist()))
    df_new.to_excel(output_name, sheet_name='TaXon table', index=False)

    ## change log: one "Input:" and one "Gbif:" row per correction
    change_log_list = []
    for key, value in taxonomy_check_dict.items():
        change_log_list.append(["Input:"] + key.split(","))
        change_log_list.append(["Gbif:"] + value)

    ## build the log dataframe once (the original constructed it twice)
    change_log_df = pd.DataFrame(change_log_list,
                                 columns=(["Change"] + taxon_levels))
    change_log_name = Path(
        str(path_to_outdirs) + "/GBIF/" + file_name + "_gbif_log" + ".xlsx")
    change_log_df.to_excel(change_log_name,
                           sheet_name='TaXon table',
                           index=False)

    closing_text = "Taxon table is found under:\n" + '/'.join(
        str(output_name).split("/")
        [-4:]) + "\n\n" + "Log file is found under:\n" + '/'.join(
            str(change_log_name).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    ttt_log("gbif check", "processing", TaXon_table_xlsx.name,
            output_name.name, "nan", path_to_outdirs)
## Example #16
def create_metadata_table(TaXon_table_xlsx, path_to_outdirs):
    """Create a metadata table template from the sample names of a TaXon table.

    Every sample name is split at underscores and written as one row, so
    the name parts can serve as metadata columns. The table is saved to
    "Meta_data_table/<stem>_metadata.xlsx"; an existing file is only
    overwritten after user confirmation.

    Raises RuntimeError when the user cancels the progress bar window.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    import sys, subprocess, os
    from pathlib import Path

    def open_table(table):
        ## open the file with the platform's default application
        if sys.platform == "win32":
            os.startfile(table)
        else:
            opener = "open" if sys.platform == 'darwin' else 'xdg-open'
            subprocess.call([opener, table])

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")

    TaXon_table_xslx_df = pd.read_excel(TaXon_table_xlsx)
    samples_list = TaXon_table_xslx_df.columns.tolist()[10:]
    samples_metadata_list = []

    def write_metadata_table():
        ## shared save / open / log routine (the original duplicated this
        ## verbatim in both branches at the bottom of the function)
        metadata_df = pd.DataFrame(samples_metadata_list)
        metadata_df.columns = ["Samples"] + [
            "col_" + str(column)
            for column in metadata_df.columns.tolist()[1:]
        ]
        metadata_df.to_excel(Meta_data_table_xlsx, index=False)

        answer = sg.PopupYesNo("Open metadata table?",
                               title="Finished",
                               keep_on_top=True)
        if answer == "Yes":
            open_table(Meta_data_table_xlsx)

        from taxontabletools.create_log import ttt_log
        ttt_log("meta data table", "analysis", TaXon_table_xlsx.name,
                Meta_data_table_xlsx.name, "nan", path_to_outdirs)

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [
                  sg.ProgressBar(1000,
                                 orientation='h',
                                 size=(20, 20),
                                 key='progressbar')
              ], [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    ## NOTE(review): parses as (1000 / len) + 1, so the bar overshoots the
    ## maximum slightly — seems intended per the "+1" comment below; confirm
    progress_increase = 1000 / len(samples_list) + 1
    ############################################################################

    for sample in samples_list:
        ## one row per sample: the full name first, then its underscore parts
        sample_metadata = [sample]
        for part in sample.split("_"):
            sample_metadata.append(part)
        samples_metadata_list.append(sample_metadata)

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    if Meta_data_table_xlsx.exists():
        ## only overwrite an existing metadata table after confirmation
        answer = sg.PopupYesNo("Metadata tables already exists! Overwrite?")
        if answer == "Yes":
            write_metadata_table()
    else:
        write_metadata_table()
def per_taxon_analysis(TaXon_table_xlsx, height, width, taxonomic_level, path_to_outdirs, template, theme, font_size, clustering_unit):
    """Plot per-taxon read proportions and OTU/species counts.

    Builds a two-panel bar chart for every taxon present at the chosen
    taxonomic level: panel A shows relative read abundance (%), panel B
    the number of OTUs with the species count overlaid as text. The
    figure is written as PDF and HTML to "Per_taxon_statistics" and the
    run is logged via ttt_log. If more than 8 taxa are found the user is
    asked whether to continue; answering "No" produces no output.

    theme is (bar color, line color, opacity); clustering_unit is the
    label for the counted units (e.g. "OTUs").
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import itertools, webbrowser

    ## save the taxon title
    taxon_title = clustering_unit

    ## collect plot variables
    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]
    height = int(height)
    width = int(width)

    ## load taxon table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_df = TaXon_table_df.fillna("nan")

    ## collect the taxa to test on
    taxa = sorted(list(set([taxon for taxon in TaXon_table_df[taxonomic_level].values.tolist() if taxon != "nan"])))

    ## check if there are more than 8 taxa
    answer = "Yes"
    if len(taxa) > 8:
        answer = sg.PopupYesNo("There are more than 8 taxa detected. This can render the plot difficult to read. Continue anyway?")

    if answer == "Yes":
        ## collect the OTUs
        OTUs = TaXon_table_df["ID"].values.tolist()

        ## count the number of OTUs per taxon
        n_OTUs = [TaXon_table_df[taxonomic_level].values.tolist().count(taxon) for taxon in taxa]

        ## collect all OTUs on species level
        ## groupby on the sorted pairs removes duplicate (taxon, species) pairs,
        ## so n_species counts distinct species per taxon
        OTU_species = [OTU for OTU in TaXon_table_df[[taxonomic_level, "Species"]].values.tolist() if OTU[1] != "nan"]
        OTU_species.sort()
        OTU_species = list(k for k,_ in itertools.groupby(OTU_species))
        OTU_species = [OTU[0] for OTU in OTU_species]
        n_species = [OTU_species.count(taxon) for taxon in taxa]

        ## count reads for each taxon
        ## assumes read counts start at column 10 — consistent with the
        ## other functions in this module
        n_reads = []
        for taxon in taxa:
            n_reads.append(sum([sum(OTU[10:]) for OTU in TaXon_table_df[TaXon_table_df[taxonomic_level]==taxon].values.tolist()]))

        ## italicise genus/species names on the x axis
        if (taxonomic_level == "Species" or taxonomic_level == "Genus"):
            x_values = ["<i>" + taxon + "</i>" for taxon in taxa]
        else:
            x_values = taxa

        ## calculate the read proportions
        reads_sum = sum(n_reads)
        n_reads = [round(reads / reads_sum * 100, 2) for reads in n_reads]

        ## create subplots
        fig = make_subplots(rows=1, cols=2, subplot_titles=("A)", "B)"))

        ## percentage of reads per taxonomic level
        hovertext = 'Taxon: %{x}, Reads: %{y}'
        fig.add_trace(go.Bar(hovertemplate=hovertext, name="",x=x_values, y=n_reads),row=1, col=1)
        fig.update_yaxes(title_text = "reads (%)", title_standoff=5, row=1, col=1)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, showlegend=False, row=1, col=1)

        ## Number of OTUs
        hovertext = 'Taxon: %{x}, OTUs: %{y}'
        title_text = "# " + taxon_title
        fig.add_trace(go.Bar(hovertemplate=hovertext, name="",x=x_values, y=n_OTUs, text=n_OTUs, showlegend=False),row=1, col=2)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, row=1, col=2)
        fig.update_yaxes(title_text=title_text, title_standoff=5, row=1, col=2, rangemode="tozero")

        ## Number of OTUs on species level
        ## text-only scatter placed at the OTU bar heights to overlay the
        ## species counts on panel B
        hovertext = 'Taxon: %{x}, Species: %{text}'
        fig.add_trace(go.Scatter(textposition = "top center", hovertemplate=hovertext, text=n_species, name="Species",x=x_values, y=n_OTUs, showlegend=False, mode='text'),row=1, col=2)
        fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1, opacity=opacity_value, row=1, col=2)

        ## fig.add_annotation( text='─ Species', align='left', showarrow=False, xref='paper', yref='paper', x=1.05, y=0.5, bordercolor='black', borderwidth=1)

        ## update the layout
        fig.update_layout(barmode='stack', height=int(height), width=int(width), template=template, showlegend=False, font_size=font_size, title_font_size=font_size)
        fig.update_xaxes(tickmode='linear')
        fig.update_xaxes(tickangle=-90)

        ## write ouput files
        output_pdf = Path(str(path_to_outdirs) + "/Per_taxon_statistics/" + TaXon_table_xlsx.stem + "_" + taxonomic_level + ".pdf")
        output_html = Path(str(path_to_outdirs) + "/Per_taxon_statistics/" + TaXon_table_xlsx.stem + "_" + taxonomic_level + ".html")
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))

        ## ask to show file
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## print closing text
        closing_text = "Plots are found under: " + str(path_to_outdirs) + "/Per_taxon_statistics/"
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        ## write to log
        from taxontabletools.create_log import ttt_log
        ttt_log("per taxon statistics", "analysis", TaXon_table_xlsx.name, output_pdf.name, "", path_to_outdirs)
def filter_samples(TaXon_table_xlsx, selected_samples, appendix_name,
                   path_to_outdirs, sample_filter_method):
    """Filter samples from a TaXon table and save the reduced table.

    sample_filter_method == "exclude" removes the selected samples; any
    other value keeps only the selected samples. OTUs whose remaining
    read counts are all zero are dropped. The result is written to
    "TaXon_tables/<stem>_<appendix_name>.xlsx"; the removal statistics
    are shown to the user and logged via ttt_log.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)

    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    n_old_OTUs = len(df["ID"].values.tolist())

    ## a single sample arrives as a plain string -> normalise to a list
    if type(selected_samples) == str:
        selected_samples = [selected_samples]

    if sample_filter_method == "exclude":
        ## drop all selected sample columns in one call
        df = df.drop(columns=list(selected_samples))
    else:
        ## keep only the selected samples; the first 10 columns hold
        ## taxonomy/metadata and are always preserved
        available_samples = df.columns.tolist()[10:]
        df = df.drop(columns=[
            sample for sample in available_samples
            if sample not in selected_samples
        ])

    header = df.columns.values.tolist()

    ## drop OTUs without any reads left in the remaining samples
    row_filter_list = [
        row for row in df.values.tolist() if set(row[10:]) != {0}
    ]

    ## pass the columns explicitly so an all-filtered (empty) result still
    ## carries the header instead of raising on column assignment
    df = pd.DataFrame(row_filter_list, columns=header)

    file_name = TaXon_table_file.stem
    output_name = Path(
        str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + file_name + "_" +
        appendix_name + ".xlsx")
    df.to_excel(output_name, sheet_name='TaXon table', index=False)

    ## print results for the user
    n_remaining_OTUs = len(df["ID"].values.tolist())
    diff_abs = n_old_OTUs - n_remaining_OTUs
    diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

    ## finish script
    closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(
        diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(
            str(output_name).split("/")[-4:])
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## single import (the original imported ttt_log twice)
    from taxontabletools.create_log import ttt_log
    log_text = str(diff_abs) + " OTUs ; " + str(diff_rel) + "%"
    ttt_log("sample filter", "processing", TaXon_table_file.name,
            output_name.name, log_text, path_to_outdirs)
## Example #19
def betadiv_clustering(TaXon_table_xlsx, height, width, threshold,
                       betadiv_linkage, taxonomic_level, path_to_outdirs,
                       template, font_size, diss_metric):
    """Cluster samples by beta diversity and draw a dendrogram.

    Read counts are aggregated per taxon at the chosen taxonomic level,
    a sample-by-sample dissimilarity matrix (diss_metric, via scikit-bio
    beta_diversity) is computed, and the samples are hierarchically
    clustered (betadiv_linkage) into a left-oriented dendrogram. The
    plot (PDF and HTML) and the distance matrix (XLSX) are written to
    "Beta_diversity" and the run is logged via ttt_log.
    """

    from scipy.cluster.hierarchy import dendrogram, linkage
    import plotly.figure_factory as ff
    import numpy as np
    import pandas as pd
    from skbio.diversity import beta_diversity
    from pathlib import Path
    import PySimpleGUI as sg
    import webbrowser

    ## import table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx,
                                   header=0).fillna("unidentified")

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## adjust taxonomic level if neccessary
    ## sequence-unit levels map to the "ID" column instead of a taxonomy column
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## collect samples for plot (read counts start at column 10)
    samples = TaXon_table_df.columns.tolist()[10:]

    ## extract the relevant data
    TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
    ## define an aggregation function to combine multiple hit of one taxonimic level
    aggregation_functions = {}
    ## define samples functions
    for sample in samples:
        ## 'sum' will calculate the sum of p/a data
        aggregation_functions[sample] = 'sum'
    ## define taxon level function
    aggregation_functions[taxonomic_level] = 'first'
    ## create condensed dataframe (one row per taxon)
    df_new = TaXon_table_df.groupby(
        TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
    if 'unidentified' in df_new.index:
        df_new = df_new.drop('unidentified')

    ## collect reads (one row per sample after the transpose)
    data = df_new[samples].transpose().values.tolist()
    ## calculate jaccard distances
    dissimilarity_dm = beta_diversity(diss_metric, data, samples)
    ## convert to distance matrix
    X1 = dissimilarity_dm.data
    matrix_df = pd.DataFrame(X1)
    matrix_df.columns = samples
    matrix_df.index = samples
    ## convert to 2D array
    X2 = dissimilarity_dm.condensed_form()
    ## cluster dendrogram
    ## NOTE(review): linkagefun ignores its argument and always links on the
    ## precomputed condensed matrix X2 — appears intentional, but confirm
    fig = ff.create_dendrogram(
        X1,
        labels=samples,
        color_threshold=float(threshold),
        orientation="left",
        linkagefun=lambda x: linkage(X2, betadiv_linkage, metric=diss_metric))
    fig.update_yaxes(ticks="")
    ## NOTE(review): this placeholder title is overwritten by xaxis_title below
    fig.update_xaxes(title="A")
    title = str(diss_metric) + " distance"
    fig.update_layout(xaxis_title=title,
                      height=int(height),
                      width=int(width),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)

    # finish script
    output_pdf = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".pdf")
    output_html = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".html")
    output_xlsx = Path(
        str(path_to_outdirs) + "/" + "Beta_diversity" + "/" +
        TaXon_table_xlsx.stem + "_" + taxon_title + "_dendrogram_" +
        diss_metric + ".xlsx")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))
    matrix_df.to_excel(output_xlsx)

    ## ask to show plot
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## write to log file
    sg.Popup(diss_metric + " clustering dendrograms are found in",
             path_to_outdirs,
             "/Beta_diversity/",
             title="Finished",
             keep_on_top=True)
    from taxontabletools.create_log import ttt_log
    ttt_log(diss_metric + " clustering", "analysis", TaXon_table_xlsx.name,
            output_pdf.name, "", path_to_outdirs)
def read_filter(TaXon_table_xlsx, path_to_outdirs, read_filter_method,
                read_filter_treshold):
    """Zero out low-abundance read counts and save the filtered table.

    read_filter_method == "absolute_filtering": counts below the absolute
    threshold are set to 0. "relative_filtering": the threshold is a
    percentage of each sample's total reads. OTUs left with only zeros
    are removed, the result is written to
    "TaXon_tables/<stem>_<threshold>.xlsx", and the removal statistics
    are shown to the user and logged via ttt_log.
    """
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    from pathlib import Path
    import numpy as np

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    ## read counts start at column 10
    samples = TaXon_table_df.columns.tolist()[10:]

    def _finish(log_tag):
        ## shared tail of both filter modes (the original duplicated it):
        ## drop all-zero OTUs, save, report and log

        ## remove OTUs that have 0 reads after filtering
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        TaXon_table_df_filtered = pd.DataFrame(row_filter_list)
        TaXon_table_df_filtered.columns = TaXon_table_df.columns.tolist()

        ## save filtered dataframe to file; str() guards against a
        ## numeric threshold in the filename concatenation
        file_name = TaXon_table_file.stem
        output_name = Path(
            str(path_to_outdirs) + "/TaXon_tables/" + file_name + "_" +
            str(read_filter_treshold) + ".xlsx")
        TaXon_table_df_filtered.to_excel(output_name,
                                         sheet_name='TaXon table',
                                         index=False)

        ## print results for the user
        n_old_OTUs = len(TaXon_table_df["ID"].values.tolist())
        n_remaining_OTUs = len(TaXon_table_df_filtered["ID"].values.tolist())
        diff_abs = n_old_OTUs - n_remaining_OTUs
        diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

        ## finish script
        closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(
            diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(
                str(output_name).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        log_text = str(read_filter_treshold) + " ; " + str(
            diff_abs) + " OTUs ; " + str(diff_rel) + "%"
        ttt_log(log_tag, "processing", TaXon_table_file.name,
                output_name.name, log_text, path_to_outdirs)

    if read_filter_method == "absolute_filtering":
        ## zero every count below the absolute threshold in one pass
        a = np.array(TaXon_table_df[samples].values.tolist())
        TaXon_table_df[samples] = np.where(a < int(read_filter_treshold), 0,
                                           a).tolist()
        _finish("absolute read filter")

    elif read_filter_method == "relative_filtering":
        ## transform to percentage
        read_filter_rel = float(read_filter_treshold) / 100
        for sample in samples:
            a = np.array(TaXon_table_df[sample].values.tolist())
            ## per-sample threshold: percentage of that sample's total reads
            sample_threshold = sum(a) * read_filter_rel
            TaXon_table_df[sample] = np.where(a < int(sample_threshold), 0,
                                              a).tolist()
        _finish("relative read filter")
def calculate_taxonomic_resolution(TaXon_table_xlsx, path_to_outdirs,
                                   x_tax_res, y_tax_res, figure_type, template,
                                   theme, font_size, clustering_unit):
    """Plot the taxonomic resolution of a TaXon table as a bar chart.

    figure_type "a": per taxonomic level, the number of OTUs whose
    HIGHEST identified level is that level. Any other value ("b"): the
    total number of OTUs identified at each level. The plot is written
    as PDF and HTML to "Taxonomic_resolution_plots" and logged via
    ttt_log. theme is (bar color, line color, opacity).
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import webbrowser

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_file)
    TaXon_table_df = TaXon_table_df.replace(np.nan, 'nan', regex=True)

    taxonomic_levels = [
        "Phylum", "Class", "Order", "Family", "Genus", "Species"
    ]

    title = "# " + clustering_unit

    ## number of OTUs identified at (at least) each level
    statistics_dict = {}
    for taxon_to_evaluate in taxonomic_levels:
        taxa_list = [
            x for x in TaXon_table_df[taxon_to_evaluate].values.tolist()
            if str(x) != 'nan'
        ]
        statistics_dict[taxon_to_evaluate] = len(taxa_list)

    ## number of OTUs whose highest identification IS this level:
    ## count at this level minus count at the next finer level
    highest_level_dict = {}
    for level, next_level in zip(taxonomic_levels, taxonomic_levels[1:]):
        highest_level_dict[level] = statistics_dict[level] - statistics_dict[
            next_level]
    highest_level_dict["Species"] = statistics_dict["Species"]

    taxon_levels = list(highest_level_dict.keys())
    highest_level_OTUs = list(highest_level_dict.values())
    total_OTUs = list(statistics_dict.values())

    ## the two figure types share everything except the y values, the
    ## title and the file suffix (the original duplicated the whole block)
    if figure_type == "a":
        y_values = highest_level_OTUs
        plot_title = 'Taxonomic resolution (highest taxonomic level)'
        suffix = "a"
    else:
        y_values = total_OTUs
        plot_title = 'Taxonomic resolution (total number of OTUs)'
        suffix = "b"

    fig = go.Figure(data=[
        go.Bar(x=taxon_levels,
               y=y_values,
               name="Taxon",
               textposition="outside",
               text=y_values)
    ])
    fig.update_traces(marker_color=color1,
                      marker_line_color=color2,
                      marker_line_width=1,
                      opacity=opacity_value)
    fig.update_layout(title_text=plot_title, yaxis_title=title)
    fig.update_layout(height=int(y_tax_res),
                      width=int(x_tax_res),
                      template=template,
                      font_size=font_size,
                      title_font_size=font_size)

    ## finish script
    output_pdf = Path(
        str(path_to_outdirs) + "/" + "Taxonomic_resolution_plots" + "/" +
        TaXon_table_file.stem + "_taxonomic_resolution_" + suffix + ".pdf")
    output_html = Path(
        str(path_to_outdirs) + "/" + "Taxonomic_resolution_plots" + "/" +
        TaXon_table_file.stem + "_taxonomic_resolution_" + suffix + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## write log file
    from taxontabletools.create_log import ttt_log
    ttt_log("taxonomic resolution", "analysis", TaXon_table_file.name,
            output_pdf.name, "plot " + suffix, path_to_outdirs)

    ## NOTE(review): the output directory above is capitalised
    ## "Taxonomic_resolution_plots" but this message uses lowercase —
    ## kept as in the original; confirm which is correct
    closing_text = "\n" + "Taxonomic resolution plots are found in: " + str(
        path_to_outdirs) + "/taxonomic_resolution_plots/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)
def taxon_filter(TaXon_table_xlsx, filtered_taxa, mask, appendix_name,
                 threshold, path_to_outdirs, taxon_filter_method):
    """Filter OTUs of a TaXon table by taxon name and similarity threshold.

    Parameters
    ----------
    TaXon_table_xlsx : str or Path
        Path to the TaXon table workbook (sheet 'TaXon table').
    filtered_taxa : str or list
        Taxa to exclude (or to keep, see taxon_filter_method). A single
        string is accepted and treated as a one-element list.
    mask : str
        Column name of the taxonomic level to filter on.
    appendix_name : str
        Suffix appended to the output file name.
    threshold : int or str
        Minimum similarity (in %) an OTU must reach to be kept.
    path_to_outdirs : str or Path
        Project output directory; result goes to <path>/TaXon_tables/.
    taxon_filter_method : str
        "keep" keeps only the given taxa (and drops unidentified rows);
        any other value excludes the given taxa.

    Shows a PySimpleGUI popup with the result and writes a log entry.
    """

    import PySimpleGUI as sg
    import pandas as pd
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)

    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_file)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)

    # convert taxa to a list if only one taxon is given (which is then a string)
    if isinstance(filtered_taxa, str):
        filtered_taxa = [filtered_taxa]
    else:
        # work on a copy so the caller's list is not mutated by the
        # threshold entry appended below
        filtered_taxa = list(filtered_taxa)

    mask_position = list(df.columns).index(mask)
    df_columns = df.columns

    if taxon_filter_method == "keep":
        # invert the selection: everything that is NOT kept gets excluded
        available_taxa = set(df[mask].values.tolist())
        available_taxa = sorted(x for x in available_taxa if str(x) != 'nan')
        filtered_taxa = list(set(available_taxa) - set(filtered_taxa))
        # in "keep" mode unidentified (nan) rows are dropped as well
        drop_nan = True
    else:
        drop_nan = False

    ## single row-filter pass for both modes (previously duplicated)
    rows_to_keep = []
    for row in df.values.tolist():
        taxon_to_evaluate = row[mask_position]
        if taxon_to_evaluate not in filtered_taxa:
            if not drop_nan or str(taxon_to_evaluate) != 'nan':
                rows_to_keep.append(row)

    df_out = pd.DataFrame(rows_to_keep)

    ## apply the similarity threshold; 'No Match' entries are always removed
    similarity_position = list(df_columns).index("Similarity")
    threshold = int(threshold)

    filtered_rows = []
    for index, row in df_out.iterrows():
        similarity = list(row)[similarity_position]
        if similarity != 'No Match' and int(similarity) >= threshold:
            filtered_rows.append(list(row))

    df_out = pd.DataFrame(filtered_rows)

    if df_out.empty:
        sg.PopupError('Filter threshold was too harsh: nothing to print',
                      title="Error",
                      keep_on_top=True)

    else:
        df_out.columns = df_columns

        # write output file
        file_name = TaXon_table_file.stem
        output_name = Path(
            str(path_to_outdirs) + "/" + "TaXon_tables" + "/" + file_name +
            "_" + appendix_name + ".xlsx")
        threshold_output = "Similarity threshold = " + str(threshold)
        filtered_taxa.append(threshold_output)
        df_filtered_taxa = pd.DataFrame(filtered_taxa)
        df_filtered_taxa.columns = ['Filter criteria']
        # context manager replaces the deprecated writer.save()/close()
        # pair (ExcelWriter.save was removed in pandas 2.0)
        with pd.ExcelWriter(output_name, engine='xlsxwriter') as writer:
            df_out.to_excel(writer, sheet_name='TaXon table', index=False)
            df_filtered_taxa.to_excel(writer,
                                      sheet_name='Filter criteria',
                                      index=False)

        ## print results for the user
        n_old_OTUs = len(df["ID"].values.tolist())
        n_remaining_OTUs = len(df_out["ID"].values.tolist())
        diff_abs = n_old_OTUs - n_remaining_OTUs
        diff_rel = round(100 - n_remaining_OTUs / n_old_OTUs * 100, 2)

        ## finish script
        closing_text = "Removed " + str(diff_abs) + " OTUs (" + str(
            diff_rel) + "%).\n\n" + "Taxon table is found under:\n" + '/'.join(
                str(output_name).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        from taxontabletools.create_log import ttt_log
        log_text = str(diff_abs) + " OTUs ; " + str(diff_rel) + "%"
        ttt_log("taxon filter", "processing", TaXon_table_file.name,
                output_name.name, log_text, path_to_outdirs)
# Example #23
def rarefaction_curve_legacy(TaXon_table_xlsx, repetitions, path_to_outdirs, template, theme, font_size, taxonomic_level_1):
    """Draw an all-in-one, sample-based rarefaction curve.

    For each repetition the samples are drawn once each in random order;
    after every draw the cumulative number of distinct taxa is recorded.
    The mean (with standard deviation error bars) over all repetitions is
    plotted against the number of drawn samples. Writes a PDF and an HTML
    plot to <path_to_outdirs>/Rarefaction_curves and logs the run.
    """

    import random
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from pathlib import Path
    import webbrowser

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    ## create a y axis title text
    taxon_title = taxonomic_level_1.lower()

    ## OTU-like levels are stored in the "ID" column of the TaXon table
    if taxonomic_level_1 in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level_1
        taxonomic_level_1 = "ID"

    TaXon_table_file = Path(TaXon_table_xlsx)

    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_xlsx)
    df = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    df = df.replace(np.nan, "nan")

    available_samples = df.columns.tolist()[10:]

    ## per sample: set of identified taxa that have at least one read
    sample_dict_clean = {}
    for sample in available_samples:
        sample_OTU_list = df[[sample, taxonomic_level_1]].values.tolist()
        sample_dict_clean[sample] = set(
            OTU[1] for OTU in sample_OTU_list if OTU[0] != 0 and OTU[1] != "nan")

    # each repetition draws every sample exactly once
    number_of_draws = len(sample_dict_clean)

    # draw_dictionary[i] collects, across repetitions, the cumulative taxon
    # counts observed after i+1 drawn samples
    draw_dictionary = {}

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / repetitions
    ############################################################################

    for n_reps in range(0, repetitions):
        ## a fresh random permutation replaces the old copy-and-pop scheme:
        ## identical draw distribution, no dict copying per repetition
        draw_order = random.sample(list(sample_dict_clean), number_of_draws)

        species_set = set()
        for i, random_choice in enumerate(draw_order):
            ## set union avoids the quadratic list concatenation of the
            ## previous implementation
            species_set |= sample_dict_clean[random_choice]
            draw_dictionary.setdefault(i, []).append(len(species_set))

        ############################################################################
        event, values = window_progress_bar.read(timeout=1)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    ## average number of taxa and standard deviation per draw
    rarefaction_dict_average, rarefaction_dict_stdef = {}, {}
    for key, value in draw_dictionary.items():
        rarefaction_dict_average[key] = sum(value) / len(value)
        rarefaction_dict_stdef[key] = np.std(value, dtype=np.float64)

    # draw the plot
    draws = [i + 1 for i in rarefaction_dict_average.keys()]
    n_species = list(rarefaction_dict_average.values())
    error_bar = list(rarefaction_dict_stdef.values())
    y_axis_title = "# " + taxon_title
    fig = go.Figure(data=[go.Scatter(x=draws, y=n_species, error_y=dict(type='data', array=error_bar, thickness=0.5, width=3, visible=True))])
    ## use the repetitions argument directly (the old n_reps+1 raised a
    ## NameError for repetitions == 0)
    fig.update_layout(title_text="repetitions = " + str(repetitions), yaxis_title=y_axis_title, xaxis_title="# samples")
    fig.update_traces(marker_color=color1, marker_line_color=color2, opacity=opacity_value)
    fig.update_layout(height=800, width=1200, template=template, showlegend=False, font_size=font_size, title_font_size=font_size)

    ## write files (HTML now follows the same naming pattern as the PDF;
    ## it previously used an inconsistent "_rarefaction_taxa" infix)
    output_pdf = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_" + taxon_title + ".pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_" + taxon_title + ".html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Rarefaction curves are found in: " + str(path_to_outdirs) + "/rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve all-in-one", "analysis", TaXon_table_file.name, output_pdf.name, "nan", path_to_outdirs)
def subtract_NCs(TaXon_table_xlsx, path_to_outdirs, negative_controls):
    """Subtract negative-control reads from all samples of a TaXon table.

    The read sums of the negative-control columns are subtracted per OTU
    from every remaining sample; negative results are clipped to 0 and
    OTUs left without any reads are removed. The result is written to
    <path_to_outdirs>/TaXon_tables/<stem>_NCsub.xlsx and logged.

    Parameters
    ----------
    TaXon_table_xlsx : str or Path
        Path to the TaXon table workbook.
    path_to_outdirs : str or Path
        Project output directory.
    negative_controls : list
        Column names of the negative-control samples.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path

    ## load taxon table
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx, header=0)

    ## collect samples (everything after the 10 taxonomy columns, minus NCs)
    samples = [
        sample for sample in TaXon_table_df.columns.to_list()[10:]
        if sample not in negative_controls
    ]
    ## calculate sum of NCs per OTU
    df_nc_sum = TaXon_table_df[negative_controls].sum(axis=1)

    ## start from the 10 taxonomy columns; .copy() avoids writing into a
    ## view of the original frame (SettingWithCopy)
    df_out = TaXon_table_df[TaXon_table_df.columns.tolist()[0:10]].copy()

    ## subtract the NC sum from each sample; appending (instead of the old
    ## insert-at-10 loop) preserves the original sample order
    for sample in samples:
        df_out[sample] = (TaXon_table_df[sample] - df_nc_sum).values.tolist()

    ## negative read counts make no sense: clip them to 0
    num = df_out._get_numeric_data()
    num[num < 0] = 0

    ## remove OTUs that lost all of their reads
    out_list = [OTU for OTU in df_out.values.tolist() if sum(OTU[10:]) != 0]

    ## abort if no OTU survived the subtraction (the old check tested
    ## df_out, which is never empty at this point)
    if not out_list:
        sg.PopupError('Filter threshold was too harsh: nothing to print',
                      title="Error",
                      keep_on_top=True)

    else:
        output_xlsx = Path(
            str(path_to_outdirs) + "/" + "TaXon_tables" + "/" +
            TaXon_table_xlsx.stem + "_NCsub.xlsx")
        df_out = pd.DataFrame(out_list,
                              columns=df_out.columns.tolist()).replace(
                                  "nan", "")
        df_out.to_excel(output_xlsx, sheet_name="TaXon table", index=False)

        from taxontabletools.create_log import ttt_log
        ttt_log("nc subtract", "processing", TaXon_table_xlsx.name,
                output_xlsx.name, "nan", path_to_outdirs)

        ## finish script
        closing_text = str(
            len(TaXon_table_df) - len(df_out)
        ) + " OTUs were removed. The Taxon table is found under:\n" + '/'.join(
            str(output_xlsx).split("/")[-4:])
        sg.Popup(closing_text, title="Finished", keep_on_top=True)
# Example #25
def rarefaction_curve_reads(TaXon_table_xlsx, repetitions, width, height, path_to_outdirs, template, theme, font_size):
    """Plot per-sample, read-based rarefaction curves in a subplot grid.

    For each sample the reads are expanded to one list entry per read
    (labelled with its OTU ID); random subsamples of 0-100 % (5 % steps)
    are drawn `repetitions` times and the mean number of distinct OTUs is
    plotted. Writes a PDF and an HTML file to
    <path_to_outdirs>/Rarefaction_curves and logs the run.
    """

    import pandas as pd
    import PySimpleGUI as sg
    import numpy as np
    from statistics import mean
    from pathlib import Path
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import math, webbrowser

    TaXon_table_file = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("")
    # sample columns start after the 10 taxonomy columns
    samples = TaXon_table_df.columns.tolist()[10:]
    scatter_size = 5

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    height = int(height)

    ## count rows and columns to create subplots
    # the grid is 4 plots wide; n_columns = 5 is the wrap sentinel used
    # below (column_count runs 1..4 and resets when it reaches 5)
    n_rows = math.ceil(len(samples) / 4)
    n_columns = 5
    column_count = 1
    row_count = 1
    fig = make_subplots(rows=n_rows, cols=4, subplot_titles=samples, shared_yaxes=True)

    ## calculate maximum number of OTUs
    # shared y-axis limit: largest per-sample OTU count plus headroom
    max_OTUs = []
    for sample in samples:
        max_OTUs.append(len([OTU for OTU in TaXon_table_df[sample] if OTU != 0]))
    y_limit = max(max_OTUs) + 20

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    progress_increase = 1000 / len(samples)
    ############################################################################

    ############################################################################
    # initial read shows the bar at 0 and allows an early cancel
    event, values = window_progress_bar.read(timeout=1)
    if event == 'Cancel'  or event is None:
        window_progress_bar.Close()
        raise RuntimeError
    # update bar with loop value +1 so that bar eventually reaches the maximum
    progress_update += 0
    progress_bar.UpdateBar(progress_update)
    ############################################################################

    for sample in samples:

        ## filter sample from data
        read_df = TaXon_table_df[[sample, "ID"]]
        ## drop empty OTUs
        read_df = read_df[read_df[sample] != 0]
        ## create read list to draw the subsamples from
        # one Series entry per read, labelled with the OTU ID it belongs to
        read_list = pd.Series(np.repeat(read_df['ID'].to_list(), read_df[sample].to_list()))

        output = []

        ## draw random sample
        # subsample fractions 0.00, 0.05, ..., 1.00
        for perc in np.arange(0.00, 1.05, 0.05):
            ## calculate sample size
            sub_sample_size = int(len(read_list) * perc)

            ## draw X subsamples of that size
            # nunique() counts distinct OTUs in each subsample (no replacement)
            mean_species = mean([read_list.sample(n = sub_sample_size).nunique() for i in range(repetitions)])

            output.append(mean_species)

        output = pd.DataFrame({'percentage': np.arange(0.00, 1.05, 0.05), 'mean_OTUs': output})

        ## write plot
        fig.add_trace(go.Scatter(x=output["percentage"], y=output["mean_OTUs"], name=sample, mode='markers+lines', marker=dict(size=int(scatter_size))), row=row_count, col=column_count)
        fig.update_traces(marker_color=color1, marker_line_color=color2, opacity=opacity_value, row=row_count, col=column_count)
        fig.update_yaxes(range=[0, y_limit], row=row_count, col=column_count)

        ## add a y axis title to all left bound plots
        if column_count == 1:
            fig.update_yaxes(title_text="# OTUs", row=row_count, col=column_count)

        ## add x axis title to all plots in the last row
        # if row_count == n_rows:
        #     fig.update_xaxes(title_text="subsample (%)", row=row_count, col=column_count)

        # advance the grid position; each new row adds 100 px to the figure
        column_count += 1
        if column_count == n_columns:
            column_count = 1
            row_count += 1
            height += 100

        ############################################################################
        event, values = window_progress_bar.read(timeout=1)
        if event == 'Cancel'  or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    fig.update_layout(height=int(height), width=int(width), template=template, font_size=font_size, title_font_size=font_size, showlegend=False)
    fig.update_yaxes(rangemode="tozero")
    fig.update_xaxes(rangemode="tozero")

    ## write files
    # NOTE(review): uses TaXon_table_file.name (keeps the .xlsx extension in
    # the output file name) rather than .stem as other functions do — confirm
    # whether this is intended
    output_pdf = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_reads.pdf")
    output_html = Path(str(path_to_outdirs) + "/" + "Rarefaction_curves" + "/" + TaXon_table_file.name + "_rarefaction_reads.html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show file
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    ## print closing text
    closing_text = "Rarefaction curves are found in: " + str(path_to_outdirs) + "/rarefaction_curves/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    ## write log
    from taxontabletools.create_log import ttt_log
    ttt_log("rarefaction curve reads", "analysis", TaXon_table_file.name, output_pdf.name, repetitions, path_to_outdirs)
# Example #26
def NMDS_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width,
                  height, nmds_s, max_iter_val, n_init_val, path_to_outdirs,
                  template, font_size, color_discrete_sequence,
                  nmds_dissimilarity):

    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from sklearn.manifold import MDS
    import plotly.graph_objects as go
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import os, webbrowser
    from itertools import combinations

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx,
                                   header=0).fillna("unidentified")
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx,
                                       header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [
        i[0] for i in Meta_data_table_df.values.tolist()
        if i[metadata_loc] == "nan"
    ]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        Meta_data_table_df = pd.DataFrame(
            [
                i for i in Meta_data_table_df.values.tolist()
                if i[0] not in drop_samples
            ],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## adjust taxonomic level if neccessary
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    ## create a subfolder for better sorting and overview
    dirName = Path(
        str(path_to_outdirs) + "/" + "NMDS_plots" + "/" +
        TaXon_table_xlsx.stem + "/")
    if not os.path.exists(dirName):
        os.mkdir(dirName)

    # check if the meta data differs
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(
            Meta_data_table_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]

        ## define an aggregation function to combine multiple hit of one taxonimic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'

        ## create condensed dataframe
        df_new = TaXon_table_df.groupby(
            TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in df_new.index:
            df_new = df_new.drop('unidentified')

        ## collect reads
        data = df_new[samples].transpose().values.tolist()
        ## calculate jaccard distances
        jaccard_dm = beta_diversity(nmds_dissimilarity, data, samples)

        ## NMDS function
        def nmds_function(matrix, dimensions):
            nmds = MDS(n_components=dimensions,
                       metric=False,
                       dissimilarity='precomputed',
                       max_iter=int(max_iter_val),
                       n_init=int(n_init_val))
            nmds_results = nmds.fit(jaccard_dm[:100])
            stress = round(nmds_results.stress_, 2)
            nmds_array = nmds_results.embedding_
            return ({"stress": stress, "nmds_results": nmds_array})

        answer = sg.PopupOKCancel(
            "The NMDS calculation may take a while. Continue?")

        if answer == "OK":
            ## test different dimensions
            nmds_results_dict = {}
            stress_dict = {}
            for i in range(1, 11):
                nmds_results = nmds_function(jaccard_dm, i)
                nmds_results_dict[i] = nmds_results
                stress_dict[i] = nmds_results["stress"]

            ####################################################################################################
            win2_active = True

            layout2 = [
                [sg.Text("NMDS analysis options", size=(20, 1))],
                [sg.CB("Show stress plot", default=True, key="stress_plot")],
                [sg.CB("Show NMDS 2D plot", default=True, key="2d_plot")],
                [sg.CB("Show NMDS 3D plot", default=True, key="3d_plot")],
                [sg.CB("Connect categories", default=True, key="draw_mesh")],
                [sg.Text("")], [sg.Button("Apply")]
            ]

            win2 = sg.Window('NMDS analysis', layout2, keep_on_top=False)

            while True:
                event2, values2 = win2.Read()
                if event2 is None or event2 == 'Apply':
                    win2.close()
                    win2_active = False
                    break

            ####################################################################################################

            ## plot stress and dimensions
            fig = go.Figure()
            fig.add_trace(
                go.Scatter(x=list(stress_dict.keys()),
                           y=list(stress_dict.values()),
                           mode='markers+lines',
                           name=sample,
                           marker=dict(color="Blue", size=int(10))))
            fig.update_layout(showlegend=False,
                              xaxis_title="Dimensions",
                              yaxis_title="Stress")
            fig.update_layout(height=int(600),
                              width=int(800),
                              template=template,
                              showlegend=False,
                              font_size=font_size,
                              title_font_size=font_size)

            ## define output files
            output_pdf = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_stress.pdf")
            output_html = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_stress.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['stress_plot'] == True:
                webbrowser.open('file://' + str(output_html))

            ####################################################################################################

            ## plot 2D
            stress = stress_dict[2]
            if values2["draw_mesh"] == True:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(
                    nmds_results_dict[2]["nmds_results"], index=[samples])
                nmds_results_df.rename(columns={
                    0: 'NMDS1',
                    1: 'NMDS2'
                },
                                       inplace=True)
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[
                    meta_data_to_test].values.tolist()

                combinations_list = []
                for metadata in nmds_results_df[meta_data_to_test]:
                    ## collect all entries for the respective metadata
                    arr = nmds_results_df.loc[
                        nmds_results_df[meta_data_to_test] == metadata][[
                            'NMDS1', 'NMDS2', meta_data_to_test, "Sample"
                        ]].to_numpy()
                    ## create a df for all possible combinations using itertools combinations
                    for entry in list(combinations(arr, 2)):
                        combinations_list.append(list(entry[0]))
                        combinations_list.append(list(entry[1]))

                ## create a dataframe to draw the plot from
                df = pd.DataFrame(combinations_list)
                df.columns = ['NMDS1', 'NMDS2', meta_data_to_test, "Sample"]

                ## plot NMDS
                fig = go.Figure()
                fig = px.scatter(
                    df,
                    x="NMDS1",
                    y="NMDS2",
                    hover_data=['Sample'],
                    color=meta_data_to_test,
                    color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(nmds_s),
                                  mode="markers+lines",
                                  line=dict(width=0.5))
                fig.update_layout(title="Stress=" + str(stress),
                                  yaxis_title="NMDS1",
                                  xaxis_title="NMDS2")
                fig.update_layout(height=int(height),
                                  width=int(width),
                                  template=template,
                                  showlegend=True,
                                  font_size=font_size,
                                  title_font_size=font_size)

            else:

                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(
                    nmds_results_dict[2]["nmds_results"], index=[samples])
                nmds_results_df.rename(columns={0: 'X', 1: 'Y'}, inplace=True)
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[
                    meta_data_to_test].values.tolist()
                nmds_results_df["Sample"] = samples
                ## plot NMDS
                fig = go.Figure()
                fig = px.scatter(
                    nmds_results_df,
                    x="X",
                    y="Y",
                    hover_data=['Sample'],
                    color=meta_data_to_test,
                    color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(nmds_s), mode="markers")
                fig.update_layout(title="Stress=" + str(stress),
                                  yaxis_title="NMDS1",
                                  xaxis_title="NMDS2")
                fig.update_layout(height=int(height),
                                  width=int(width),
                                  template=template,
                                  showlegend=True,
                                  font_size=font_size,
                                  title_font_size=font_size)

            ## define output files
            output_pdf = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_2d.pdf")
            output_html = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_2d.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['2d_plot'] == True:
                webbrowser.open('file://' + str(output_html))

            ####################################################################################################

            ## plot 3D
            stress = stress_dict[3]

            if values2["draw_mesh"] == True:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(
                    nmds_results_dict[3]["nmds_results"], index=[samples])
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[
                    meta_data_to_test].values.tolist()
                nmds_results_df.rename(columns={
                    0: 'NMDS1',
                    1: 'NMDS2',
                    2: 'NMDS3'
                },
                                       inplace=True)

                combinations_list = []
                for metadata in nmds_results_df[meta_data_to_test]:
                    ## collect all entries for the respective metadata
                    arr = nmds_results_df.loc[
                        nmds_results_df[meta_data_to_test] == metadata][[
                            'NMDS1', 'NMDS2', 'NMDS3', meta_data_to_test,
                            "Sample"
                        ]].to_numpy()
                    ## create a df for all possible combinations using itertools combinations
                    for entry in list(combinations(arr, 2)):
                        combinations_list.append(list(entry[0]))
                        combinations_list.append(list(entry[1]))

                ## create a dataframe to draw the plot from
                df = pd.DataFrame(combinations_list)
                df.columns = [
                    'NMDS1', 'NMDS2', 'NMDS3', meta_data_to_test, "Sample"
                ]

                ## plot NMDS
                fig = go.Figure()
                ## draw the plot
                fig = px.scatter_3d(
                    df,
                    x="NMDS1",
                    y="NMDS2",
                    z="NMDS3",
                    color=meta_data_to_test,
                    text="Sample",
                    title="textbox",
                    color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(12),
                                  mode="markers+lines",
                                  line=dict(width=1))
                fig.update_layout(height=int(height),
                                  width=int(width),
                                  template=template,
                                  title="Stress=" + str(stress),
                                  showlegend=True,
                                  font_size=font_size,
                                  title_font_size=font_size)
                fig.update_layout(scene=dict(xaxis_title="NMDS1",
                                             yaxis_title="NMDS2",
                                             zaxis_title="NMDS3"))

            else:
                ## create dataframe from NMDS results
                nmds_results_df = pd.DataFrame(
                    nmds_results_dict[3]["nmds_results"], index=[samples])
                nmds_results_df["Sample"] = samples
                nmds_results_df[meta_data_to_test] = Meta_data_table_df[
                    meta_data_to_test].values.tolist()
                nmds_results_df.rename(columns={
                    0: 'NMDS1',
                    1: 'NMDS2',
                    2: 'NMDS3'
                },
                                       inplace=True)
                ## plot NMDS
                fig = go.Figure()
                ## draw the plot
                fig = px.scatter_3d(
                    nmds_results_df,
                    x="NMDS1",
                    y="NMDS2",
                    z="NMDS3",
                    color=meta_data_to_test,
                    color_discrete_sequence=color_discrete_sequence)
                fig.update_traces(marker_size=int(12),
                                  mode="markers",
                                  line=dict(width=1))
                fig.update_layout(height=int(height),
                                  width=int(width),
                                  template=template,
                                  title="Stress=" + str(stress),
                                  showlegend=True,
                                  font_size=font_size,
                                  title_font_size=font_size)
                fig.update_layout(scene=dict(xaxis_title="NMDS1",
                                             yaxis_title="NMDS2",
                                             zaxis_title="NMDS3"))

            ## define output files
            output_pdf = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_3d.pdf")
            output_html = Path(
                str(dirName) + "/" + meta_data_to_test + "_" + taxon_title +
                "_3d.html")
            ## write output files
            fig.write_image(str(output_pdf))
            fig.write_html(str(output_html))
            ## ask to show file
            if values2['3d_plot'] == True:
                webbrowser.open('file://' + str(output_html))

            ####################################################################################################

            ## print closing text
            closing_text = "NMDS plots are found in: " + str(
                path_to_outdirs) + "/NMDS_plots/"
            sg.Popup(closing_text, title="Finished", keep_on_top=True)
            ## write log file
            from taxontabletools.create_log import ttt_log
            ttt_log("nmds analysis", "analysis", TaXon_table_xlsx.name,
                    output_pdf.name, meta_data_to_test, path_to_outdirs)
## Example #27
def basic_stats(TaXon_table_xlsx, heigth, width, path_to_outdirs, template, theme, font_size, taxonomic_level):
    """Calculate basic statistics for a TaXon table and plot them.

    Counts samples, OTUs, taxa per taxonomic level and database status
    entries, summarises sequence lengths and per-sample reads/OTUs/species,
    writes a three-panel bar chart (PDF + HTML) and an xlsx summary table,
    shows the results in a PySimpleGUI window and writes a log entry.

    Parameters
    ----------
    TaXon_table_xlsx : str or Path
        Path to the TaXon table xlsx file.
    heigth : int or str
        Plot height in pixels (original parameter spelling kept for
        backwards compatibility with callers).
    width : int or str
        Plot width in pixels.
    path_to_outdirs : str or Path
        Base output directory of the project.
    template : str
        Plotly template name.
    theme : tuple
        (marker color, marker line color, opacity) for the bar traces.
    font_size : int
        Font size used in the plot.
    taxonomic_level : str
        Unit label for the plot titles (e.g. "OTUs", "ESVs", "Species").
    """
    import webbrowser
    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
    TaXon_table_df = TaXon_table_df.replace(np.nan, 'nan', regex=True)

    ## unit label used in the plot titles
    ## BUGFIX: the original only assigned taxon_title when taxonomic_level was
    ## one of ["ASVs", "ESVs", "OTUs", "zOTUs"], raising a NameError for every
    ## other taxonomic level; the label is simply the level itself
    taxon_title = taxonomic_level

    color1 = theme[0]
    color2 = theme[1]
    opacity_value = theme[2]

    # sample columns start after the 10 leading taxonomy/metadata columns
    samples = TaXon_table_df.columns[10:].tolist()
    n_samples = len(samples)

    # total number of OTUs (one row per OTU)
    n_OTUs_total = len(TaXon_table_df['ID'].tolist())

    def _n_taxa(level):
        # number of unique, identified taxa at the given taxonomic level
        # ("nan" marks unidentified entries and is excluded)
        taxa = set(TaXon_table_df[level].tolist())
        taxa.discard("nan")
        return len(taxa)

    # number of taxa per taxon level
    n_Phyla = _n_taxa('Phylum')
    n_Classes = _n_taxa('Class')
    n_Orders = _n_taxa('Order')
    n_Families = _n_taxa('Family')
    n_Genera = _n_taxa('Genus')
    n_Species = _n_taxa('Species')

    # number of OTUs per database status entry
    status_list = TaXon_table_df['Status'].tolist()
    status_dict = {status: status_list.count(status) for status in set(status_list)}

    # sequence length statistics
    sequence_len_list = [len(sequence) for sequence in TaXon_table_df['seq'].tolist()]
    min_len_seq = min(sequence_len_list)
    max_len_seq = max(sequence_len_list)
    ## BUGFIX: average over ALL sequences; the original averaged over the set of
    ## unique lengths, which weights each distinct length equally regardless of
    ## how many sequences share it
    avg_len_seq = round(sum(sequence_len_list) / len(sequence_len_list))

    # per-sample read stats: [total reads, avg reads, n OTUs, n species]
    reads_dict = {}
    for sample in samples:
        reads_list = TaXon_table_df[sample].tolist()
        reads_sum = sum(reads_list)
        reads_avg = round(reads_sum / len(reads_list))
        # OTUs with at least one read in this sample
        n_OTUs = len([reads for reads in reads_list if reads != 0])
        # identified species with at least one read in this sample
        species_list = [
            species for species, reads
            in TaXon_table_df[["Species", sample]].values.tolist()
            if species != 'nan' and reads != 0
        ]
        n_species = len(set(species_list))
        reads_dict[sample] = [reads_sum, reads_avg, n_OTUs, n_species]

    # total reads across all samples
    read_sum_total = sum(stats[0] for stats in reads_dict.values())

    #####################################################################################
    # Plot reads, OTUs and OTUs-on-species-level per sample as stacked subplots

    reads = [i[0] for i in reads_dict.values()]
    otus = [i[2] for i in reads_dict.values()]
    species = [i[3] for i in reads_dict.values()]
    max_otus = max(otus) + 20  # headroom above the tallest OTU bar

    width, heigth = int(width), int(heigth)

    # create subplots
    y_title = "# " + taxon_title
    title_3 = taxon_title + " on species level"
    fig = make_subplots(rows=3, cols=1, subplot_titles=("Reads", taxon_title, title_3), vertical_spacing=0.05, shared_xaxes=True)
    # reads
    fig.add_trace(go.Bar(name="reads", x=samples, y=reads), row=1, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5, opacity=opacity_value, row=1, col=1)
    fig.update_yaxes(title_text="# reads", row=1, col=1)
    # OTUs
    fig.add_trace(go.Bar(name=taxon_title, x=samples, y=otus), row=2, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5, opacity=opacity_value, row=2, col=1)
    fig.update_yaxes(range=[0, max_otus], title_text=y_title, row=2, col=1)
    # OTUs on species level (same y-range as the OTU panel for comparability)
    fig.add_trace(go.Bar(name=title_3, x=samples, y=species), row=3, col=1)
    fig.update_traces(marker_color=color1, marker_line_color=color2, marker_line_width=1.5, opacity=opacity_value, row=3, col=1)
    fig.update_yaxes(range=[0, max_otus], title_text=y_title, row=3, col=1)
    # update the layout
    fig.update_layout(height=heigth, width=width, template=template, showlegend=False, font_size=font_size, title_font_size=font_size)

    ## write the figure files
    basic_stats_directory = Path(str(path_to_outdirs) + "/" + "Basic_stats" + "/" + TaXon_table_xlsx.stem)
    output_pdf = Path(str(basic_stats_directory) + "_basic_stats.pdf")
    output_html = Path(str(basic_stats_directory) + "_basic_stats.html")
    fig.write_image(str(output_pdf))
    fig.write_html(str(output_html))

    ## ask to show plot
    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
    if answer == "Yes":
        webbrowser.open('file://' + str(output_html))

    #####################################################################################
    ## assemble the two summary tables (general information + per-sample stats)
    output_list_1 = []
    output_list_2 = []

    output_list_1.append([' Samples', n_samples, ''])
    output_list_1.append([' ' + taxon_title, n_OTUs_total, ''])
    output_list_1.append(['Number of taxa per taxon level', '#', ''])
    output_list_1.append([' Phyla', n_Phyla, ''])
    output_list_1.append([' Classes', n_Classes, ''])
    output_list_1.append([' Orders', n_Orders, ''])
    output_list_1.append([' Families', n_Families, ''])
    output_list_1.append([' Genera', n_Genera, ''])
    output_list_1.append([' Species', n_Species, ''])
    output_list_1.append(['Database status', '#', ''])
    for status, count in status_dict.items():
        output_list_1.append([" " + status, count, ''])
    output_list_1.append(['Sequence length', '(bp)', ''])
    output_list_1.append([' Min', min_len_seq, ''])
    output_list_1.append([' Avg', avg_len_seq, ''])
    output_list_1.append([' Max', max_len_seq, ''])

    for sample, reads_stats in reads_dict.items():
        output_list_2.append([sample, reads_stats[1], reads_stats[0], reads_stats[2], reads_stats[3]])
    output_list_2.append(['Total reads', '', read_sum_total, '', ''])

    ## write both tables side by side to a single xlsx sheet
    df_1 = pd.DataFrame(output_list_1, columns=['Category', '#', ''])
    df_2 = pd.DataFrame(output_list_2, columns=["Sample", "avg reads", "total reads", "n OTUs", "n Species"])
    df_out = pd.concat([df_1, df_2], axis=1)
    df_out = df_out.replace(np.nan, '', regex=True)

    basic_stats_xlsx = Path(str(basic_stats_directory) + "_basic_stats.xlsx")
    df_out.to_excel(basic_stats_xlsx, index=False)

    ## display the general-information table in a simple window
    table_1 = [['Category', '#']] + df_1.values.tolist()
    table_layout_1 = [[sg.Text('    '.join(list(map(str, row))), size=(70, 1))] for row in table_1]
    layout = [[sg.TabGroup([[sg.Tab('General information', table_layout_1)]])], [sg.Button("Close", key="Close")]]

    window_basic_stats = sg.Window('Basic stats', layout, keep_on_top=True)

    while True:
        event, values = window_basic_stats.Read()
        if event is None or event == 'Close':
            window_basic_stats.close()
            break

    ## write to log file
    from taxontabletools.create_log import ttt_log
    ttt_log("basic stats", "analysis", TaXon_table_xlsx.name, basic_stats_xlsx.name, "nan", path_to_outdirs)
## Example #28
def gbif_occurrence(TaXon_table_xlsx, width, height, continents_to_check,
                    template, theme, font_size, path_to_outdirs):
    """Query the GBIF occurrence API for every species in a TaXon table
    and plot the per-country occurrence counts.

    For each identified species, the number of GBIF occurrence records is
    requested for every country on the selected continents (one HTTP
    request per species/country pair). Countries with zero occurrences
    across all species are dropped, the counts are converted to per-taxon
    percentages, and the result is written as a stacked bar chart
    (PDF + HTML) plus an xlsx file with a 'relative' and an 'absolute'
    sheet. A PySimpleGUI progress bar is shown; cancelling it raises
    RuntimeError to abort the run.

    Parameters:
        TaXon_table_xlsx: path to the TaXon table xlsx file.
        width, height: plot dimensions in pixels.
        continents_to_check: list of continent names to query.
        template: plotly template name.
        theme: not referenced in this function body.
            # NOTE(review): presumably kept for a uniform call signature — confirm.
        font_size: font size for the plot.
        path_to_outdirs: base output directory of the project.
    """

    import requests_html, json
    import PySimpleGUI as sg
    import pandas as pd
    from pandas import DataFrame
    import numpy as np
    from pathlib import Path
    import plotly.graph_objects as go
    import os, webbrowser

    ## dictionary with all country codes of the Earth
    ## maps country name -> [ISO 3166-1 alpha-2 code, continent]
    country_codes_dict = {
        'Andorra': ['AD', 'Europe'],
        'United Arab Emirates': ['AE', 'Asia'],
        'Afghanistan': ['AF', 'Asia'],
        'Antigua and Barbuda': ['AG', 'North America'],
        'Anguilla': ['AI', 'North America'],
        'Albania': ['AL', 'Europe'],
        'Armenia': ['AM', 'Asia'],
        'Angola': ['AO', 'Africa'],
        'Antarctica': ['AQ', 'Antarctica'],
        'Argentina': ['AR', 'South America'],
        'American Samoa': ['AS', 'Oceania'],
        'Austria': ['AT', 'Europe'],
        'Australia': ['AU', 'Oceania'],
        'Aruba': ['AW', 'North America'],
        'Åland Islands': ['AX', 'Europe'],
        'Azerbaijan': ['AZ', 'Asia'],
        'Bosnia and Herzegovina': ['BA', 'Europe'],
        'Barbados': ['BB', 'North America'],
        'Bangladesh': ['BD', 'Asia'],
        'Belgium': ['BE', 'Europe'],
        'Burkina Faso': ['BF', 'Africa'],
        'Bulgaria': ['BG', 'Europe'],
        'Bahrain': ['BH', 'Asia'],
        'Burundi': ['BI', 'Africa'],
        'Benin': ['BJ', 'Africa'],
        'Saint Barthélemy': ['BL', 'North America'],
        'Bermuda': ['BM', 'North America'],
        'Brunei Darussalam': ['BN', 'Asia'],
        'Bolivia': ['BO', 'South America'],
        'Bonaire, Sint Eustatius and Saba': ['BQ', 'North America'],
        'Brazil': ['BR', 'South America'],
        'Bahamas': ['BS', 'North America'],
        'Bhutan': ['BT', 'Asia'],
        'Bouvet Island': ['BV', 'Antarctica'],
        'Botswana': ['BW', 'Africa'],
        'Belarus': ['BY', 'Europe'],
        'Belize': ['BZ', 'North America'],
        'Canada': ['CA', 'North America'],
        'Cocos (Keeling) Islands': ['CC', 'Asia'],
        'Congo (Democratic Republic)': ['CD', 'Africa'],
        'Central African Republic': ['CF', 'Africa'],
        'Congo': ['CG', 'Africa'],
        'Switzerland': ['CH', 'Europe'],
        "Côte d'Ivoire": ['CI', 'Africa'],
        'Cook Islands': ['CK', 'Oceania'],
        'Chile': ['CL', 'South America'],
        'Cameroon': ['CM', 'Africa'],
        'China': ['CN', 'Asia'],
        'Colombia': ['CO', 'South America'],
        'Costa Rica': ['CR', 'North America'],
        'Cuba': ['CU', 'North America'],
        'Cabo Verde': ['CV', 'Africa'],
        'Curaçao': ['CW', 'North America'],
        'Christmas Island': ['CX', 'Asia'],
        'Cyprus': ['CY', 'Asia'],
        'Czechia': ['CZ', 'Europe'],
        'Germany': ['DE', 'Europe'],
        'Djibouti': ['DJ', 'Africa'],
        'Denmark': ['DK', 'Europe'],
        'Dominica': ['DM', 'North America'],
        'Dominican Republic': ['DO', 'North America'],
        'Algeria': ['DZ', 'Africa'],
        'Ecuador': ['EC', 'South America'],
        'Estonia': ['EE', 'Europe'],
        'Egypt': ['EG', 'Africa'],
        'Western Sahara': ['EH', 'Africa'],
        'Eritrea': ['ER', 'Africa'],
        'Spain': ['ES', 'Europe'],
        'Ethiopia': ['ET', 'Africa'],
        'Finland': ['FI', 'Europe'],
        'Fiji': ['FJ', 'Oceania'],
        'Falkland Islands': ['FK', 'South America'],
        'Micronesia': ['FM', 'Oceania'],
        'Faroe Islands': ['FO', 'Europe'],
        'France': ['FR', 'Europe'],
        'Gabon': ['GA', 'Africa'],
        'United Kingdom': ['GB', 'Europe'],
        'Grenada': ['GD', 'North America'],
        'Georgia': ['GE', 'Asia'],
        'French Guiana': ['GF', 'South America'],
        'Guernsey': ['GG', 'Europe'],
        'Ghana': ['GH', 'Africa'],
        'Gibraltar': ['GI', 'Europe'],
        'Greenland': ['GL', 'North America'],
        'Gambia': ['GM', 'Africa'],
        'Guinea': ['GN', 'Africa'],
        'Guadeloupe': ['GP', 'North America'],
        'Equatorial Guinea': ['GQ', 'Africa'],
        'Greece': ['GR', 'Europe'],
        'South Georgia and the South Sandwich Islands': ['GS', 'Antarctica'],
        'Guatemala': ['GT', 'North America'],
        'Guam': ['GU', 'Oceania'],
        'Guinea-Bissau': ['GW', 'Africa'],
        'Guyana': ['GY', 'South America'],
        'Hong Kong': ['HK', 'Asia'],
        'Heard Island and McDonald Islands': ['HM', 'Antarctica'],
        'Honduras': ['HN', 'North America'],
        'Croatia': ['HR', 'Europe'],
        'Haiti': ['HT', 'North America'],
        'Hungary': ['HU', 'Europe'],
        'Indonesia': ['ID', 'Asia'],
        'Ireland': ['IE', 'Europe'],
        'Israel': ['IL', 'Asia'],
        'Isle of Man': ['IM', 'Europe'],
        'India': ['IN', 'Asia'],
        'British Indian Ocean Territory': ['IO', 'Asia'],
        'Iraq': ['IQ', 'Asia'],
        'Iran': ['IR', 'Asia'],
        'Iceland': ['IS', 'Europe'],
        'Italy': ['IT', 'Europe'],
        'Jersey': ['JE', 'Europe'],
        'Jamaica': ['JM', 'North America'],
        'Jordan': ['JO', 'Asia'],
        'Japan': ['JP', 'Asia'],
        'Kenya': ['KE', 'Africa'],
        'Kyrgyzstan': ['KG', 'Asia'],
        'Cambodia': ['KH', 'Asia'],
        'Kiribati': ['KI', 'Oceania'],
        'Comoros': ['KM', 'Africa'],
        'Saint Kitts and Nevis': ['KN', 'North America'],
        "Korea (Democratic People's Republic)": ['KP', 'Asia'],
        'Korea (Republic)': ['KR', 'Asia'],
        'Kuwait': ['KW', 'Asia'],
        'Cayman Islands': ['KY', 'North America'],
        'Kazakhstan': ['KZ', 'Asia'],
        "Lao People's Democratic Republic": ['LA', 'Asia'],
        'Lebanon': ['LB', 'Asia'],
        'Saint Lucia': ['LC', 'North America'],
        'Liechtenstein': ['LI', 'Europe'],
        'Sri Lanka': ['LK', 'Asia'],
        'Liberia': ['LR', 'Africa'],
        'Lesotho': ['LS', 'Africa'],
        'Lithuania': ['LT', 'Europe'],
        'Luxembourg': ['LU', 'Europe'],
        'Latvia': ['LV', 'Europe'],
        'Libya': ['LY', 'Africa'],
        'Morocco': ['MA', 'Africa'],
        'Monaco': ['MC', 'Europe'],
        'Moldova (the Republic of)': ['MD', 'Europe'],
        'Montenegro': ['ME', 'Europe'],
        'Saint Martin (French part)': ['MF', 'North America'],
        'Madagascar': ['MG', 'Africa'],
        'Marshall Islands': ['MH', 'Oceania'],
        'Republic of North Macedonia': ['MK', 'Europe'],
        'Mali': ['ML', 'Africa'],
        'Myanmar': ['MM', 'Asia'],
        'Mongolia': ['MN', 'Asia'],
        'Macao': ['MO', 'Asia'],
        'Northern Mariana Islands': ['MP', 'Oceania'],
        'Martinique': ['MQ', 'North America'],
        'Mauritania': ['MR', 'Africa'],
        'Montserrat': ['MS', 'North America'],
        'Malta': ['MT', 'Europe'],
        'Mauritius': ['MU', 'Africa'],
        'Maldives': ['MV', 'Asia'],
        'Malawi': ['MW', 'Africa'],
        'Mexico': ['MX', 'North America'],
        'Malaysia': ['MY', 'Asia'],
        'Mozambique': ['MZ', 'Africa'],
        'Namibia': ['NA', 'Africa'],
        'New Caledonia': ['NC', 'Oceania'],
        'Niger': ['NE', 'Africa'],
        'Norfolk Island': ['NF', 'Oceania'],
        'Nigeria': ['NG', 'Africa'],
        'Nicaragua': ['NI', 'North America'],
        'Netherlands': ['NL', 'Europe'],
        'Norway': ['NO', 'Europe'],
        'Nepal': ['NP', 'Asia'],
        'Nauru': ['NR', 'Oceania'],
        'Niue': ['NU', 'Oceania'],
        'New Zealand': ['NZ', 'Oceania'],
        'Oman': ['OM', 'Asia'],
        'Panama': ['PA', 'North America'],
        'Peru': ['PE', 'South America'],
        'French Polynesia': ['PF', 'Oceania'],
        'Papua New Guinea': ['PG', 'Oceania'],
        'Philippines': ['PH', 'Asia'],
        'Pakistan': ['PK', 'Asia'],
        'Poland': ['PL', 'Europe'],
        'Saint Pierre and Miquelon': ['PM', 'North America'],
        'Pitcairn': ['PN', 'Oceania'],
        'Puerto Rico': ['PR', 'North America'],
        'Palestine, State of': ['PS', 'Asia'],
        'Portugal': ['PT', 'Europe'],
        'Palau': ['PW', 'Oceania'],
        'Paraguay': ['PY', 'South America'],
        'Qatar': ['QA', 'Asia'],
        'Réunion': ['RE', 'Africa'],
        'Romania': ['RO', 'Europe'],
        'Serbia': ['RS', 'Europe'],
        'Russian Federation': ['RU', 'Europe'],
        'Rwanda': ['RW', 'Africa'],
        'Saudi Arabia': ['SA', 'Asia'],
        'Solomon Islands': ['SB', 'Oceania'],
        'Seychelles': ['SC', 'Africa'],
        'Sudan': ['SD', 'Africa'],
        'Sweden': ['SE', 'Europe'],
        'Singapore': ['SG', 'Asia'],
        'Saint Helena, Ascension and Tristan da Cunha': ['SH', 'Africa'],
        'Slovenia': ['SI', 'Europe'],
        'Svalbard and Jan Mayen': ['SJ', 'Europe'],
        'Slovakia': ['SK', 'Europe'],
        'Sierra Leone': ['SL', 'Africa'],
        'San Marino': ['SM', 'Europe'],
        'Senegal': ['SN', 'Africa'],
        'Somalia': ['SO', 'Africa'],
        'Suriname': ['SR', 'South America'],
        'South Sudan': ['SS', 'Africa'],
        'Sao Tome and Principe': ['ST', 'Africa'],
        'El Salvador': ['SV', 'North America'],
        'Syrian Arab Republic': ['SY', 'Asia'],
        'Eswatini': ['SZ', 'Africa'],
        'Turks and Caicos Islands': ['TC', 'North America'],
        'Chad': ['TD', 'Africa'],
        'French Southern Territories': ['TF', 'Antarctica'],
        'Togo': ['TG', 'Africa'],
        'Thailand': ['TH', 'Asia'],
        'Tajikistan': ['TJ', 'Asia'],
        'Tokelau': ['TK', 'Oceania'],
        'Timor-Leste': ['TL', 'Asia'],
        'Turkmenistan': ['TM', 'Asia'],
        'Tunisia': ['TN', 'Africa'],
        'Tonga': ['TO', 'Oceania'],
        'Turkey': ['TR', 'Europe'],
        'Trinidad and Tobago': ['TT', 'North America'],
        'Tuvalu': ['TV', 'Oceania'],
        'Taiwan': ['TW', 'Asia'],
        'Tanzania': ['TZ', 'Africa'],
        'Ukraine': ['UA', 'Europe'],
        'Uganda': ['UG', 'Africa'],
        'United States Minor Outlying Islands': ['UM', 'Oceania'],
        'United States of America': ['US', 'North America'],
        'Uruguay': ['UY', 'South America'],
        'Uzbekistan': ['UZ', 'Asia'],
        'Holy See': ['VA', 'Europe'],
        'Saint Vincent and the Grenadines': ['VC', 'North America'],
        'Venezuela (Bolivarian Republic of)': ['VE', 'South America'],
        'Virgin Islands (British)': ['VG', 'North America'],
        'Virgin Islands (U.S.)': ['VI', 'North America'],
        'Viet Nam': ['VN', 'Asia'],
        'Vanuatu': ['VU', 'Oceania'],
        'Wallis and Futuna': ['WF', 'Oceania'],
        'Samoa': ['WS', 'Oceania'],
        'Yemen': ['YE', 'Asia'],
        'Mayotte': ['YT', 'Africa'],
        'South Africa': ['ZA', 'Africa'],
        'Zambia': ['ZM', 'Africa'],
        'Zimbabwe': ['ZW', 'Africa']
    }

    ## load Taxon table
    #TaXon_table_xlsx = "/Users/tillmacher/Desktop/Projects/TTT_Projects/Projects/Sicliy_MZB/TaXon_tables/Sicily_eDNA_MZB_taxon_table_renamed_cons_derep_no_match_excluded_blanks_excluded_species.xlsx"
    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx).fillna("nan")
    ## get a unique list of taxa ("nan" marks unidentified species)
    taxa = set([
        taxon for taxon in TaXon_table_df["Species"].values.tolist()
        if taxon != "nan"
    ])
    ## create a dataframe to store the results
    ## select only countries for the selected continents
    selected_countries_list = [
        country for country, values in country_codes_dict.items()
        if values[1] in continents_to_check
    ]
    occurrence_df = pd.DataFrame(selected_countries_list, columns=["Country"])

    ## calculate runtime: one HTTP request per species/country pair
    ## NOTE(review): 0.15 s per request looks like an empirical estimate — confirm
    n_countries = len(selected_countries_list)
    n_species = len(taxa)
    t_single_request = 0.15
    t_total = round(t_single_request * n_species * n_countries / 60, 1)
    ## ask to continue the script
    answer = sg.PopupOKCancel("This will take roughly " + str(t_total) +
                              " minutes. Continue?",
                              title="Runtime")

    if answer == 'OK':

        ## create a subfolder for better sorting and overview
        dirName = Path(
            str(path_to_outdirs) + "/" + "Occurrence_analysis" + "/" +
            TaXon_table_xlsx.stem + "/")
        if not os.path.exists(dirName):
            os.mkdir(dirName)

        ############################################################################
        ## create the progress bar window (advances once per taxon)
        layout = [[sg.Text('Progress bar')],
                  [
                      sg.ProgressBar(1000,
                                     orientation='h',
                                     size=(20, 20),
                                     key='progressbar')
                  ], [sg.Cancel()]]
        window_progress_bar = sg.Window('Progress bar',
                                        layout,
                                        keep_on_top=True)
        progress_bar = window_progress_bar['progressbar']
        progress_update = 0
        progress_increase = 1000 / len(taxa) + 1
        ############################################################################

        ############################################################################
        ## poll the window once so a Cancel press aborts before any request is sent
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel' or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += 0
        progress_bar.UpdateBar(progress_update)
        ############################################################################

        ## request gbif for the occurrence data
        ## one GET per taxon/country; the JSON "count" field holds the
        ## total number of occurrence records for that combination
        for taxon_name in taxa:
            occurrence_list = []
            for country, values in country_codes_dict.items():
                country_code = values[0]
                continent = values[1]
                ## only check selected continents to reduce runtime
                if continent in continents_to_check:
                    ## create an html session
                    with requests_html.HTMLSession() as session:
                        ## generate html request name (URL-encode the space)
                        request_name = '%20'.join(taxon_name.split(' '))
                        ## request that name
                        r = session.get(
                            "https://api.gbif.org/v1/occurrence/search?scientificName="
                            + request_name + "&country=" + country_code)
                        ## parse json
                        res = json.loads(r.text)
                        ## get number of occurrences
                        occurrence_list.append(res["count"])
            ## store the results in the dataframe
            occurrence_df[taxon_name] = occurrence_list

            ############################################################################
            ## poll the window again so Cancel can abort between taxa
            event, values = window_progress_bar.read(timeout=10)
            if event == 'Cancel' or event is None:
                window_progress_bar.Close()
                raise RuntimeError
            # update bar with loop value +1 so that bar eventually reaches the maximum
            progress_update += progress_increase
            progress_bar.UpdateBar(progress_update)
            ############################################################################

        window_progress_bar.Close()

        ## remove countries that have 0 hits across all taxa
        occurrence_df_filtered_list = []
        for row in occurrence_df.values.tolist():
            occurrences = set(row[1:])
            if occurrences != {0}:
                occurrence_df_filtered_list.append(row)

        ## create a dataframe with relative values
        occurrence_df_filtered_relative = pd.DataFrame(
            occurrence_df_filtered_list)
        occurrence_df_filtered_relative.columns = occurrence_df.columns.tolist(
        )

        ## create a dataframe with absolute values
        occurrence_df_filtered_absolute = pd.DataFrame(
            occurrence_df_filtered_list)
        occurrence_df_filtered_absolute.columns = occurrence_df.columns.tolist(
        )

        ## convert dataframe to relative occurrence abundance
        ## (each taxon column becomes its percentage share per country)
        ## NOTE(review): if a taxon has 0 occurrences in every remaining
        ## country, df[taxon].sum() is 0 and this division yields NaN — confirm
        ## whether upstream filtering makes that impossible
        for taxon in taxa:
            df = occurrence_df_filtered_relative[["Country", taxon]]
            df_2 = df[taxon] / df[taxon].sum()
            df = df.assign(perc=df_2.values * 100)
            df = df.drop([taxon], axis=1)
            df = df.rename(columns={"perc": taxon})
            occurrence_df_filtered_relative[taxon] = df[taxon]

        ## stacked bar chart: one trace per country, one bar per taxon
        fig = go.Figure()
        for row in occurrence_df_filtered_relative.values.tolist():
            occurrences = row[1:]
            country = row[0]
            fig.add_trace(
                go.Bar(x=list(taxa),
                       y=list(occurrences),
                       text=country,
                       name=country,
                       textposition='auto'))
        fig.update_layout(barmode='stack',
                          width=int(width),
                          height=int(height),
                          template=template,
                          font_size=font_size,
                          title_font_size=font_size)
        fig.update_yaxes(title="GBIF occurrence references (%)")

        ## count, per taxon, the number of countries with at least one hit
        n_occurrences = []
        for taxon in taxa:
            n_occurrences.append(
                len([
                    value for value in
                    occurrence_df_filtered_relative[taxon].values.tolist()
                    if value != 0
                ]))

        ## write the country counts as text labels just above the 100% line
        fig.add_trace(
            go.Scatter(x=list(taxa),
                       y=[105] * len(taxa),
                       text=n_occurrences,
                       name="countries",
                       mode="text"))

        ## define output files
        output_pdf = Path(
            str(dirName) + "/" + '_'.join(continents_to_check) + ".pdf")
        output_html = Path(
            str(dirName) + "/" + '_'.join(continents_to_check) + ".html")
        output_xlsx = Path(
            str(dirName) + "/" + '_'.join(continents_to_check) + ".xlsx")

        ## write to different sheets, one for absolute data, one for relative
        with pd.ExcelWriter(output_xlsx) as writer:
            occurrence_df_filtered_relative.to_excel(writer,
                                                     sheet_name='relative',
                                                     index=False)
            occurrence_df_filtered_absolute.to_excel(writer,
                                                     sheet_name='absolute',
                                                     index=False)

        ## write figures
        fig.write_image(str(output_pdf))
        fig.write_html(str(output_html))

        ## ask to show file
        answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
        if answer == "Yes":
            webbrowser.open('file://' + str(output_html))

        ## print closing text
        closing_text = "GBIF occurrence plots and tables are found under: " + str(
            path_to_outdirs) + "/Occurrence_analysis/"
        sg.Popup(closing_text, title="Finished", keep_on_top=True)

        ## write log file
        ## NOTE(review): other functions in this file pass "nan" as the
        ## metadata field; this one passes "" — confirm intended
        from taxontabletools.create_log import ttt_log
        ttt_log("occurrence analysis", "analysis", TaXon_table_xlsx.name,
                output_pdf.name, "", path_to_outdirs)
## Example #29
def PCoA_analysis(TaXon_table_xlsx, meta_data_to_test, taxonomic_level, width,
                  height, pcoa_s, path_to_outdirs, template, font_size,
                  color_discrete_sequence, pcoa_dissimilarity):
    """Run a principal coordinate analysis (PCoA) on a TaXon table and plot it.

    Computes a beta-diversity distance matrix (``pcoa_dissimilarity`` metric)
    across samples, ordinates it with scikit-bio's PCoA, reports an ANOSIM
    test for the chosen metadata category, then opens a PySimpleGUI window
    that lets the user pick 2 or 3 PCoA axes to plot with plotly. Writes
    .pdf/.html figures and an .xlsx of the plotted coordinates to
    ``<path_to_outdirs>/PCoA_plots/<table stem>/`` and appends a log entry.

    Parameters mirror the TTT GUI: paths and plot styling are passed through
    unchanged; ``meta_data_to_test`` selects the metadata column,
    ``taxonomic_level`` the aggregation level.

    Raises:
        RuntimeError: if the metadata column is unique per sample or constant
            across all samples (ANOSIM would be meaningless).
    """
    import pandas as pd
    import numpy as np
    from skbio.diversity import beta_diversity
    from skbio.stats.ordination import pcoa
    from skbio.stats.distance import anosim
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    import plotly.express as px
    from pathlib import Path
    import PySimpleGUI as sg
    import os, webbrowser
    from itertools import combinations

    TaXon_table_xlsx = Path(TaXon_table_xlsx)
    Meta_data_table_xlsx = Path(
        str(path_to_outdirs) + "/" + "Meta_data_table" + "/" +
        TaXon_table_xlsx.stem + "_metadata.xlsx")
    TaXon_table_df = pd.read_excel(TaXon_table_xlsx,
                                   header=0).fillna("unidentified")
    ## sample read columns start after the ten taxonomy/metadata columns
    TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
    Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx,
                                       header=0).fillna("nan")
    Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()
    metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)

    ## drop samples with metadata called nan (= empty)
    drop_samples = [
        i[0] for i in Meta_data_table_df.values.tolist()
        if i[metadata_loc] == "nan"
    ]

    if drop_samples != []:
        ## filter the TaXon table
        TaXon_table_df = TaXon_table_df.drop(drop_samples, axis=1)
        TaXon_table_samples = TaXon_table_df.columns.tolist()[10:]
        ## also remove empty OTUs (rows whose remaining read counts are all 0)
        row_filter_list = []
        for row in TaXon_table_df.values.tolist():
            reads = set(row[10:])
            if reads != {0}:
                row_filter_list.append(row)
        columns = TaXon_table_df.columns.tolist()
        TaXon_table_df = pd.DataFrame(row_filter_list, columns=columns)
        Meta_data_table_df = pd.DataFrame(
            [
                i for i in Meta_data_table_df.values.tolist()
                if i[0] not in drop_samples
            ],
            columns=Meta_data_table_df.columns.tolist())
        Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()

    ## create a y axis title text
    taxon_title = taxonomic_level.lower()

    ## adjust taxonomic level if neccessary: sequence-unit levels all map to
    ## the "ID" column of the TaXon table
    if taxonomic_level in ["ASVs", "ESVs", "OTUs", "zOTUs"]:
        taxon_title = taxonomic_level
        taxonomic_level = "ID"

    # check if the meta data differs: unique-per-sample metadata gives every
    # sample its own group, which makes ANOSIM impossible
    if len(set(Meta_data_table_df[meta_data_to_test])) == len(
            Meta_data_table_df['Samples'].tolist()):
        sg.Popup(
            "The meta data is unique for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    # check if the meta data differs: a single constant group is equally
    # untestable
    if len(set(Meta_data_table_df[meta_data_to_test])) == 1:
        sg.Popup(
            "The meta data is similar for all samples. Please adjust the meta data table!",
            title=("Error"))
        raise RuntimeError

    if sorted(TaXon_table_samples) == sorted(Meta_data_table_samples):

        samples = Meta_data_table_samples

        ## extract the relevant data
        TaXon_table_df = TaXon_table_df[[taxonomic_level] + samples]
        ## define an aggregation function to combine multiple hit of one taxonimic level
        aggregation_functions = {}
        ## define samples functions
        for sample in samples:
            ## 'sum' will calculate the sum of p/a data
            aggregation_functions[sample] = 'sum'
        ## define taxon level function
        aggregation_functions[taxonomic_level] = 'first'
        ## create condensed dataframe
        TaXon_table_df = TaXon_table_df.groupby(
            TaXon_table_df[taxonomic_level]).aggregate(aggregation_functions)
        if 'unidentified' in TaXon_table_df.index:
            TaXon_table_df = TaXon_table_df.drop('unidentified')

        ## samples x taxa matrix -> distance matrix -> ordination
        data = TaXon_table_df[samples].transpose().values.tolist()
        jc_dm = beta_diversity(pcoa_dissimilarity, data, samples)
        ordination_result = pcoa(jc_dm)
        metadata_list = Meta_data_table_df[meta_data_to_test].values.tolist()

        anosim_results = anosim(jc_dm, metadata_list, permutations=999)
        anosim_r = round(anosim_results['test statistic'], 5)
        anosim_p = anosim_results['p-value']
        textbox = meta_data_to_test + ", " + taxon_title + "<br>Anosim " + "R = " + str(
            anosim_r) + " " + "p = " + str(anosim_p)

        #######################################################################################
        # create window to ask for PCoA axis to test
        def slices(items, chunk_size):
            ## yield successive chunks of `items`; parameters renamed so the
            ## builtins `list` and `slice` are not shadowed
            for i in range(0, len(items), chunk_size):
                yield items[i:i + chunk_size]

        # collect the PCoA proportion explained values (only axes that explain
        # at least 1 % variance are offered in the GUI)
        proportion_explained_list = []
        for i, pcoa_axis in enumerate(ordination_result.proportion_explained):
            if round(pcoa_axis * 100, 2) >= 1:
                proportion_explained_list.append("PC" + str(i + 1) + " (" +
                                                 str(round(pcoa_axis *
                                                           100, 2)) + " %)")

        pcoa_axis_checkboxes = list(
            slices([
                sg.Checkbox(name, key=name, size=(15, 1))
                for name in proportion_explained_list
            ], 10))

        pcoa_window_layout = [
            [sg.Text('Check up to four axes to be displayed')],
            [sg.Frame(layout=pcoa_axis_checkboxes, title='')],
            [sg.Text('Only axes >= 1 % explained variance are shown')],
            [sg.CB("Connect categories", default=True, key="draw_mesh")],
            [sg.Text('')],
            [sg.Button('Plot', key='Plot')],
            [sg.Button('Back')],
        ]

        pcoa_window = sg.Window('PCoA axis',
                                pcoa_window_layout,
                                keep_on_top=True)

        while True:
            event, values = pcoa_window.read()

            if event is None or event == 'Back':
                break

            ## read the checkbox state only AFTER the close/back check:
            ## `values` is None when the window is closed via the title bar,
            ## and subscripting it would raise a TypeError
            draw_mesh = values["draw_mesh"]

            if event == 'Plot':

                ## create a subfolder for better sorting and overview
                dirName = Path(
                    str(path_to_outdirs) + "/" + "PCoA_plots" + "/" +
                    TaXon_table_xlsx.stem + "/")
                if not os.path.exists(dirName):
                    os.mkdir(dirName)

                # collect the pcoa axis values
                axis_to_plot = [
                    key for key, value in values.items()
                    if value == True and "PC" in key
                ]
                # pass on only if two pcoa axes were checked
                if len(axis_to_plot) == 2:
                    cat1 = axis_to_plot[1].split()[0]
                    cat2 = axis_to_plot[0].split()[0]

                    ## .copy() avoids pandas SettingWithCopyWarning on insert
                    df_pcoa = ordination_result.samples[[cat1, cat2]].copy()
                    df_pcoa.insert(
                        2, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        3, "Samples",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    if draw_mesh == True:
                        combinations_list = []
                        ## NOTE(review): this iterates once per ROW, so each
                        ## metadata group's combinations are appended multiple
                        ## times; the plot is unchanged but the helper df holds
                        ## duplicate segments — consider set(df_pcoa["Metadata"])
                        for metadata in df_pcoa["Metadata"]:
                            ## collect all entries for the respective metadata
                            arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][
                                [cat1, cat2, "Metadata",
                                 "Samples"]].to_numpy()
                            ## create a df for all possible combinations using itertools combinations
                            for entry in list(combinations(arr, 2)):
                                combinations_list.append(list(entry[0]))
                                combinations_list.append(list(entry[1]))
                        ## create a dataframe to draw the plot from
                        df = pd.DataFrame(combinations_list)
                        df.columns = [cat1, cat2, "Metadata", "Samples"]

                        fig = px.scatter(
                            df,
                            x=cat1,
                            y=cat2,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers+lines")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_xaxes(title=axis_to_plot[1])
                        fig.update_yaxes(title=axis_to_plot[0])

                    else:
                        fig = px.scatter(
                            df_pcoa,
                            x=cat1,
                            y=cat2,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_xaxes(title=axis_to_plot[1])
                        fig.update_yaxes(title=axis_to_plot[0])

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".html")
                    output_xlsx = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + ".xlsx")

                    ## write files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))
                    ordination_result.samples[[cat1,
                                               cat2]].to_excel(output_xlsx)

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "\n" + "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write to log
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break

                elif len(axis_to_plot) == 3:
                    cat1 = axis_to_plot[0].split()[0]
                    cat2 = axis_to_plot[1].split()[0]
                    cat3 = axis_to_plot[2].split()[0]

                    ## .copy() avoids pandas SettingWithCopyWarning on insert
                    df_pcoa = ordination_result.samples[[cat1, cat2,
                                                         cat3]].copy()
                    df_pcoa.insert(
                        3, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        4, "Samples",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    ## check if lines are to be drawn between the dots
                    if draw_mesh == True:
                        combinations_list = []
                        for metadata in df_pcoa["Metadata"]:
                            ## collect all entries for the respective metadata
                            arr = df_pcoa.loc[df_pcoa['Metadata'] == metadata][
                                [cat1, cat2, cat3, "Metadata",
                                 "Samples"]].to_numpy()
                            ## create a df for all possible combinations using itertools combinations
                            for entry in list(combinations(arr, 2)):
                                combinations_list.append(list(entry[0]))
                                combinations_list.append(list(entry[1]))
                        ## create a dataframe to draw the plot from
                        df = pd.DataFrame(combinations_list)
                        df.columns = [cat1, cat2, cat3, "Metadata", "Samples"]
                        ## draw the plot
                        fig = px.scatter_3d(
                            df,
                            x=cat1,
                            y=cat2,
                            z=cat3,
                            color="Metadata",
                            text="Samples",
                            title=textbox,
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers+lines",
                                          line=dict(width=0.5))
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          title=textbox,
                                          showlegend=True,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_layout(
                            scene=dict(xaxis_title=axis_to_plot[0],
                                       yaxis_title=axis_to_plot[1],
                                       zaxis_title=axis_to_plot[2]))
                    else:
                        fig = px.scatter_3d(
                            df_pcoa,
                            x=cat1,
                            y=cat2,
                            z=cat3,
                            color="Metadata",
                            text="Samples",
                            color_discrete_sequence=color_discrete_sequence)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_layout(height=int(height),
                                          width=int(width),
                                          template=template,
                                          showlegend=True,
                                          title=textbox,
                                          font_size=font_size,
                                          title_font_size=font_size)
                        fig.update_layout(
                            scene=dict(xaxis_title=axis_to_plot[0],
                                       yaxis_title=axis_to_plot[1],
                                       zaxis_title=axis_to_plot[2]))

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.html")
                    output_xlsx = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_3d.xlsx")

                    ## write output files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))
                    ordination_result.samples[[cat1,
                                               cat2]].to_excel(output_xlsx)

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write log file
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break

                else:
                    sg.Popup("Please choose not more than 3 PCoA axes",
                             title="Error",
                             keep_on_top=True)

            ## NOTE(review): no 'Plot matrix' button exists in
            ## pcoa_window_layout, so this branch is currently unreachable;
            ## kept for a future button — confirm whether it should be wired up
            if event == 'Plot matrix':
                if len(proportion_explained_list) >= 4:

                    ## create a subfolder for better sorting and overview
                    dirName = Path(
                        str(path_to_outdirs) + "/" + "PCoA_plots" + "/" +
                        TaXon_table_xlsx.stem + "/")
                    if not os.path.exists(dirName):
                        os.mkdir(dirName)

                    ## .copy() avoids pandas SettingWithCopyWarning on insert
                    df_pcoa = ordination_result.samples[[
                        "PC1", "PC2", "PC3", "PC4"
                    ]].copy()
                    df_pcoa.insert(
                        4, "Metadata",
                        Meta_data_table_df[meta_data_to_test].values.tolist(),
                        True)
                    df_pcoa.insert(
                        5, "Sample",
                        Meta_data_table_df["Samples"].values.tolist(), True)

                    ## 4x4 grid: diagonal cells hold the axis label, the upper
                    ## triangle holds the pairwise PC scatter plots
                    fig = make_subplots(rows=4, cols=4)
                    ########### 1 ###########
                    fig.add_trace(go.Scatter(), row=1, col=1)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC1 (" + str(
                        round(
                            ordination_result.proportion_explained["PC1"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text, showarrow=False)
                    fig.update_xaxes(showticklabels=False, showgrid=False)
                    fig.update_yaxes(showticklabels=False, showgrid=False)
                    ########### 2 ###########
                    df = df_pcoa[["PC1", "PC2", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC2"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=2)
                    ########### 3 ###########
                    df = df_pcoa[["PC1", "PC3", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC3"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=3)
                    ########### 4 ###########
                    df = df_pcoa[["PC1", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC1"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=1,
                                      col=4)
                        fig.update_traces(marker_size=int(pcoa_s),
                                          mode="markers")
                        fig.update_xaxes(showgrid=False, row=1, col=4)
                        fig.update_yaxes(showgrid=False, row=1, col=4)
                    ########### 5 ###########
                    fig.add_trace(go.Scatter(), row=2, col=2)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC2 (" + str(
                        round(
                            ordination_result.proportion_explained["PC2"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=2,
                                       col=2)
                    ########### 6 ###########
                    df = df_pcoa[["PC2", "PC3", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC2"].values.tolist(),
                            y=df_metadata["PC3"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=2,
                                      col=3)
                    ########### 7 ###########
                    df = df_pcoa[["PC2", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC2"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=2,
                                      col=4)
                    ########### 8 ###########
                    fig.add_trace(go.Scatter(), row=3, col=3)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC3 (" + str(
                        round(
                            ordination_result.proportion_explained["PC3"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=3,
                                       col=3)
                    ########### 9 ###########
                    df = df_pcoa[["PC3", "PC4", "Metadata", "Sample"]]
                    for metadata in set(metadata_list):
                        df_metadata = df[df['Metadata'] == metadata]
                        fig.add_trace(go.Scatter(
                            x=df_metadata["PC3"].values.tolist(),
                            y=df_metadata["PC4"].values.tolist(),
                            mode='markers',
                            name=metadata,
                            showlegend=False,
                            text=df_metadata["Sample"].values.tolist()),
                                      row=3,
                                      col=4)
                    ########### 10 ###########
                    fig.add_trace(go.Scatter(), row=4, col=4)
                    fig.update_layout(template=template,
                                      font_size=font_size,
                                      title_font_size=font_size)
                    text = "PC4 (" + str(
                        round(
                            ordination_result.proportion_explained["PC4"] *
                            100, 2)) + " %)"
                    fig.add_annotation(text=text,
                                       showarrow=False,
                                       row=4,
                                       col=4)

                    ######################
                    fig.update_xaxes(showline=True,
                                     mirror=True,
                                     linewidth=1,
                                     linecolor='black')
                    fig.update_yaxes(showline=True,
                                     mirror=True,
                                     linewidth=1,
                                     linecolor='black')
                    fig.update_traces(marker_size=int(pcoa_s), mode="markers")
                    # finish plot matrix
                    fig.update_layout(height=1000,
                                      width=1000,
                                      title_text=textbox)

                    ## define output files
                    output_pdf = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_matrix.pdf")
                    output_html = Path(
                        str(dirName) + "/" + meta_data_to_test + "_" +
                        taxon_title + "_matrix.html")

                    ## write output files
                    fig.write_image(str(output_pdf))
                    fig.write_html(str(output_html))

                    ## ask to show file
                    answer = sg.PopupYesNo('Show plot?', keep_on_top=True)
                    if answer == "Yes":
                        webbrowser.open('file://' + str(output_html))

                    ## print closing text
                    closing_text = "\n" + "PCoA plots are found in: " + str(
                        path_to_outdirs) + "/PCoA_plots/"
                    sg.Popup(closing_text, title="Finished", keep_on_top=True)

                    ## write to log file
                    from taxontabletools.create_log import ttt_log
                    ttt_log("pcoa analysis", "analysis", TaXon_table_xlsx.name,
                            output_pdf.name, meta_data_to_test,
                            path_to_outdirs)
                    break
                else:
                    sg.Popup(
                        "There must be at least 4 PCoA axis available to plot the matrix!"
                    )

        pcoa_window.close()

    else:
        sg.PopupError(
            "The sample of both the TaXon table and the metadata table have to match!"
        )
## --- Example #30 (scraped page separator "示例#30" and vote count; kept as a comment so the file stays valid Python) ---
def create_taxon_table_per_sample(TaXon_table_xlsx, path_to_outdirs):
    """Split a TaXon table into one single-sample Excel file per sample.

    For every read column (column 11 onwards) a new workbook is written to
    ``<path_to_outdirs>/TaXon_tables_per_sample/<sample>.xlsx`` containing the
    ten taxonomy columns plus that sample's reads, keeping only rows with at
    least one read. A PySimpleGUI progress bar tracks the loop; a log entry is
    appended at the end.

    Args:
        TaXon_table_xlsx: path to the TaXon table workbook (sheet
            'TaXon table').
        path_to_outdirs: Path to the TTT output directory tree.

    Raises:
        RuntimeError: if the user cancels or closes the progress-bar window.
    """

    import PySimpleGUI as sg
    import pandas as pd
    import numpy as np
    from pathlib import Path

    TaXon_table_file = Path(TaXon_table_xlsx)

    TaXon_table_xlsx = pd.ExcelFile(TaXon_table_file)
    TaXon_datasheet = pd.read_excel(TaXon_table_xlsx, 'TaXon table', header=0)
    TaXon_table = TaXon_datasheet.values.tolist()
    ## columns 0-9 hold taxonomy/metadata, samples start at column 10
    samples_to_process = TaXon_datasheet.columns[10:]
    first_ten_columns_header = TaXon_datasheet.columns[:10].values.tolist()
    first_ten_columns = TaXon_datasheet.iloc[:, :10].values.tolist()
    OTU_list = TaXon_datasheet['ID'].values.tolist()

    ############################################################################
    ## create the progress bar window
    layout = [[sg.Text('Progress bar')],
              [sg.ProgressBar(1000, orientation='h', size=(20, 20), key='progressbar')],
              [sg.Cancel()]]
    window_progress_bar = sg.Window('Progress bar', layout, keep_on_top=True)
    progress_bar = window_progress_bar['progressbar']
    progress_update = 0
    ## slight overshoot (+1) so the bar visibly reaches the maximum
    progress_increase = 1000 / len(samples_to_process) + 1
    ############################################################################

    for sample in samples_to_process:

        Output_name = Path(sample + ".xlsx")
        Output_file = path_to_outdirs / "TaXon_tables_per_sample" / Output_name

        read_numbers = TaXon_datasheet[sample].values.tolist()
        sample_rows_list = []

        ## keep only OTUs that actually occur in this sample
        for i, read_number in enumerate(read_numbers):
            if read_number > 0:
                sample_rows_list.append(first_ten_columns[i] + [read_number])

        headers_df = pd.DataFrame([first_ten_columns_header + [sample]])
        sample_df = pd.DataFrame(sample_rows_list)
        ## pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        sample_df = pd.concat([headers_df, sample_df], ignore_index=True)

        sample_df.to_excel(Output_file, engine='xlsxwriter', sheet_name='TaXon table', index=False, header=False)

        ############################################################################
        event, values = window_progress_bar.read(timeout=10)
        if event == 'Cancel'  or event is None:
            window_progress_bar.Close()
            raise RuntimeError
        # update bar with loop value +1 so that bar eventually reaches the maximum
        progress_update += progress_increase
        progress_bar.UpdateBar(progress_update)
        ############################################################################

    window_progress_bar.Close()

    closing_text = "\n" + "Taxon tables are found in: " + str(path_to_outdirs) + "/TaXon_tables_per_sample/"
    sg.Popup(closing_text, title="Finished", keep_on_top=True)

    from taxontabletools.create_log import ttt_log
    placeholder = TaXon_table_file.name + " (multiple files)"
    ttt_log("taXon table per sample", "analysis", TaXon_table_file.name, placeholder, "nan", path_to_outdirs)