import math
import seaborn as sns; sns.set(color_codes=True)
import operator
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold,LeaveOneOut, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# GVHD saliva cohort: load OTU + mapping files and keep, per patient, the
# latest sample taken after transplantation but before any GVHD onset.
# NOTE(review): OtuMfHandler, preprocess_data and pd are assumed to be
# imported elsewhere in the original file - confirm before running.
otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'

# Upper bound on PCA components considered downstream.
max_num_of_pcas = 20

OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
mapping_file = OtuMf.mapping_file
# Convert every date-like column to datetime so they can be compared.
mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
mapping_file['Date_Of_Transplantation'] = pd.to_datetime(OtuMf.mapping_file['Date_Of_Transplantation'])
mapping_file['Date_of_engraftmen'] = pd.to_datetime(OtuMf.mapping_file['Date_of_engraftmen'])
mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat'])
mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start'])
# Missing GVHD onset dates mean "never developed GVHD"; use a far-future
# sentinel so the date comparisons below keep those samples.
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
# Keep samples taken after transplantation and before any GVHD onset.
mapping_file = mapping_file[(mapping_file['DATE']>mapping_file['Date_Of_Transplantation']) & (mapping_file['DATE']<mapping_file['aGVHD1_Stat']) & (mapping_file['DATE']<mapping_file['cGVHD_start'])].sort_values(['Personal_ID', 'DATE'])

# Keep only the most recent qualifying sample of each patient.
mapping_file = mapping_file.reset_index()
mapping_file = mapping_file.sort_values("DATE").groupby("Personal_ID", as_index=False).last().set_index('#SampleID')
# NOTE: 'aGVHD1 ' and 'cGVHD ' intentionally contain trailing spaces - that
# is how the columns are named in the mapping file.
preproccessed_data = preproccessed_data.join(mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']], how ='inner')
preproccessed_data = preproccessed_data.fillna('No')
# --- Exemplo n.º 2 (snippet separator from the scraped source; score: 0) ---
import csv
from plot_clustergram import *
# Output CSV for the XGBoost grid-search results; one row per configuration.
csvfile = 'C:/Users/Anna/Documents/xgboost_gvhd_saliva.csv'
otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
# Column headers for the results file (hyper-parameters + train/test AUC).
headers = [
    'ms', 'ne', 'learning rate', 'regularization', 'auc test', 'auc train'
]
# Disabled: writes the header row once when (re)creating the results file.
# with open(csvfile, "w") as output:
#     writer = csv.writer(output, delimiter=',', lineterminator='\n')
#     writer.writerow(headers)

# Second GVHD loading fragment (taxonomy level 6 this time).
OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
print(OtuMf.otu_file.shape)
preproccessed_data = preprocess_data(OtuMf.otu_file,
                                     visualize_data=False,
                                     taxnomy_level=6)
print(preproccessed_data.shape)
mapping_file = OtuMf.mapping_file
# Parse date columns for the chronological filtering below.
mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
mapping_file['Date_Of_Transplantation'] = pd.to_datetime(
    OtuMf.mapping_file['Date_Of_Transplantation'])
mapping_file['Date_of_engraftmen'] = pd.to_datetime(
    OtuMf.mapping_file['Date_of_engraftmen'])
mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat'])
mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start'])
# Far-future sentinel for patients that never developed GVHD.
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
# NOTE(review): the statement below is TRUNCATED in this snippet - the
# bracketed boolean filter is cut off mid-expression, so this fragment is not
# runnable as-is.  The complete filter appears inside gvhd() further down.
mapping_file = mapping_file[
    (mapping_file['DATE'] > mapping_file['Date_Of_Transplantation'])
def ibd(perform_distance=True, level=3):
    """Load the IBD cohort (CD/UC patients only) and build feature/tag tables.

    Joins the OTU table with disease / pregnancy-trimester / patient-ID
    metadata, removes healthy controls, averages replicates per patient and
    trimester, and centers every sample around its trimester mean.  When
    ``perform_distance`` is True, OTU columns are grouped by their taxonomy
    name at ``level`` and each group is reduced to the smallest set of PCA
    components explaining more than 50% of its variance.

    Returns
    -------
    perform_distance=True : (new_df, mapping_file, new_dict, taxonomy values)
        new_df   - concatenated per-group PCA components.
        new_dict - maps each taxonomy group to its column indices in new_df.
    perform_distance=False: (preproccessed_data, mapping_file, {}, [])
    """
    otu = 'C:/Users/Anna/Documents/otu_IBD3.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['CD_or_UC', 'preg_trimester', 'P-ID']],
        how='inner')
    # Keep only diseased samples (CD or UC); drop healthy controls.
    preproccessed_data = preproccessed_data.loc[(preproccessed_data['CD_or_UC']
                                                 != 'control')]
    # Average replicates per (disease, trimester, patient).
    preproccessed_data = preproccessed_data.groupby(
        ['CD_or_UC', 'preg_trimester', 'P-ID'], as_index=False).mean()
    # Center every sample around its trimester mean (removes trimester effect).
    trimester_means = preproccessed_data.groupby(['preg_trimester']).mean()
    for i in range(0, len(preproccessed_data)):
        month = preproccessed_data['preg_trimester'][i]
        preproccessed_data.iloc[i:i + 1, 3:preproccessed_data.shape[1]] = (
            preproccessed_data.iloc[i:i + 1,
                                    3:preproccessed_data.shape[1]].values -
            trimester_means.loc[month:month, :].values)

    preproccessed_data = preproccessed_data.drop(
        ['preg_trimester', 'P-ID', 'CD_or_UC'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['CD_or_UC'] !=
                                           'control')]
    mapping_disease = {'CD': 1, 'UC': 0}
    mapping_file['CD_or_UC'] = mapping_file['CD_or_UC'].map(mapping_disease)
    mapping_file = mapping_file['CD_or_UC']
    mapping_file = mapping_file.reset_index()
    if perform_distance:
        # Constant columns carry no information and break PCA - drop them.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group OTU columns by their taxonomy name at the requested level;
        # columns whose taxonomy is shallower than `level` go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        new_dict = {}
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count leading components needed to exceed 50% explained
            # variance (at least one).  `explained` was previously named
            # `sum`, shadowing the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        # Return a 4-tuple here too so callers can always unpack four values
        # (mirrors gvhd(); previously this branch returned only three).
        return preproccessed_data, mapping_file, {}, []
# --- Exemplo n.º 4 (snippet separator from the scraped source; score: 0) ---
def allergies(perform_distance=False, level=3):
    """Load the milk/peanut allergy cohort and build feature/tag tables.

    Only 'Milk' and 'Peanut' subjects are kept; the tag is AllergyType
    recoded as Milk=1, Peanut=0.  When ``perform_distance`` is True, OTU
    columns are grouped by their taxonomy name at ``level`` and each group
    is reduced to the smallest set of PCA components explaining more than
    50% of its variance.

    Returns
    -------
    perform_distance=True : (new_df, mapping_file, new_dict, taxonomy values)
        new_df   - concatenated per-group PCA components.
        new_dict - maps each taxonomy group to its column indices in new_df.
    perform_distance=False: (preproccessed_data, mapping_file, {}, [])
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
    # Keep only milk- and peanut-allergic subjects.
    preproccessed_data = preproccessed_data.loc[
        (preproccessed_data['AllergyType'] == 'Milk') |
        ((preproccessed_data['AllergyType'] == 'Peanut'))]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[
        (OtuMf.mapping_file['AllergyType'] == 'Milk') |
        (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file = mapping_file['AllergyType']

    if perform_distance:
        # Constant columns carry no information and break PCA - drop them.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group OTU columns by their taxonomy name at the requested level;
        # columns whose taxonomy is shallower than `level` go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        new_dict = {}
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count leading components needed to exceed 50% explained
            # variance (at least one).  `explained` was previously named
            # `sum`, shadowing the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        # Return a 4-tuple here too so callers can always unpack four values
        # (mirrors gvhd(); previously this branch returned only three).
        return preproccessed_data, mapping_file, {}, []
def gvhd(perform_distance=True, level=3):
    """Load the saliva GVHD cohort and build feature/tag tables.

    Keeps, per patient, only the latest sample taken after transplantation
    and before any (acute or chronic) GVHD onset.  The 'disease' tag is 1
    for patients that never developed GVHD and 0 otherwise.  When
    ``perform_distance`` is True, OTU columns are grouped by their taxonomy
    name at ``level`` and each group is reduced to the smallest set of PCA
    components explaining more than 50% of its variance.

    Returns
    -------
    perform_distance=True : (new_df, mapping_file, new_dict, taxonomy values)
        new_df   - concatenated per-group PCA components.
        new_dict - maps each taxonomy group to its column indices in new_df.
    perform_distance=False: (preproccessed_data, mapping_file, {}, [])
    """
    otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    # Parse all date-like columns so they can be compared chronologically.
    mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
    mapping_file['Date_Of_Transplantation'] = pd.to_datetime(
        OtuMf.mapping_file['Date_Of_Transplantation'])
    mapping_file['Date_of_engraftmen'] = pd.to_datetime(
        OtuMf.mapping_file['Date_of_engraftmen'])
    mapping_file['aGVHD1_Stat'] = pd.to_datetime(
        OtuMf.mapping_file['aGVHD1_Stat'])
    mapping_file['cGVHD_start'] = pd.to_datetime(
        OtuMf.mapping_file['cGVHD_start'])
    # Missing onset dates mean "never developed GVHD"; use a far-future
    # sentinel so the comparisons below keep those samples.
    end = pd.to_datetime('2020-01-01')
    mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
    mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
    # Keep samples taken after transplantation and before any GVHD onset.
    mapping_file = mapping_file[
        (mapping_file['DATE'] > mapping_file['Date_Of_Transplantation'])
        & (mapping_file['DATE'] < mapping_file['aGVHD1_Stat']) &
        (mapping_file['DATE'] < mapping_file['cGVHD_start'])].sort_values(
            ['Personal_ID', 'DATE'])

    # Keep only the most recent qualifying sample per patient.
    mapping_file = mapping_file.reset_index()
    mapping_file = mapping_file.sort_values("DATE").groupby(
        "Personal_ID", as_index=False).last().set_index('#SampleID')

    # NOTE: 'aGVHD1 ' and 'cGVHD ' intentionally contain trailing spaces -
    # that is how the columns are named in the mapping file.
    mapping_file = mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']]
    mapping_file = mapping_file.fillna('No')

    mapping_yes_no = {'Yes': 1, 'No': 0}
    mapping_file['aGVHD1 '] = mapping_file['aGVHD1 '].map(mapping_yes_no)
    mapping_file['cGVHD '] = mapping_file['cGVHD '].map(mapping_yes_no)
    mapping_file['MTX'] = mapping_file['MTX'].map(mapping_yes_no)
    # disease == 1 only for patients with neither acute nor chronic GVHD.
    mapping_file["disease"] = mapping_file["aGVHD1 "].map(
        str) + '_' + mapping_file["cGVHD "].map(str)
    mapping_diseases = {'0_0': 1, '1_0': 0, '0_1': 0, '1_1': 0}
    mapping_file["disease"] = mapping_file["disease"].map(mapping_diseases)
    mapping_file = mapping_file.drop(['aGVHD1 ', 'cGVHD '], axis=1)
    # Align features with the selected samples, then drop the tag columns
    # from the feature matrix.
    preproccessed_data = preproccessed_data.join(mapping_file, how='inner')
    preproccessed_data = preproccessed_data.drop(['MTX', 'disease'], axis=1)
    if perform_distance:
        # Constant columns carry no information and break PCA - drop them.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group OTU columns by their taxonomy name at the requested level;
        # columns whose taxonomy is shallower than `level` go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        new_dict = {}
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count leading components needed to exceed 50% explained
            # variance (at least one).  `explained` was previously named
            # `sum`, shadowing the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}, []
# --- Exemplo n.º 6 (snippet separator from the scraped source; score: 0) ---
def microbiome_preprocess(max_pca,
                          tax_list,
                          tag_list,
                          old_preprocess=True,
                          rho_pca_plots=False,
                          evaluate=False,
                          algo="svm",
                          method="fold"):
    """Run the preprocessing pipeline for every (taxonomy level, tag) pair.

    With ``old_preprocess`` the legacy pipeline is used: z-scored, taxonomy
    grouped, PCA-reduced features written to a per-tag CSV folder.  Otherwise
    the newer CreateOtuAndMappingFiles pipeline produces the learning CSVs
    (optionally with rho/PCA plots).  When ``evaluate`` is set, the resulting
    configurations are compared with microbiome_preprocess_evaluation().
    """
    for tax in tax_list:
        for tag in tag_list:
            if old_preprocess:
                handler = OtuMfHandler("otu_id.csv",
                                       tag + "_tag.csv",
                                       from_QIIME=False,
                                       id_col='ID',
                                       taxonomy_col='taxonomy')
                processed = preprocess_data(handler.otu_file,
                                            preform_z_scoring=True,
                                            visualize_data=False,
                                            taxnomy_level=tax,
                                            preform_taxnomy_group=True)
                pca_df, pca_obj, _ = apply_pca(processed,
                                               n_components=max_pca,
                                               visualize=False)
                # Re-key the PCA table on the sample ID before saving.
                pca_df["ID"] = pca_df.index
                pca_df = pca_df.set_index("ID")
                folder = tag + "_tax_" + str(tax) + "_csv_files"
                otu_name = ("old_processed_otu_" + tag + "_tax_" + str(tax) +
                            ".csv")
                if not os.path.exists(folder):
                    os.mkdir(folder)
                pca_df.to_csv(os.path.join(folder, otu_name))

            else:  # yoel new Preprocess
                # Parameters for the newer preprocessing pipeline.
                preprocess_prms = {
                    'taxonomy_level': tax,
                    'taxnomy_group': 'mean',
                    'epsilon': 0.1,
                    'normalization': 'log',
                    'z_scoring': 'row',
                    'norm_after_rel': '',
                    'std_to_delete': 0,
                    'pca': max_pca
                }

                otu_and_mapping = CreateOtuAndMappingFiles(
                    "otu.csv", tag + "_tag.csv")
                otu_and_mapping.preprocess(preprocess_params=preprocess_prms,
                                           visualize=False)

                if rho_pca_plots:
                    plots_folder = ("preprocess_plots_" + tag + "_tag_tax_" +
                                    str(tax) + "_pca_" + str(max_pca))
                    otu_and_mapping.rhos_and_pca_calculation(
                        tag, preprocess_prms['taxonomy_level'],
                        preprocess_prms['pca'],
                        os.path.join(plots_folder, "rhos"),
                        os.path.join(plots_folder, "pca"))

                otu_path, tag_path, pca_path = otu_and_mapping.csv_to_learn(
                    tag + '_task', tag + "_tax_" + str(tax) + "_csv_files",
                    tax, max_pca)
                print(otu_path)

    # Compare taxonomy levels / PCA component counts with a fixed model.
    if evaluate:
        microbiome_preprocess_evaluation(pca_options=list(range(2, max_pca)),
                                         tax_options=tax_list,
                                         tag_options=tag_list,
                                         old_preprocess=old_preprocess,
                                         algo=algo,
                                         method=method)
def psc(perform_distance=True, level=3):
    """Load the PSC cohort and build feature/tag tables.

    DiagnosisGroup is recoded as Control=0, Cirrhosis/HCC=1, PSC(+IBD)=2;
    yes/no lifestyle columns are recoded to 1/0.  When ``perform_distance``
    is True, OTU columns are grouped by their taxonomy name at ``level`` and
    each group is reduced to the smallest set of PCA components explaining
    more than 50% of its variance.

    Returns
    -------
    perform_distance=True : (new_df, mapping_file)
    perform_distance=False: (preproccessed_data, mapping_file)
    """
    otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv'
    mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    print('using padp')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    mapping_file = OtuMf.mapping_file

    # NOTE: 'Cirrhosis ' intentionally keeps its trailing space - that is
    # how the value appears in the mapping file.
    mapping_disease = {
        'Control': 0,
        'Cirrhosis ': 1,
        'HCC': 1,
        'PSC+IBD': 2,
        'PSC': 2
    }
    mapping_file['DiagnosisGroup'] = mapping_file['DiagnosisGroup'].map(
        mapping_disease)
    mappin_boolean = {'yes': 1, 'no': 0, 'Control': 0, '0': 0, '1': 1}
    mapping_file['FattyLiver'] = mapping_file['FattyLiver'].map(mappin_boolean)
    mapping_file['RegularExercise'] = mapping_file['RegularExercise'].map(
        mappin_boolean)
    mapping_file['Smoking'] = mapping_file['Smoking'].map(mappin_boolean)
    mapping_file = mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]]

    if perform_distance:
        # Constant columns carry no information and break PCA - drop them.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group OTU columns by their taxonomy name at the requested level;
        # columns whose taxonomy is shallower than `level` go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count leading components needed to exceed 50% explained
            # variance (at least one).  `explained` was previously named
            # `sum`, shadowing the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
# --- Exemplo n.º 8 (snippet separator from the scraped source; score: 0) ---
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Alternative cohorts (disabled):
# otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv'
# mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv'

# otu = 'C:/Users/Anna/Documents/otu_IBD3.csv'
# mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv'

otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'

OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
# Log-normalize only (no z-scoring) at taxonomy level 7.
preproccessed_data = preprocess_data(OtuMf.otu_file,
                                     visualize_data=False,
                                     taxnomy_level=7,
                                     preform_z_scoring=False,
                                     preform_log=True)

mapping_file = OtuMf.mapping_file
# Parse date columns for the chronological filter below.
mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
mapping_file['Date_Of_Transplantation'] = pd.to_datetime(
    OtuMf.mapping_file['Date_Of_Transplantation'])
mapping_file['Date_of_engraftmen'] = pd.to_datetime(
    OtuMf.mapping_file['Date_of_engraftmen'])
mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat'])
mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start'])
# Far-future sentinel for patients that never developed GVHD.
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
# NOTE(review): the statement below is TRUNCATED in this snippet - the
# boolean filter inside the brackets is missing, so this fragment is not
# runnable as-is.  The complete filter appears inside gvhd() above.
mapping_file = mapping_file[
def allergies(perform_distance=False, level=3):
    """Load the allergy cohort (all subjects) and build feature/tag tables.

    Unlike the filtered variant, every subject is kept (the Milk/Peanut-only
    filter is disabled).  Tags: 'AllergyType' recoded Milk=1 / Peanut=0
    (others NaN) and 'SuccessDescription' recoded A1=1 / otherwise 0.

    Returns (feature DataFrame, tag DataFrame).  ``perform_distance`` and
    ``level`` are currently unused - the per-group PCA path is disabled in
    this variant.
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    handler = OtuMfHandler(otu, mapping, from_QIIME=False)

    features = preprocess_data(handler.otu_file,
                               visualize_data=False,
                               taxnomy_level=7)
    features = features.join(
        handler.mapping_file[['AllergyType', 'SuccessDescription']],
        how='inner')
    # The Milk/Peanut-only subject filter is intentionally disabled here;
    # the join above only aligns samples, the tag columns are dropped next.
    features = features.drop(['AllergyType', 'SuccessDescription'], axis=1)

    mapping_file = handler.mapping_file
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_health = {'Con': 1}
    mapping_success = {'A1': 1}
    # 'Health' must be derived from the raw AllergyType values *before*
    # AllergyType itself is recoded to numbers.
    mapping_file['Health'] = mapping_file['AllergyType'].map(mapping_health)
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file['SuccessDescription'] = mapping_file[
        'SuccessDescription'].map(mapping_success)
    # Unmapped entries become NaN; treat them as 0 for both derived columns.
    mapping_file[['Health', 'SuccessDescription'
                  ]] = mapping_file[['Health',
                                     'SuccessDescription']].fillna(value=0)

    mapping_file = mapping_file[['AllergyType', 'SuccessDescription']]

    return features, mapping_file