# Script setup: load the saliva OTU table + mapping file, preprocess it, and
# reduce it to `n_components` PCA components (unless a recorded run is reused).
def get_days(days_datetime):
    """Return the whole-day count of a timedelta-like object."""
    return days_datetime.days


n_components = 10
use_recorded = False
script_dir = sys.path[0]
if not use_recorded:
    # BUG FIX: the original referenced SCRIPT_DIR, which is never defined in
    # this script (only lowercase script_dir is assigned above) — use the
    # variable that is actually bound.
    OtuMf = OtuMfHandler(
        os.path.join(script_dir, 'ronies_Data', 'saliva_samples_231018.csv'),
        os.path.join(script_dir, 'ronies_Data',
                     'saliva_samples_mapping_file_231018.csv'),
        from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                         taxnomy_level=6)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                             n_components=n_components,
                                             visualize=False)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()]  # remove NaN days
    # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']])
    # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()]  # remove NaN days
# GVHD saliva script setup: load the OTU table + mapping file, preprocess,
# and normalize every date column; open-ended GVHD onset dates are filled
# with a far-future sentinel so later date comparisons stay well-defined.
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import csv
from plot_clustergram import *

csvfile = 'C:/Users/Anna/Documents/xgboost_gvhd_saliva.csv'
otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
headers = [
    'ms', 'ne', 'learning rate', 'regularization', 'auc test', 'auc train'
]
# with open(csvfile, "w") as output:
#     writer = csv.writer(output, delimiter=',', lineterminator='\n')
#     writer.writerow(headers)
OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
print(OtuMf.otu_file.shape)
preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                     taxnomy_level=6)
print(preproccessed_data.shape)
mapping_file = OtuMf.mapping_file
# Convert every date-bearing column in one pass instead of five copies of
# the same statement.  NOTE(review): 'Date_of_engraftmen' looks truncated —
# presumably it matches the CSV header; confirm before renaming.
for _date_col in ('DATE', 'Date_Of_Transplantation', 'Date_of_engraftmen',
                  'aGVHD1_Stat', 'cGVHD_start'):
    mapping_file[_date_col] = pd.to_datetime(OtuMf.mapping_file[_date_col])
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
def allergies(perform_distance=False, level=3):
    """Load the milk/peanut allergy cohort and return (features, labels).

    Args:
        perform_distance: when True, group OTU columns by the taxonomy rank
            `level` and PCA-compress each group to the components explaining
            ~50% of its variance; when False, return the raw preprocessed data.
        level: 1-based taxonomy rank used as the grouping key.

    Returns:
        (features_dataframe, label_series) where the label is
        AllergyType mapped Milk -> 1, Peanut -> 0.
    """
    otu = 'C:/Users/Anna/Documents/allergy_otu.csv'
    mapping = 'C:/Users/Anna/Documents/allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                         taxnomy_level=7)
    # Keep only samples that have mapping metadata and are milk/peanut allergic.
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
    preproccessed_data = preproccessed_data.loc[
        (preproccessed_data['AllergyType'] == 'Milk')
        | (preproccessed_data['AllergyType'] == 'Peanut')]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[
        (OtuMf.mapping_file['AllergyType'] == 'Milk')
        | (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    # BUG FIX: the original assigned this dict twice; the duplicate dead
    # assignment is removed.
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file = mapping_file['AllergyType']
    if perform_distance:
        # Drop constant columns — they carry no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Bucket column names by their taxon at the requested rank; columns
        # whose taxonomy string is too short fall into 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                dict_bact.setdefault(col_name[bact_level], []).append(
                    preproccessed_data[col].name)
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Pick the smallest component count whose cumulative explained
            # variance exceeds 0.5.  IDIOM FIX: the accumulator no longer
            # shadows the builtin `sum`.
            explained = 0
            num_comp = 0
            for i, component in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                # NOTE(review): also hit when the 0.5 threshold is never
                # crossed — the group is then collapsed to one component;
                # confirm this is intended.
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def gvhd(perform_distance=True, level=3):
    """Load the GVHD saliva cohort and return (features, labels).

    Keeps, per patient, the latest sample taken after transplantation but
    before any GVHD onset, and derives a binary 'disease' label
    (1 = no aGVHD and no cGVHD, 0 = any GVHD).

    Args:
        perform_distance: when True, group OTU columns by taxonomy rank
            `level` and PCA-compress each group; otherwise return the raw
            preprocessed features.
        level: 1-based taxonomy rank used as the grouping key.

    Returns:
        (features_dataframe, mapping_dataframe) where mapping_dataframe
        carries the 'MTX' and 'disease' columns.
    """
    otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
    mapping_file['Date_Of_Transplantation'] = pd.to_datetime(
        OtuMf.mapping_file['Date_Of_Transplantation'])
    mapping_file['Date_of_engraftmen'] = pd.to_datetime(
        OtuMf.mapping_file['Date_of_engraftmen'])
    mapping_file['aGVHD1_Stat'] = pd.to_datetime(
        OtuMf.mapping_file['aGVHD1_Stat'])
    mapping_file['cGVHD_start'] = pd.to_datetime(
        OtuMf.mapping_file['cGVHD_start'])
    # Patients who never developed GVHD get a far-future sentinel so the
    # "before onset" comparisons below keep them.
    end = pd.to_datetime('2020-01-01')
    mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
    mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
    mapping_file = mapping_file[
        (mapping_file['DATE'] > mapping_file['Date_Of_Transplantation'])
        & (mapping_file['DATE'] < mapping_file['aGVHD1_Stat'])
        & (mapping_file['DATE'] < mapping_file['cGVHD_start'])].sort_values(
            ['Personal_ID', 'DATE'])
    mapping_file = mapping_file.reset_index()
    # Latest eligible sample per patient, re-keyed by its sample ID.
    mapping_file = mapping_file.sort_values("DATE").groupby(
        "Personal_ID", as_index=False).last().set_index('#SampleID')
    # NOTE(review): the trailing spaces in 'aGVHD1 ' / 'cGVHD ' match the CSV
    # headers — do not "fix" them without changing the source file.
    mapping_file = mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']]
    mapping_file = mapping_file.fillna('No')
    # preproccessed_data = preproccessed_data.join(mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']], how='inner')
    # preproccessed_data = preproccessed_data.fillna('No')
    mapping_yes_no = {'Yes': 1, 'No': 0}
    mapping_file['aGVHD1 '] = mapping_file['aGVHD1 '].map(mapping_yes_no)
    mapping_file['cGVHD '] = mapping_file['cGVHD '].map(mapping_yes_no)
    mapping_file['MTX'] = mapping_file['MTX'].map(mapping_yes_no)
    # Combine the two GVHD flags into one label: healthy (0_0) -> 1, else 0.
    mapping_file["disease"] = mapping_file["aGVHD1 "].map(
        str) + '_' + mapping_file["cGVHD "].map(str)
    mapping_diseases = {'0_0': 1, '1_0': 0, '0_1': 0, '1_1': 0}
    mapping_file["disease"] = mapping_file["disease"].map(mapping_diseases)
    mapping_file = mapping_file.drop(['aGVHD1 ', 'cGVHD '], axis=1)
    # Inner-join restricts features to labelled samples; the label columns
    # themselves are then removed from the feature matrix.
    preproccessed_data = preproccessed_data.join(mapping_file, how='inner')
    preproccessed_data = preproccessed_data.drop(['MTX', 'disease'], axis=1)
    if perform_distance:
        # Drop constant columns — no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                dict_bact.setdefault(col_name[bact_level], []).append(
                    preproccessed_data[col].name)
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # IDIOM FIX: the accumulator no longer shadows builtin `sum`.
            explained = 0
            num_comp = 0
            for i, component in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def ibd(perform_distance=True, level=3):
    """Load the IBD pregnancy cohort and return (features, labels[, extras]).

    Features are averaged per (CD_or_UC, trimester, patient) and centered by
    subtracting the per-trimester mean. Labels map CD -> 1, UC -> 0; controls
    are excluded.

    Args:
        perform_distance: when True, group OTU columns by taxonomy rank
            `level` and PCA-compress each group.
        level: 1-based taxonomy rank used as the grouping key.

    Returns:
        When perform_distance: (new_df, mapping_file, new_dict, taxonomy
        values), where new_dict maps taxon -> output column indices.
        Otherwise: (preproccessed_data, mapping_file, {}).
        NOTE(review): the two branches return different arities (4 vs 3) —
        confirm callers expect this before unifying.
    """
    otu = 'C:/Users/Anna/Documents/otu_IBD3.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['CD_or_UC', 'preg_trimester', 'P-ID']],
        how='inner')
    preproccessed_data = preproccessed_data.loc[(
        preproccessed_data['CD_or_UC'] != 'control')]
    # One averaged row per (disease, trimester, patient).
    preproccessed_data = preproccessed_data.groupby(
        ['CD_or_UC', 'preg_trimester', 'P-ID'], as_index=False).mean()
    # Per-trimester means, subtracted row-by-row below to remove the
    # trimester effect (columns 0-2 are the grouping keys, hence the 3:).
    new_set2 = preproccessed_data.groupby(['preg_trimester']).mean()
    for i in range(len(preproccessed_data)):
        month = preproccessed_data['preg_trimester'][i]
        preproccessed_data.iloc[i:i + 1, 3:preproccessed_data.shape[1]] = (
            preproccessed_data.iloc[i:i + 1,
                                    3:preproccessed_data.shape[1]].values -
            new_set2.loc[month:month, :].values)
    preproccessed_data = preproccessed_data.drop(
        ['preg_trimester', 'P-ID', 'CD_or_UC'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[(
        OtuMf.mapping_file['CD_or_UC'] != 'control')]
    mapping_disease = {'CD': 1, 'UC': 0}
    mapping_file['CD_or_UC'] = mapping_file['CD_or_UC'].map(mapping_disease)
    mapping_file = mapping_file['CD_or_UC']
    mapping_file = mapping_file.reset_index()
    if perform_distance:
        # Drop constant columns — no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                dict_bact.setdefault(col_name[bact_level], []).append(
                    preproccessed_data[col].name)
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        # Track which output columns came from which taxon bucket.
        new_dict = {}
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # IDIOM FIX: the accumulator no longer shadows builtin `sum`.
            explained = 0
            num_comp = 0
            for i, component in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}
def psc(perform_distance=True, level=3):
    """Load the PSC cohort and return (features, labels + covariates).

    DiagnosisGroup is mapped Control -> 0, Cirrhosis/HCC -> 1, PSC(+IBD) -> 2;
    the yes/no lifestyle covariates are binarized.

    Args:
        perform_distance: when True, group OTU columns by taxonomy rank
            `level` and PCA-compress each group.
        level: 1-based taxonomy rank used as the grouping key.

    Returns:
        (features_dataframe, mapping_dataframe) with mapping columns
        Age, BMI, FattyLiver, RegularExercise, Smoking, DiagnosisGroup.
    """
    otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv'
    mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    print('using padp')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                         taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    # NOTE(review): 'Cirrhosis ' carries a trailing space to match the CSV
    # value — do not strip it here.
    mapping_disease = {
        'Control': 0,
        'Cirrhosis ': 1,
        'HCC': 1,
        'PSC+IBD': 2,
        'PSC': 2
    }
    mapping_file['DiagnosisGroup'] = mapping_file['DiagnosisGroup'].map(
        mapping_disease)
    # IDIOM FIX: renamed misspelled local `mappin_boolean`.
    mapping_boolean = {'yes': 1, 'no': 0, 'Control': 0, '0': 0, '1': 1}
    mapping_file['FattyLiver'] = mapping_file['FattyLiver'].map(
        mapping_boolean)
    mapping_file['RegularExercise'] = mapping_file['RegularExercise'].map(
        mapping_boolean)
    mapping_file['Smoking'] = mapping_file['Smoking'].map(mapping_boolean)
    mapping_file = mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]]
    if perform_distance:
        # Drop constant columns — no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                dict_bact.setdefault(col_name[bact_level], []).append(
                    preproccessed_data[col].name)
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # IDIOM FIX: the accumulator no longer shadows builtin `sum`.
            explained = 0
            num_comp = 0
            for i, component in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def allergies(perform_distance=False, level=3):
    """Load the full allergy cohort and return (features, labels).

    NOTE(review): this redefines an earlier `allergies` in the same module
    (the earlier one filters to milk/peanut and uses absolute paths) — only
    this definition is visible to later code; confirm the shadowing is
    intentional.

    Args:
        perform_distance: currently ignored — the distance/PCA branch was
            disabled (dead commented-out code removed); kept for interface
            compatibility with the other loaders.
        level: currently ignored for the same reason.

    Returns:
        (features_dataframe, mapping_dataframe) with mapping columns
        AllergyType (Milk -> 1, Peanut -> 0, others NaN) and
        SuccessDescription (A1 -> 1, else 0).
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False,
                                         taxnomy_level=7)
    # Inner-join restricts features to samples that have metadata; the
    # metadata columns are then dropped from the feature matrix.
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
    #preproccessed_data = preproccessed_data.loc[(preproccessed_data['AllergyType'] == 'Milk') | ((preproccessed_data['AllergyType'] == 'Peanut'))]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    #mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['AllergyType'] == 'Milk') | (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    mapping_file = OtuMf.mapping_file
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_health = {'Con': 1}
    mapping_success = {'A1': 1}
    # NOTE(review): these column assignments mutate OtuMf.mapping_file in
    # place (mapping_file is a reference, not a copy) — pandas may warn about
    # chained assignment; confirm the side effect is intended.
    mapping_file['Health'] = mapping_file['AllergyType'].map(mapping_health)
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file['SuccessDescription'] = mapping_file[
        'SuccessDescription'].map(mapping_success)
    mapping_file[['Health', 'SuccessDescription'
                  ]] = mapping_file[['Health',
                                     'SuccessDescription']].fillna(value=0)
    mapping_file = mapping_file[['AllergyType', 'SuccessDescription']]
    # Dead commented-out perform_distance/PCA branch removed (it duplicated
    # the grouping logic in gvhd()/ibd()/psc()); restore from history if the
    # distance path is ever re-enabled.
    return preproccessed_data, mapping_file