def remove_batch_effect(files_list, cell_type, address):
    """Combine one cell type's expression across files and remove batch effects.

    Every file is read as a tab-separated table whose first column becomes the
    row index. The columns whose name matches ``cell_type`` are kept, each file
    is treated as one batch (numbered by its position in ``files_list``), and
    the combined matrix is handed to pyComBat.

    Args:
        files_list: List of strings: names of files that include cell_type.
        cell_type: string: name of the cell type we are looking for. It is
            matched against column names with ``str.contains``, so it is
            interpreted as a regular expression.
        address: string: prefix prepended verbatim to every file name (no path
            separator is inserted).

    Returns:
        batch_corrected_data: (n_genes, n_samples) pd Dataframe: combined gene
        expression of the cell type with the batch effect removed.

    Raises:
        AssertionError: if a file has no column matching ``cell_type``.
    """
    import pandas as pd
    from combat.pycombat import pycombat

    batch_labels = []
    per_file_data = []
    for i, file_name in enumerate(files_list):
        data = pd.read_csv(address + file_name, sep="\t", header=0)
        data.set_index(list(data)[0], inplace=True)
        cell_type_data = data.loc[:, data.columns.str.contains(cell_type)]
        # Raised explicitly (instead of `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        if cell_type_data.shape[1] == 0:
            raise AssertionError(
                cell_type + " is not present in " + file_name)
        # One batch label per selected sample column of this file.
        batch_labels.extend([i] * cell_type_data.shape[1])
        # Keep only the first occurrence of each repeated row label.
        cell_type_data = cell_type_data.loc[
            ~cell_type_data.index.duplicated(keep='first')]
        per_file_data.append(cell_type_data)

    data_combined = pd.concat(per_file_data, axis=1, sort=True)
    # Rows missing from some files are filled with that row's mean; rows
    # that are still NaN afterwards (no value in any file) are dropped.
    data_combined = data_combined.apply(
        lambda row: row.fillna(row.mean()), axis=1)
    data_combined.dropna(inplace=True)
    # Discard rows with zero variance across samples before correction.
    var = data_combined.var(axis=1)
    data_combined = data_combined[var > 0]
    data_combined = differentiate_same_col_names(data_combined)
    # Labels were collected in column order, so they line up positionally
    # with the (possibly renamed) columns of the combined frame.
    batch = pd.Series(batch_labels, index=list(data_combined))
    batch_corrected_data = pycombat(data_combined, batch)
    return batch_corrected_data
number_top_genes).index top_target_variable_genes = pd.DataFrame(np.var(data_df[target_data_key]), columns=['variance']) top_target_variable_genes = top_target_variable_genes.sort_values( 'variance', ascending=False) top_target_variable_genes = top_target_variable_genes.head( number_top_genes).index top_variable_genes = np.intersect1d(top_source_variable_genes, top_target_variable_genes) for d in data_df: data_df[d] = data_df[d][top_variable_genes] ## Correct with ComBat data_corrected = pycombat( pd.concat(list(data_df.values())).T, [1] * data_df[source_data_key].shape[0] + [2] * data_df[target_data_key].shape[0]) normalized_data_df = {k: data_corrected[data_df[k].index].T for k in data_df} normalized_data_df[source_data_key].index = pd.MultiIndex.from_tuples( normalized_data_df[source_data_key].index) # Read response if GDSC_drug_name in [ 'Cetuximab', 'Doxorubicin', 'Etoposide', 'Bleomycin', 'Bicalutamide', 'Bleomycin (50 uM)', 'Pemetrexed', 'AICA Ribonucleotide' ]: GDSC_drug_response_file = '../data/GDSC/response/GDSC1_drug_response.xlsx' else: GDSC_drug_response_file = '../data/GDSC/response/GDSC2_drug_response.xlsx' GDSC_drug_response_df = pd.read_excel(GDSC_drug_response_file)
# ---------------------------------------------------------------------------
# Aitchison, adjusted: scale by df_signal and turn exact zeros into NaN.
# ---------------------------------------------------------------------------
data_aitchison_adj = (data_aitchison * df_signal).replace(0, np.nan)
#data_corrected_adj_aitchison = aitchison_transform(data_aitchison_adj[data_aitchison_adj > 0])  # Aitchison > 0, true signal adjusted
kde_matrix_plot_all_channels(
    data_aitchison_adj,
    suptitle="Aitchison, adjusted",
    log=False,
)
kde_matrix_plot_batch(
    data_aitchison_adj,
    suptitle="Aitchison, adjusted (Channel intensities together)",
    legend_size=6,
    log=False,
)

# ---------------------------------------------------------------------------
# ComBat: batch labels are read from level 4 of the column MultiIndex.
# ---------------------------------------------------------------------------
# `batch` is a (1, n_samples) array; its single row holds the labels.
batch = pd.DataFrame(data_aitchison_adj.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 before the data is handed to pycombat.
data_corrected = pycombat(data_aitchison_adj.fillna(0), batch[0])
kde_matrix_plot_all_channels(
    data_corrected,
    suptitle="log2(ComBat)",
    log=True,
)
kde_matrix_plot_batch(
    data_corrected,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6,
    log=True,
)
kde_matrix_plot_batch( df_norm, log=True, suptitle="log2(Norm intensity) (Channel intensities together)", legend_size=6) kde_matrix_plot_batch(df_norm_ait, log=False, suptitle="Aitchison norm (Channel intensities together)", legend_size=6) ############ # COMBAT ### ############ batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values data_corrected = pycombat(df_norm.fillna(0), batch[0]) data_corrected_ait = pycombat(df_norm_ait.fillna(0), batch[0]) kde_matrix_plot_all_channels(data_corrected, log=True, suptitle="log2(ComBat)") kde_matrix_plot_batch( data_corrected, log=True, suptitle="log2(Combta intensity) (Channel intensities together)", legend_size=6) kde_matrix_plot_all_channels(data_corrected_ait, log=False, suptitle="Aitchison ComBat") kde_matrix_plot_batch( data_corrected_ait, log=False,
# Scale every row of df_norm by its factor in irs_fac.
df_irs = (df_norm.T * irs_fac).T
#
irs_tmm = calcNormFactors(df_irs)
df_irs_tmm = df_irs / irs_tmm

###############
# sva comBat ##
###############
from combat.pycombat import pycombat

# Batch labels are read from level 4 of the column MultiIndex. The
# DataFrame -> transpose -> values round trip yields a 2-D array of shape
# (1, n_samples).
batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values
# pycombat expects one label per sample column, so pass the single row of
# `batch` rather than the 2-D array itself.
data_corrected = pycombat(df_norm, batch[0])

# Scratch notes kept from the original (presumably checking row-wise
# scaling as used for df_irs above -- TODO confirm or delete):
#a = pd.DataFrame([[2,3,4]]).T
#b = pd.DataFrame([[1,2,3]]).T
#
#a = pd.DataFrame([[1,2,3],[2,3,4],[5,6,7]])
#b = pd.DataFrame([[1,2,3]]).values
#b = pd.DataFrame([[1,2,3]]).T
#
#a = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9],[10,11,12]])
#b = pd.Series([1,2,3,4])
#(a.T*b).T
#a.multiply(b, axis = 0)
########
# Normalise the intensities with norm_SL and plot the result.
df_norm = norm_SL(df_int)
kde_matrix_plot_all_channels(
    df_norm,
    suptitle="log2(norm)",
    log=True,
)
kde_matrix_plot_batch(
    df_norm,
    suptitle="log2(Norm intensity) (Channel intensities together)",
    legend_size=6,
    log=True,
)

# ---------------------------------------------------------------------------
# ComBat: batch labels are read from level 4 of the column MultiIndex.
# ---------------------------------------------------------------------------
# `batch` is a (1, n_samples) array; its single row holds the labels.
batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 before the data is handed to pycombat.
data_corrected = pycombat(df_norm.fillna(0), batch[0])
kde_matrix_plot_all_channels(
    data_corrected,
    suptitle="log2(ComBat)",
    log=True,
)
kde_matrix_plot_batch(
    data_corrected,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6,
    log=True,
)

# ---------------------------------------------------------------------------
# Aitchison transform of the strictly positive corrected values.
# ---------------------------------------------------------------------------
# Entries that are not > 0 become NaN before the transform.
data_corrected_filtered = data_corrected.where(data_corrected > 0)
data_corrected_aitchison = aitchison_transform(data_corrected_filtered)
# RAW #
#######
# Plot the raw intensities.
kde_matrix_plot_all_channels(
    df_int,
    suptitle="log2(Raw)",
    log=True,
)
kde_matrix_plot_batch(
    df_int,
    suptitle="log2(Raw intensity) (Channel intensities together)",
    legend_size=6,
    log=True,
)

# ---------------------------------------------------------------------------
# ComBat: batch labels are read from level 4 of the column MultiIndex.
# ---------------------------------------------------------------------------
# `batch` is a (1, n_samples) array; its single row holds the labels.
batch = pd.DataFrame(df_int.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 before the data is handed to pycombat.
df_combat = pycombat(df_int.fillna(0), batch[0])
kde_matrix_plot_all_channels(
    df_combat,
    suptitle="log2(ComBat)",
    log=True,
)
# The per-batch plot uses the corrected values multiplied by df_signal.
kde_matrix_plot_batch(
    df_combat * df_signal,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6,
    log=True,
)

# ---------------------------------------------------------------------------
# Aitchison transform of the strictly positive corrected values.
# ---------------------------------------------------------------------------
# Entries that are not > 0 become NaN before the transform.
data_aitchison = aitchison_transform(df_combat.where(df_combat > 0))
kde_matrix_plot_all_channels(
    data_aitchison,
    suptitle="Aitchison",
    log=False,
)