示例#1
0
def remove_batch_effect(files_list, cell_type, address):
    """Combine one cell type's expression across files and remove batch effects.

    Every file is read as a tab-separated table whose first column becomes the
    row index. The columns whose name matches ``cell_type`` are kept, rows with
    a duplicated index label are dropped (first occurrence wins), and the
    per-file tables are concatenated column-wise. Missing values are filled
    with the row mean, rows that are still incomplete or have zero variance
    are discarded, and the result is passed to pycombat with one batch label
    per file.

    Args:
        files_list: List of strings: List of names of files that include cell_type
        cell_type: string:  Name of cell type we are looking for. It is matched
            against the column names with ``str.contains``, so regular
            expression characters in it are interpreted as a pattern.
        address: string: Prefix prepended verbatim to every file name (no path
            separator is inserted, so a directory must end with one).
    Returns:
        batch_corrected_data: (n_genes, n_samples) pd Dataframe:  Combined gene expression of cell types with removed
        batch effect
    Raises:
        ValueError: if files_list is empty.
        AssertionError: if a file has no column matching cell_type.
    """
    import pandas as pd

    per_file_data = []
    # One integer per kept column: the position of its file in files_list.
    # Collected in a plain list because Series.append no longer exists in
    # pandas >= 2.0.
    batch_ids = []
    for i, file_name in enumerate(files_list):
        data = pd.read_csv(address + file_name, sep="\t", header=0)
        data = data.set_index(data.columns[0])
        cell_type_data = data.loc[:, data.columns.str.contains(cell_type)]
        if cell_type_data.shape[1] == 0:
            # Raised explicitly (instead of `assert`) so the check survives
            # `python -O`; the exception type and message are unchanged.
            raise AssertionError(cell_type + " is not present in " + file_name)

        batch_ids.extend([i] * cell_type_data.shape[1])
        is_repeated_row = cell_type_data.index.duplicated(keep='first')
        per_file_data.append(cell_type_data.loc[~is_repeated_row])

    if not per_file_data:
        raise ValueError("files_list is empty: there is no data to combine")

    data_combined = pd.concat(per_file_data, axis=1, sort=True)
    # Rows missing from some files are NaN after the concat: fill them with the
    # mean of the values that are present in the same row.
    data_combined = data_combined.apply(lambda row: row.fillna(row.mean()),
                                        axis=1)
    # Rows that were entirely NaN have no mean and are dropped here.
    data_combined.dropna(inplace=True)
    var = data_combined.var(axis=1)
    data_combined = data_combined[var > 0]
    data_combined = differentiate_same_col_names(data_combined)

    # Index the batch labels by the final column names.
    batch = pd.Series(batch_ids, index=list(data_combined))

    # Imported at the point of use: only the final correction step needs it.
    from combat.pycombat import pycombat

    batch_corrected_data = pycombat(data_combined, batch)
    return batch_corrected_data
    number_top_genes).index
# Rank the columns of the target dataset by variance and keep the labels of
# the `number_top_genes` most variable ones.
top_target_variable_genes = (
    pd.DataFrame(np.var(data_df[target_data_key]), columns=['variance'])
    .sort_values('variance', ascending=False)
    .head(number_top_genes)
    .index
)
# Only the labels selected for both the source and the target are retained.
top_variable_genes = np.intersect1d(top_source_variable_genes,
                                    top_target_variable_genes)

# Restrict every dataset to the shared selection.
for d in data_df:
    data_df[d] = data_df[d][top_variable_genes]

## Correct with ComBat
# The stacked datasets are transposed so that samples become columns; the
# batch vector holds one label per sample (1 = source, 2 = target).
# NOTE(review): this assumes data_df iterates source first, then target, and
# contains no other dataset - confirm.
data_corrected = pycombat(
    pd.concat(list(data_df.values())).T,
    [1] * data_df[source_data_key].shape[0]
    + [2] * data_df[target_data_key].shape[0],
)

# Split the corrected matrix back into one samples-by-features frame per
# dataset, using each dataset's original row labels.
normalized_data_df = {
    key: data_corrected[data_df[key].index].T
    for key in data_df
}
# Rebuild a MultiIndex from the source row labels (presumably tuples - confirm).
normalized_data_df[source_data_key].index = pd.MultiIndex.from_tuples(
    normalized_data_df[source_data_key].index)

# Read response
# The drugs listed here are read from the GDSC1 workbook; every other drug is
# read from the GDSC2 workbook.
GDSC_drug_response_file = (
    '../data/GDSC/response/GDSC1_drug_response.xlsx'
    if GDSC_drug_name in [
        'Cetuximab', 'Doxorubicin', 'Etoposide', 'Bleomycin', 'Bicalutamide',
        'Bleomycin (50 uM)', 'Pemetrexed', 'AICA Ribonucleotide'
    ]
    else '../data/GDSC/response/GDSC2_drug_response.xlsx'
)
GDSC_drug_response_df = pd.read_excel(GDSC_drug_response_file)
示例#3
0
#######################
# AITCHISON ADJUSTED  #
#######################

# Multiply the Aitchison data element-wise by df_signal and turn exact zeros
# into NaN.
# NOTE(review): df_signal is presumably a 0/1 signal mask - confirm.
data_aitchison_adj = (data_aitchison * df_signal).replace(0, np.nan)
#data_corrected_adj_aitchison = aitchison_transform(data_aitchison_adj[data_aitchison_adj > 0]) # Aitchison > 0, true signal adjusted

# Plot the adjusted data with both KDE helpers, without log scaling.
kde_matrix_plot_all_channels(data_aitchison_adj,
                             log=False,
                             suptitle="Aitchison, adjusted")
kde_matrix_plot_batch(
    data_aitchison_adj,
    log=False,
    suptitle="Aitchison, adjusted (Channel intensities together)",
    legend_size=6)

##########
# COMBAT #
##########

# The fifth level (index 4) of the column MultiIndex supplies one batch label
# per column. Wrapping it in a DataFrame and transposing gives a (1, n_columns)
# array, so batch[0] is the flat label vector handed to pycombat.
batch = pd.DataFrame(data_aitchison_adj.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 for the ComBat input only; this undoes the
# 0 -> NaN replacement above, and data_aitchison_adj itself is left unchanged.
data_corrected = pycombat(data_aitchison_adj.fillna(0), batch[0])  # ComBat

# Plot the ComBat output with both KDE helpers, this time with log=True.
kde_matrix_plot_all_channels(data_corrected, log=True, suptitle="log2(ComBat)")
kde_matrix_plot_batch(
    data_corrected,
    log=True,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6)
示例#4
0
# Per-batch KDE plots of the normalized data: log-scaled raw normalization
# first, then its Aitchison counterpart without log scaling.
kde_matrix_plot_batch(
    df_norm,
    log=True,
    suptitle="log2(Norm intensity) (Channel intensities together)",
    legend_size=6)
kde_matrix_plot_batch(df_norm_ait,
                      log=False,
                      suptitle="Aitchison norm (Channel intensities together)",
                      legend_size=6)

############
# COMBAT ###
############

# The fifth level (index 4) of the column MultiIndex supplies one batch label
# per column. Wrapping it in a DataFrame and transposing gives a (1, n_columns)
# array, so batch[0] is the flat label vector handed to pycombat.
batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values
# The same labels are used for both frames; NaNs are replaced by 0 for the
# ComBat input only.
# NOTE(review): this assumes df_norm_ait has the same column order as
# df_norm - confirm.
data_corrected = pycombat(df_norm.fillna(0), batch[0])
data_corrected_ait = pycombat(df_norm_ait.fillna(0), batch[0])

kde_matrix_plot_all_channels(data_corrected, log=True, suptitle="log2(ComBat)")
# Title typo fixed ("Combta" -> "ComBat") so the figure is labelled correctly.
kde_matrix_plot_batch(
    data_corrected,
    log=True,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6)

kde_matrix_plot_all_channels(data_corrected_ait,
                             log=False,
                             suptitle="Aitchison ComBat")
kde_matrix_plot_batch(
    data_corrected_ait,
    log=False,
# Scale every row of df_norm by its entry in irs_fac (transpose, multiply
# column-wise, transpose back).
df_irs = (df_norm.T * irs_fac).T
# Divide the IRS-scaled data by the factors returned by calcNormFactors.
irs_tmm = calcNormFactors(df_irs)
df_irs_tmm = df_irs / irs_tmm


###############
# sva ComBat ##
###############

from combat.pycombat import pycombat

# The fifth level (index 4) of the column MultiIndex supplies one batch label
# per column. The DataFrame/transpose round trip yields a (1, n_columns)
# array, so the flat label vector pycombat needs is batch[0], not the 2-D
# array itself.
batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values

# NaNs are replaced by 0 for the ComBat input only, as in the other ComBat
# calls of this analysis.
data_corrected = pycombat(df_norm.fillna(0), batch[0])



#a = pd.DataFrame([[2,3,4]]).T
#b = pd.DataFrame([[1,2,3]]).T
#
#a = pd.DataFrame([[1,2,3],[2,3,4],[5,6,7]])
#b = pd.DataFrame([[1,2,3]]).values
#b = pd.DataFrame([[1,2,3]]).T
#
#a = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9],[10,11,12]])
#b = pd.Series([1,2,3,4])
#(a.T*b).T
#a.multiply(b, axis = 0)
########

# Normalize the intensities with norm_SL and plot the result with both KDE
# helpers (log=True).
df_norm = norm_SL(df_int)
kde_matrix_plot_all_channels(df_norm, log=True, suptitle="log2(norm)")
kde_matrix_plot_batch(
    df_norm,
    log=True,
    suptitle="log2(Norm intensity) (Channel intensities together)",
    legend_size=6)

##########
# COMBAT #
##########

# The fifth level (index 4) of the column MultiIndex supplies one batch label
# per column. Wrapping it in a DataFrame and transposing gives a (1, n_columns)
# array, so batch[0] is the flat label vector handed to pycombat.
batch = pd.DataFrame(df_norm.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 for the ComBat input only; df_norm is left unchanged.
data_corrected = pycombat(df_norm.fillna(0), batch[0])  # ComBat

kde_matrix_plot_all_channels(data_corrected, log=True, suptitle="log2(ComBat)")
kde_matrix_plot_batch(
    data_corrected,
    log=True,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6)

#############
# AITCHISON #
#############

# Boolean-mask indexing keeps the shape: every value that is not strictly
# positive becomes NaN before the Aitchison transform is applied.
data_corrected_filtered = data_corrected[data_corrected > 0]
data_corrected_aitchison = aitchison_transform(
    data_corrected_filtered)  # Aitchison > 0 filtered
示例#7
0
#######
# RAW #
#######

# Plot the unprocessed intensities with both KDE helpers (log=True).
kde_matrix_plot_all_channels(df_int, log=True, suptitle="log2(Raw)")
kde_matrix_plot_batch(
    df_int,
    log=True,
    suptitle="log2(Raw intensity) (Channel intensities together)",
    legend_size=6)

##########
# COMBAT #
##########

# The fifth level (index 4) of the column MultiIndex supplies one batch label
# per column. Wrapping it in a DataFrame and transposing gives a (1, n_columns)
# array, so batch[0] is the flat label vector handed to pycombat.
batch = pd.DataFrame(df_int.columns.get_level_values(4)).T.values
# NaNs are replaced by 0 for the ComBat input only; df_int is left unchanged.
df_combat = pycombat(df_int.fillna(0), batch[0])  # ComBat

kde_matrix_plot_all_channels(df_combat, log=True, suptitle="log2(ComBat)")
# NOTE(review): only this per-batch plot multiplies the ComBat output by
# df_signal; the all-channels plot above uses df_combat as is - confirm that
# the difference is intended.
kde_matrix_plot_batch(
    df_combat * df_signal,
    log=True,
    suptitle="log2(ComBat intensity) (Channel intensities together)",
    legend_size=6)

#############
# AITCHISON #
#############

# Boolean-mask indexing turns every value that is not strictly positive into
# NaN before the Aitchison transform is applied.
data_aitchison = aitchison_transform(df_combat[df_combat > 0])

kde_matrix_plot_all_channels(data_aitchison, log=False, suptitle="Aitchison")