def test_overwrite_ensemble_model_invalid(self):
    """Saving with ``overwrite=False`` must raise when a target file exists.

    For each file name reported by the ``CXPlain.get_*_file_name`` helpers,
    ``TestExplanationModel.make_at_tmp`` prepares a location and
    ``explainer.save(..., overwrite=False)`` is expected to raise
    ``ValueError``. The check runs once before the explainer is fitted and
    once after; the second pass additionally covers one prediction-model
    file name per ensemble member.
    """
    import shutil  # Local import: only needed for the temp-dir cleanup below.

    (x_train, y_train), _ = TestUtil.get_boston_housing()
    model_builder = MLPModelBuilder()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    masking_operation = ZeroMasking()
    loss = binary_crossentropy
    num_models = 5
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                        num_models=num_models)

    # File names that are checked regardless of whether the explainer is fitted.
    base_file_names = [
        CXPlain.get_config_file_name(),
        CXPlain.get_explained_model_file_name(".pkl"),
        CXPlain.get_loss_pkl_file_name(),
        CXPlain.get_model_builder_pkl_file_name(),
        CXPlain.get_masking_operation_pkl_file_name(),
    ]

    def assert_save_rejected(file_names):
        for file_name in file_names:
            # NOTE(review): make_at_tmp presumably returns a fresh temporary
            # directory containing `file_name` - confirm against the helper.
            tmp_dir = TestExplanationModel.make_at_tmp(file_name)
            try:
                with self.assertRaises(ValueError):
                    explainer.save(tmp_dir, overwrite=False)
            finally:
                # Do not leave one temp directory behind per checked file.
                shutil.rmtree(tmp_dir, ignore_errors=True)

    # Test with untrained explanation model.
    assert_save_rejected(base_file_names)

    # Test with trained explanation model.
    explainer.fit(x_train, y_train)
    prediction_model_file_names = [
        CXPlain.get_prediction_model_h5_file_name(i) for i in range(num_models)
    ]
    assert_save_rejected(base_file_names + prediction_model_file_names)
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    """Fit a CXPlain explainer with precomputed masked data and rank pathways.

    Relies on module-level names not defined in this function: ``res_dir``,
    ``pathway_matrix``, ``pathway_names``, ``SEED``,
    ``get_masked_data_for_CXPlain`` and ``get_ranked_list``.

    Side effects (all below ``res_dir + drug``):
      * ``/baselines.csv``  - the baselines returned with the masked data,
      * ``/pathways.csv``   - two pathway rankings (columns 'borda', 'mean'),
      * ``/explainer/``     - the saved CXPlain explainer.

    Args:
        drug: String appended to ``res_dir`` to form the output location.
        model: Model to be explained; passed to the masking helper and CXPlain.
        gdsc_expr: Training inputs; must expose ``.values`` and ``.index``
            (presumably a pandas DataFrame - confirm with caller).
        gdsc_dr: Training targets; must expose ``.values``.
        test_tcga_expr: Samples to attribute; must expose ``.values`` and
            ``.index``.

    Returns:
        None.
    """
    drug_dir = res_dir + drug

    print('obtaining masked data...')
    masked_data, list_of_baselines = get_masked_data_for_CXPlain(
        model, gdsc_expr, pathway_matrix, subtract_mean=True)
    # One row of baselines per training sample.
    lb = np.concatenate(list_of_baselines).reshape(len(gdsc_expr), -1)
    lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
    lb.to_csv(drug_dir + '/baselines.csv')
    print('obtained masked data...')

    # Imports are kept function-local, as in the original code
    # (presumably to defer loading TensorFlow - confirm).
    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)

    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp import CustomMLPModelBuilder

    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2, num_units=512, batch_size=16,
                                          learning_rate=0.001,
                                          n_feature_groups=n_pathways)

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)
    print("Fitting CXPlain model")
    # No masking operation is passed (None): the precomputed masked_data is
    # handed to fit() instead.
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)

    print("Attributing using CXPlain")
    attr, _ = explainer.explain_groups(test_tcga_expr.values)
    print('attr')
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=pathway_names)

    borda = get_ranked_list(attr, k=n_pathways)
    # Pathways ordered by mean absolute attribution across samples.
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(drug_dir + '/pathways.csv', index=False)

    explainer_dir = drug_dir + '/explainer/'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(explainer_dir, exist_ok=True)
    explainer.save(explainer_dir, custom_model_saver=None)
def find_genes_CX(drug, model, meta, gdsc_expr, gdsc_dr, test_tcga_expr, save_dir):
    """Fit a CXPlain explainer with precomputed masked data and rank genes.

    Relies on module-level names not defined in this function: ``SEED``,
    ``dataset`` (its ``hgnc`` attribute labels the attribution columns),
    ``get_masked_data_for_CXPlain`` and ``get_ranked_list``.

    Side effects (all below ``save_dir``):
      * ``/genes.csv``   - two gene rankings (columns 'borda', 'mean'),
      * ``/explainer/``  - the saved CXPlain explainer.

    Args:
        drug: Not used in this function body; kept for interface compatibility.
        model: Model to be explained; passed to the masking helper and CXPlain.
        meta: Not used in this function body; kept for interface compatibility.
        gdsc_expr: Training inputs; must expose ``.values``
            (presumably a pandas DataFrame - confirm with caller).
        gdsc_dr: Training targets; must expose ``.values``.
        test_tcga_expr: Samples to attribute; must expose ``.values`` and
            ``.index``.
        save_dir: Output directory prefix (used via string concatenation).

    Returns:
        None.
    """
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    print('obtaining masked data...')
    masked_data = get_masked_data_for_CXPlain(model, gdsc_expr)
    print('obtained masked data...')

    # Imports are kept function-local, as in the original code
    # (presumably to defer loading TensorFlow - confirm).
    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)

    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import MLPModelBuilder, CXPlain

    model_builder = MLPModelBuilder(num_layers=2, num_units=512, batch_size=8,
                                    learning_rate=0.001)

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)
    print("Fitting CXPlain model")
    # No masking operation is passed (None): the precomputed masked_data is
    # handed to fit() instead.
    explainer = CXPlain(model, model_builder, None, loss)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)

    print("Attributing using CXPlain")
    attr = explainer.explain(test_tcga_expr.values)
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.hgnc)

    borda = get_ranked_list(attr)
    # Top 200 genes by mean absolute attribution across samples.
    # NOTE(review): 200 is hard-coded and must match the length returned by
    # get_ranked_list(attr) for the column assignments below - confirm.
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(200).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(save_dir + '/genes.csv', index=False)

    explainer_dir = save_dir + '/explainer/'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(explainer_dir, exist_ok=True)
    explainer.save(explainer_dir)
def test_boston_housing_load_save_valid(self):
    """Round-trip an explainer through save/load and compare predictions.

    For ensembles of 1 and 2 models the explainer is fitted, saved to a fresh
    temporary directory, saved again with ``overwrite=False`` (which must
    raise ``ValueError``), saved with ``overwrite=True``, and loaded back.
    ``predict`` on the test inputs must return identical arrays before the
    save and after the load.
    """
    (x_train, y_train), (x_test, _) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error

    for num_models in (1, 2):
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            num_models=num_models)
        explainer.fit(x_train, y_train)
        predictions_before_save = explainer.predict(x_test)

        tmp_dir_name = tempfile.mkdtemp()
        try:
            explainer.save(tmp_dir_name)
            # The directory is now populated, so a non-overwriting save must fail.
            with self.assertRaises(ValueError):
                explainer.save(tmp_dir_name, overwrite=False)
            explainer.save(tmp_dir_name, overwrite=True)

            explainer.load(tmp_dir_name)
            predictions_after_load = explainer.predict(x_test)
            self.assertTrue(np.array_equal(predictions_before_save,
                                           predictions_after_load))
        finally:
            # Cleanup, even when one of the assertions above fails.
            shutil.rmtree(tmp_dir_name)
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    """Fit a CXPlain explainer from precalculated omegas and rank pathways.

    Relies on module-level names not defined in this function: ``res_dir``,
    ``pathway_matrix``, ``pathway_names``, ``SEED``, ``precalculate_omegas``
    and ``get_ranked_list``.

    Side effects (all below ``res_dir + drug``):
      * ``/baselines.csv``  - written only when the omega helper returns
        baselines (second return value is not None),
      * ``/pathways.csv``   - two pathway rankings (columns 'borda', 'mean'),
      * ``/explainer/``     - the saved CXPlain explainer.

    Args:
        drug: String appended to ``res_dir`` to form the output location.
        model: Model to be explained; passed to the omega helper and CXPlain.
        gdsc_expr: Training inputs; must expose ``.values`` and ``.index``
            (presumably a pandas DataFrame - confirm with caller).
        gdsc_dr: Training targets; must expose ``.values``.
        test_tcga_expr: Samples to attribute; must expose ``.values`` and
            ``.index``.

    Returns:
        None.
    """
    drug_dir = res_dir + drug

    print('obtaining precalculating omegas...')
    # Named torch_loss so it is not silently rebound by the TensorFlow loss
    # imported further down under the name `loss`.
    torch_loss = torch.nn.MSELoss(reduction='none')
    omegas, lb = precalculate_omegas(model, gdsc_expr, gdsc_dr, pathway_matrix,
                                     torch_loss, mode='delta-scaled')
    if lb is not None:
        lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
        lb.to_csv(drug_dir + '/baselines.csv')
    print('obtained omegas...')

    # Imports are kept function-local, as in the original code
    # (presumably to defer loading TensorFlow - confirm).
    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)

    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp_precalc import CustomMLPModelBuilder

    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2, num_units=512, batch_size=16,
                                          learning_rate=0.001,
                                          n_feature_groups=n_pathways)

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)
    print("Fitting CXPlain model")
    # No masking operation is passed (None): the precalculated omegas are
    # handed to fit_precalc() instead.
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit_precalc(gdsc_expr.values, gdsc_dr.values, omega=omegas)

    print("Attributing using CXPlain")
    attr, _ = explainer.explain_groups(test_tcga_expr.values)
    print('attr')
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=pathway_names)

    borda = get_ranked_list(attr, k=n_pathways)
    # Pathways ordered by mean absolute attribution across samples.
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(drug_dir + '/pathways.csv', index=False)

    explainer_dir = drug_dir + '/explainer/'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(explainer_dir, exist_ok=True)
    explainer.save(explainer_dir, custom_model_saver=None)