def find_genes_CX(drug, test_tcga_expr):
    # Aggregate attributions from ten per-seed CXPlain explainers into three
    # Borda-count consensus rankings.
    tf.keras.backend.clear_session()
    attr_ind = []
    for seed in range(1, 11):
        exp = CXPlain.load('gene_finding/results/CX_ind1/%s/seed%d/explainer' % (drug, seed),
                           relpath=True)
        attr = exp.explain(test_tcga_expr.values)
        attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.hgnc)
        attr_ind.append(attr)

    # One row per (seed, sample) pair, with one column per gene.
    all_attr = pd.DataFrame(columns=['seed', 'sample'] + list(dataset.hgnc))
    i = 0
    for sample in attr_ind[0].index:
        for seed in range(10):
            all_attr.loc[i] = [seed, sample] + list(attr_ind[seed].loc[sample])
            i += 1

    print('boat')
    boat = borda_of_all_tuples(all_attr)
    print('seed then sample')
    seed_then_sample = sample_borda_of_seed_bordas(all_attr, test_tcga_expr.index)
    print('sample then seed')
    sample_then_seed = seed_borda_of_sample_bordas(all_attr, range(10))
    return boat, seed_then_sample, sample_then_seed
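# The three Borda helpers above (borda_of_all_tuples, sample_borda_of_seed_bordas,
# seed_borda_of_sample_bordas) are defined elsewhere in this repository. As a
# minimal sketch of the assumed aggregation, a Borda consensus over all
# (seed, sample) rows could look like the function below; the name, signature
# and tie-breaking are illustrative assumptions, not the repository's code.
def borda_of_all_tuples_sketch(all_attr, gene_columns):
    import pandas as pd
    scores = pd.Series(0.0, index=gene_columns)
    for _, row in all_attr.iterrows():
        # rank() assigns 1 to the smallest |attribution|, so the most important
        # gene in each row contributes the most points to the consensus.
        scores += row[gene_columns].astype(float).abs().rank(method='average')
    # Highest total score first: the consensus ("Borda") gene ranking.
    return list(scores.sort_values(ascending=False).index)

# Usage (hypothetical): borda_of_all_tuples_sketch(all_attr, list(dataset.hgnc))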
def test_imdb_padded_valid(self):
    num_samples = 32
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                             num_subsamples=num_samples)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])
    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
def test_boston_housing_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)
    explainer.fit(x_train, y_train)

    self.assertEqual(explainer.prediction_model.output_shape,
                     (None, np.prod(x_test.shape[1:])))

    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
def test_time_series_valid(self):
    num_samples = 1024
    fixed_length = 99
    (x_train, y_train), (x_test, y_test) = TestUtil.get_random_fixed_length_dataset(
        num_samples=num_samples, fixed_length=fixed_length)

    model_builder = RNNModelBuilder(with_embedding=False, num_layers=2, num_units=32,
                                    activation="relu", p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2, early_stopping_patience=128)

    explained_model = MLPClassifier()
    explained_model.fit(x_train.reshape((-1, np.prod(x_train.shape[1:]))), y_train)

    masking_operation = ZeroMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                        flatten_for_explained_model=True)
    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
def test_mnist_unet_valid(self):
    num_subsamples = 100
    (x_train, y_train), (x_test, y_test) = TestUtil.get_mnist(flattened=False,
                                                              num_subsamples=num_subsamples)

    explained_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                    hidden_layer_sizes=(64, 32), random_state=1)
    explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)

    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
    with_bns = [i % 2 == 0 for i in range(len(downsample_factors))]
    for downsample_factor, with_bn in zip(downsample_factors, with_bns):
        model_builder = UNetModelBuilder(downsample_factor, num_layers=2, num_units=64,
                                         activation="relu", p_dropout=0.2, verbose=0,
                                         batch_size=256, learning_rate=0.001, num_epochs=2,
                                         early_stopping_patience=128, with_bn=with_bn)
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            downsample_factors=downsample_factor,
                            flatten_for_explained_model=True)
        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
def test_mnist_unet_with_shape_valid(self):
    num_subsamples = 100
    (x_train, y_train), (x_test, y_test) = TestUtil.get_mnist(flattened=False,
                                                              num_subsamples=num_subsamples)

    explained_model_builder = MLPModelBuilder(num_layers=2, num_units=64, activation="relu",
                                              p_dropout=0.2, verbose=0, batch_size=256,
                                              learning_rate=0.001, num_epochs=2,
                                              early_stopping_patience=128)
    input_shape = x_train.shape[1:]
    input_layer = Input(shape=input_shape)
    last_layer = Flatten()(input_layer)
    last_layer = explained_model_builder.build(last_layer)
    last_layer = Dense(y_train.shape[-1], activation="softmax")(last_layer)
    explained_model = Model(input_layer, last_layer)
    explained_model.compile(loss="categorical_crossentropy", optimizer="adam")
    explained_model.fit(x_train, y_train)

    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
    with_bns = [i % 2 == 0 for i in range(len(downsample_factors))]
    for downsample_factor, with_bn in zip(downsample_factors, with_bns):
        model_builder = UNetModelBuilder(downsample_factor, num_layers=2, num_units=64,
                                         activation="relu", p_dropout=0.2, verbose=0,
                                         batch_size=256, learning_rate=0.001, num_epochs=2,
                                         early_stopping_patience=128, with_bn=with_bn)
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            downsample_factors=downsample_factor)
        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
def test_mnist_valid(self):
    num_subsamples = 100
    (x_train, y_train), (x_test, y_test) = TestUtil.get_mnist(flattened=False,
                                                              num_subsamples=num_subsamples)

    explained_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                    hidden_layer_sizes=(64, 32), random_state=1)
    explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=64, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=256, learning_rate=0.001, num_epochs=3,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
    for downsample_factor in downsample_factors:
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            num_models=2, downsample_factors=downsample_factor,
                            flatten_for_explained_model=True)
        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median, confidence = explainer.predict(x_test, confidence_level=0.95)
        self.assertTrue(median.shape == x_test.shape)
        self.assertTrue(confidence.shape == x_test.shape[:-1] + (2,))

        # Flatten predictions for iteration below.
        median = median.reshape((len(x_test), -1))
        confidence = confidence.reshape((len(x_test), -1, 2))

        for sample_idx in range(len(x_test)):
            for feature_idx in range(len(x_test[sample_idx])):
                self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                median[sample_idx][feature_idx] <=
                                confidence[sample_idx][feature_idx][1])
                self.assertTrue(confidence[sample_idx][feature_idx][0] >= 0)
                self.assertTrue(confidence[sample_idx][feature_idx][1] >= 0)
def test_mnist_confidence_levels_valid(self):
    num_subsamples = 100
    (x_train, y_train), (x_test, y_test) = TestUtil.get_mnist(flattened=False,
                                                              num_subsamples=num_subsamples)

    explained_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                    hidden_layer_sizes=(64, 32), random_state=1)
    explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=64, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=256, learning_rate=0.001, num_epochs=3,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    # All of these confidence levels fall outside the valid open interval (0, 1).
    confidence_levels = [0.0, 1.0, 1.01, -0.01]
    for confidence_level in confidence_levels:
        downsample_factor = (2, 2)
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            num_models=2, downsample_factors=downsample_factor,
                            flatten_for_explained_model=True)
        explainer.fit(x_train, y_train)
        with self.assertRaises(ValueError):
            _ = explainer.predict(x_test, confidence_level=confidence_level)
def test_boston_housing_no_fit_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    # Predicting or scoring before fit() must fail.
    with self.assertRaises(AssertionError):
        explainer.predict(x_test, y_test)

    with self.assertRaises(AssertionError):
        explainer.score(x_test, y_test)
def test_boston_housing_confidence_level_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=3,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    num_models = 2
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                        num_models=num_models)
    explainer.fit(x_train, y_train)

    invalid_confidence_levels = [1.01, -0.5, -0.01]
    for confidence_level in invalid_confidence_levels:
        with self.assertRaises(ValueError):
            explainer.predict(x_test, confidence_level=confidence_level)
def test_nlp_not_padded_invalid(self):
    num_words = 1024
    (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    # Variable-length sequences that have not been padded must be rejected.
    with self.assertRaises(ValueError):
        explainer.fit(x_train, y_train)
def test_boston_housing_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=3,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error

    for num_models in [2, 5, 10]:
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            num_models=num_models)
        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median, confidence = explainer.predict(x_test, confidence_level=0.95)
        self.assertTrue(median.shape == x_test.shape)
        self.assertTrue(confidence.shape == x_test.shape + (2,))

        # Flatten predictions for iteration below.
        median = median.reshape((len(x_test), -1))
        confidence = confidence.reshape((len(x_test), -1, 2))

        for sample_idx in range(len(x_test)):
            for feature_idx in range(len(x_test[sample_idx])):
                self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                median[sample_idx][feature_idx] <=
                                confidence[sample_idx][feature_idx][1])
                self.assertTrue(confidence[sample_idx][feature_idx][0] >= 0)
                self.assertTrue(confidence[sample_idx][feature_idx][1] >= 0)
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    # Build the CXPlain training targets by masking one pathway at a time.
    print('obtaining masked data...')
    masked_data, list_of_baselines = get_masked_data_for_CXPlain(model, gdsc_expr,
                                                                 pathway_matrix,
                                                                 subtract_mean=True)
    lb = np.concatenate(list_of_baselines).reshape(len(gdsc_expr), -1)
    lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
    lb.to_csv(res_dir + drug + '/baselines.csv')
    print('obtained masked data...')

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp import CustomMLPModelBuilder
    # from cxplain.backend.masking.zero_masking import FastZeroMasking

    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2, num_units=512, batch_size=16,
                                          learning_rate=0.001, n_feature_groups=n_pathways)
    # masking_operation = FastZeroMasking()
    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    # Masked data is precomputed above, so no masking operation is passed in.
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)

    print("Attributing using CXPlain")
    attr, _ = explainer.explain_groups(test_tcga_expr.values)
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=pathway_names)

    # Rank pathways by Borda aggregation and by mean absolute attribution.
    borda = get_ranked_list(attr, k=n_pathways)
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(res_dir + drug + '/pathways.csv', index=False)

    if not os.path.exists(res_dir + drug + '/explainer/'):
        os.mkdir(res_dir + drug + '/explainer/')
    explainer.save(res_dir + drug + '/explainer/', custom_model_saver=None)
def find_genes_CX(drug, model, meta, gdsc_expr, gdsc_dr, test_tcga_expr, save_dir):
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    print('obtaining masked data...')
    masked_data = get_masked_data_for_CXPlain(model, gdsc_expr)
    print('obtained masked data...')

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import MLPModelBuilder, CXPlain
    # from cxplain.backend.masking.zero_masking import FastZeroMasking

    model_builder = MLPModelBuilder(num_layers=2, num_units=512, batch_size=8,
                                    learning_rate=0.001)
    # masking_operation = FastZeroMasking()
    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    # Masked data is precomputed above, so no masking operation is passed in.
    explainer = CXPlain(model, model_builder, None, loss)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)

    print("Attributing using CXPlain")
    attr = explainer.explain(test_tcga_expr.values)
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.hgnc)

    # Rank genes by Borda aggregation and by mean absolute attribution (top 200).
    borda = get_ranked_list(attr)
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(200).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(save_dir + '/genes.csv', index=False)

    if not os.path.exists(save_dir + '/explainer/'):
        os.mkdir(save_dir + '/explainer/')
    explainer.save(save_dir + '/explainer/')
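# get_masked_data_for_CXPlain is defined elsewhere in this repository. A
# minimal sketch of what it is assumed to do in the per-gene case: zero-mask
# one feature at a time and record the PyTorch model's output for every masked
# copy, yielding the (inputs, predictions, masked predictions) triple that this
# fork's CXPlain.fit(..., masked_data=...) consumes. The name, signature and
# return layout here are illustrative assumptions.
def get_masked_data_for_CXPlain_sketch(model, expr_df):
    import numpy as np
    import torch
    x = expr_df.values.astype(np.float32)
    with torch.no_grad():
        y_pred = model(torch.from_numpy(x)).numpy()
        masked_outs = []
        for j in range(x.shape[1]):
            x_masked = x.copy()
            x_masked[:, j] = 0.0  # zero-mask gene j for all samples
            masked_outs.append(model(torch.from_numpy(x_masked)).numpy())
    # Masked predictions stacked to shape (num_samples, num_genes, output_dim).
    return x, y_pred, np.stack(masked_outs, axis=1)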
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    print('precalculating omegas...')
    loss = torch.nn.MSELoss(reduction='none')
    # Other modes that were tried: default, 'scaled-difference', 'delta-of-delta',
    # and a precalculate_omegas_scaled variant.
    omegas, lb = precalculate_omegas(model, gdsc_expr, gdsc_dr, pathway_matrix, loss,
                                     mode='delta-scaled')
    if lb is not None:
        lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
        lb.to_csv(res_dir + drug + '/baselines.csv')
    print('obtained omegas...')

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    # Note: this import shadows the torch loss above, which is no longer needed.
    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp_precalc import CustomMLPModelBuilder
    # from cxplain.backend.masking.zero_masking import FastZeroMasking

    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2, num_units=512, batch_size=16,
                                          learning_rate=0.001, n_feature_groups=n_pathways)
    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit_precalc(gdsc_expr.values, gdsc_dr.values, omega=omegas)

    print("Attributing using CXPlain")
    attr, _ = explainer.explain_groups(test_tcga_expr.values)
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=pathway_names)

    borda = get_ranked_list(attr, k=n_pathways)
    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean
    out.to_csv(res_dir + drug + '/pathways.csv', index=False)

    if not os.path.exists(res_dir + drug + '/explainer/'):
        os.mkdir(res_dir + drug + '/explainer/')
    explainer.save(res_dir + drug + '/explainer/', custom_model_saver=None)
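# precalculate_omegas is defined elsewhere. The sketch below shows the CXPlain
# target it is assumed to precompute: mask each pathway's genes, measure the
# per-sample increase in loss, and normalise the increases to sum to one per
# sample (one plausible reading of mode='delta-scaled'). The name, the mode
# semantics and the (omegas, baselines) return value are assumptions.
def precalculate_omegas_sketch(model, gdsc_expr, gdsc_dr, pathway_matrix, loss_fn):
    import numpy as np
    import torch
    x = torch.from_numpy(gdsc_expr.values.astype(np.float32))
    y = torch.from_numpy(gdsc_dr.values.astype(np.float32))
    with torch.no_grad():
        base_loss = loss_fn(model(x).squeeze(-1), y)  # per-sample loss, reduction='none'
        deltas = []
        for group in pathway_matrix:  # one binary gene-membership row per pathway
            x_masked = x.clone()
            x_masked[:, torch.from_numpy(group.astype(bool))] = 0.0
            deltas.append(loss_fn(model(x_masked).squeeze(-1), y) - base_loss)
    # Keep only loss increases, then normalise each sample's row to sum to 1.
    delta = torch.clamp(torch.stack(deltas, dim=1), min=0.0)
    omegas = delta / delta.sum(dim=1, keepdim=True).clamp(min=1e-12)
    return omegas.numpy(), None  # no baselines in this sketch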
            model(masked_sample).unsqueeze(0).detach().numpy())
    masked_outs = np.concatenate(list_of_masked_outs)
    return (x, y_pred, masked_outs)


model_builder = CustomMLPModelBuilder(num_layers=2, num_units=32, batch_size=32,
                                      learning_rate=0.001, n_feature_groups=10)
# masking_operation = ZeroMasking()
# masking_operation = FastZeroMasking()

# Sanity check on synthetic data: fit on the first 200 samples, explain the rest.
k = get_masked_data_for_CXPlain(model, x[:200])
explainer = CXPlain(model, model_builder, None, loss)  # , downsample_factors=(5,))
explainer.fit(x[:200], y[:200], masked_data=k)
attributions = explainer.explain_groups(x[200:])
attr = pd.DataFrame(attributions, index=range(200, 300))
# plt.plot(range(50), np.abs(attr).mean(axis=0), label='attr')
# plt.plot(range(50), np.abs(mult).mean(axis=0), label='mult')
# plt.show()
print(attr.shape)
print(attr.loc[200])

# Pick a sample where feature 20 is near zero and feature 40 is large.
df = pd.DataFrame(x[200:], index=range(200, 300))
idx = df.loc[np.abs(df[20]) < 0.1].loc[np.abs(df[40]) > 1].index[0]
# plt.plot(range(50), np.abs(attr.loc[idx]), label='attr')
def test_boston_housing_load_save_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)

    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error

    num_models_settings = [1, 2]
    for num_models in num_models_settings:
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                            num_models=num_models)
        explainer.fit(x_train, y_train)
        median_1 = explainer.predict(x_test)

        tmp_dir_name = tempfile.mkdtemp()
        explainer.save(tmp_dir_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir_name, overwrite=False)
        explainer.save(tmp_dir_name, overwrite=True)
        explainer.load(tmp_dir_name)
        median_2 = explainer.predict(x_test)

        # A round trip through save/load must not change predictions.
        self.assertTrue(np.array_equal(median_1, median_2))

        shutil.rmtree(tmp_dir_name)  # Cleanup.
def test_overwrite_ensemble_model_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    model_builder = MLPModelBuilder()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    masking_operation = ZeroMasking()
    loss = binary_crossentropy

    num_models = 5
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                        num_models=num_models)

    file_names = [
        CXPlain.get_config_file_name(),
        CXPlain.get_explained_model_file_name(".pkl"),
        CXPlain.get_loss_pkl_file_name(),
        CXPlain.get_model_builder_pkl_file_name(),
        CXPlain.get_masking_operation_pkl_file_name()
    ]

    # Test with untrained explanation model.
    for file_name in file_names:
        tmp_dir = TestExplanationModel.make_at_tmp(file_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir, overwrite=False)

    # Test with trained explanation model: the per-member prediction model
    # files must also block a non-overwriting save.
    explainer.fit(x_train, y_train)
    file_names += [CXPlain.get_prediction_model_h5_file_name(i) for i in range(num_models)]
    for file_name in file_names:
        tmp_dir = TestExplanationModel.make_at_tmp(file_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir, overwrite=False)
# _, _, _, test_tcga_expr = dataset.filter_and_normalize_data(drug)
# exp = CXPlain.load('gene_finding/results/%s/%s/explainer' % (folder, drug),
#                    custom_model_loader=None, relpath=True)
# attr, _ = exp.explain(test_tcga_expr.values)
# attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.genes)
# attr_dict[drug] = attr

attr_dict = {}
conf_dict = {}
for i, drug in enumerate(drugs):
    print(drug)
    _, _, _, test_tcga_expr = dataset.filter_and_normalize_data(drug)
    # Average attributions over the ten per-seed explainers.
    attr_all = np.zeros((len(test_tcga_expr.index), len(dataset.genes)))
    for seed in range(1, 11):
        exp = CXPlain.load('gene_finding/results/%s/%s/seed%d/explainer' % (folder, drug, seed),
                           custom_model_loader=None, relpath=True)
        attr = exp.explain(test_tcga_expr.values)
        attr_all += attr
    attr = pd.DataFrame(attr_all / 10.0, index=test_tcga_expr.index, columns=dataset.genes)
    attr_dict[drug] = attr

fig, axes = plt.subplots(7, 2, figsize=(14, 35))
writer_a = pd.ExcelWriter('gene_finding/results/%s/top_genes_mean_of_means_aggregation.xlsx' % folder,
                          engine='xlsxwriter')
conv = pd.DataFrame(index=dataset.genes, columns=['hgnc'])
conv['hgnc'] = dataset.hgnc
for i, drug in enumerate(drugs):
def cxpl(model_dir, data_dir, results_subdir, random_seed, resolution):
    np.random.seed(random_seed)
    tf.set_random_seed(np.random.randint(1 << 31))
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    set_session(sess)

    # Parser config.
    config_file = model_dir + "/config.ini"
    print("Config File Path:", config_file, flush=True)
    assert os.path.isfile(config_file)
    cp = ConfigParser()
    cp.read(config_file)

    output_dir = os.path.join(results_subdir, "classification_results/test")
    print("Output Directory:", output_dir, flush=True)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Default config.
    image_dimension = cp["TRAIN"].getint("image_dimension")
    gan_resolution = resolution
    batch_size = cp["TEST"].getint("batch_size")
    use_best_weights = cp["TEST"].getboolean("use_best_weights")

    if use_best_weights:
        print("** Using BEST weights", flush=True)
        model_weights_path = os.path.join(results_subdir,
                                          "classification_results/train/best_weights.h5")
    else:
        print("** Using LAST weights", flush=True)
        model_weights_path = os.path.join(results_subdir,
                                          "classification_results/train/weights.h5")

    print("** DenseNet Input Resolution:", image_dimension, flush=True)
    print("** GAN Image Resolution:", gan_resolution, flush=True)

    # Get test sample count.
    test_dir = os.path.join(results_subdir, "inference/test")
    shutil.copy(test_dir + "/test.csv", output_dir)

    # Get class names.
    class_names = get_class_names(output_dir, "test")
    tfrecord_dir_te = os.path.join(data_dir, "test")
    test_counts, _ = get_sample_counts(output_dir, "test", class_names)

    # Get indices (all of csv file for validation).
    print("** test counts:", test_counts, flush=True)

    # Compute steps.
    test_steps = int(np.floor(test_counts / batch_size))
    print("** test_steps:", test_steps, flush=True)

    log2_record = int(np.log2(gan_resolution))
    record_file_ending = "*" + str(log2_record) + ".tfrecords"
    print("** resolution ", gan_resolution, " corresponds to ", record_file_ending,
          " TFRecord file.", flush=True)

    # Get model.
    # ------------------------------------
    input_shape = (image_dimension, image_dimension, 3)
    img_input = Input(shape=input_shape)
    base_model = DenseNet121(include_top=False, weights=None, input_tensor=img_input,
                             input_shape=input_shape, pooling="avg")
    x = base_model.output
    predictions = Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = Model(inputs=img_input, outputs=predictions)

    print(" ** load model from:", model_weights_path, flush=True)
    model.load_weights(model_weights_path)
    # ------------------------------------

    print("** load test generator **", flush=True)
    test_seq = TFWrapper(
        tfrecord_dir=tfrecord_dir_te,
        record_file_endings=record_file_ending,
        batch_size=batch_size,
        model_target_size=(image_dimension, image_dimension),
        steps=None,
        augment=False,
        shuffle=False,
        prefetch=True,
        repeat=False)

    print("** make prediction **", flush=True)
    test_seq.initialise()
    x_all, y_all = test_seq.get_all_test_data()
    print("X-Test Shape:", x_all.shape, flush=True)
    print("Y-Test Shape:", y_all.shape, flush=True)

    print("----------------------------------------", flush=True)
    print("Test Model AUROC", flush=True)
    y_pred = model.predict(x_all)
    current_auroc = []
    for i in range(len(class_names)):
        try:
            score = roc_auc_score(y_all[:, i], y_pred[:, i])
        except ValueError:
            score = 0
        current_auroc.append(score)
        print(i + 1, class_names[i], ": ", score, flush=True)
    mean_auroc = np.mean(current_auroc)
    print("Mean auroc: ", mean_auroc, flush=True)
    print("----------------------------------------", flush=True)

    downscale_factor = 8
    num_models_to_use = 3
    num_test_images = 100
    print("Number of Models to use:", num_models_to_use, flush=True)
    print("Number of Test images:", num_test_images, flush=True)

    # Hold out the first num_test_images images for explanation; fit on the rest.
    x_tr, y_tr = x_all[num_test_images:], y_all[num_test_images:]
    x_te, y_te = x_all[0:num_test_images], y_all[0:num_test_images]
    downsample_factors = (downscale_factor, downscale_factor)
    print("Downsample Factors:", downsample_factors, flush=True)

    model_builder = UNetModelBuilder(downsample_factors, num_layers=2, num_units=8,
                                     activation="relu", p_dropout=0.0, verbose=0,
                                     batch_size=32, learning_rate=0.001)
    print("Model build done.", flush=True)
    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    explainer = CXPlain(model, model_builder, masking_operation, loss,
                        num_models=num_models_to_use, downsample_factors=downsample_factors,
                        flatten_for_explained_model=False)
    print("Explainer build done.", flush=True)
    explainer.fit(x_tr, y_tr)
    print("Explainer fit done.", flush=True)

    try:
        attr, conf = explainer.explain(x_te, confidence_level=0.80)
        np.save(output_dir + "/x_cxpl.npy", x_te)
        np.save(output_dir + "/y_cxpl.npy", y_te)
        np.save(output_dir + "/attr.npy", attr)
        np.save(output_dir + "/conf.npy", conf)
        print("Explainer explain done and saved.", flush=True)
    except Exception as ef:
        print(ef, flush=True)
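# Example invocation of cxpl (all paths are illustrative placeholders, not
# paths from this repository):
# cxpl(model_dir="models/densenet121", data_dir="data/tfrecords",
#      results_subdir="results/seed42", random_seed=42, resolution=256)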