Example No. 1
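Aggregates per-sample CXPlain gene attributions for a drug across ten seeded explainers and combines them with three Borda-count variants; the Borda helpers (borda_of_all_tuples, sample_borda_of_seed_bordas, seed_borda_of_sample_bordas) and the dataset object are presumably defined elsewhere in the module.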
def find_genes_CX(drug, test_tcga_expr):
    tf.keras.backend.clear_session()
    attr_ind = []
    for seed in range(1, 11):
        exp = CXPlain.load('gene_finding/results/CX_ind1/%s/seed%d/explainer' %
                           (drug, seed),
                           relpath=True)
        attr = exp.explain(test_tcga_expr.values)
        attr = pd.DataFrame(attr,
                            index=test_tcga_expr.index,
                            columns=dataset.hgnc)
        attr_ind.append(attr)

    all_attr = pd.DataFrame(columns=['seed', 'sample'] + list(dataset.hgnc))

    i = 0
    for sample in attr_ind[0].index:
        for seed in range(10):
            all_attr.loc[i] = [seed, sample] + list(attr_ind[seed].loc[sample])
            i += 1

    print('boat')
    boat = borda_of_all_tuples(all_attr)
    print('seed then sample')
    seed_then_sample = sample_borda_of_seed_bordas(all_attr,
                                                   test_tcga_expr.index)
    print('sample then seed')
    sample_then_seed = seed_borda_of_sample_bordas(all_attr, range(10))

    return boat, seed_then_sample, sample_then_seed
Example No. 2
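Unit test: fits a CXPlain explainer with an RNN surrogate around a scikit-learn pipeline (count vectoriser, TF-IDF, random forest) on padded IMDB sequences, then checks that the attribution array matches the input shape.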
    def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Example No. 3
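Unit test: the minimal CXPlain regression setup on the Boston housing data, using an MLP surrogate, zero masking, and mean squared error loss.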
    def test_boston_housing_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error
        explainer = CXPlain(explained_model, model_builder, masking_operation,
                            loss)

        explainer.fit(x_train, y_train)
        self.assertEqual(explainer.prediction_model.output_shape,
                         (None, np.prod(x_test.shape[1:])))

        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
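For reference, a minimal sketch of the imports these unit-test snippets appear to rely on; the cxplain module paths are taken from the snippets themselves, while the TestUtil location is an assumption based on the cxplain test suite:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tensorflow.python.keras.losses import mean_squared_error
from cxplain import CXPlain, MLPModelBuilder  # top-level exports, as in Example No. 14
from cxplain.backend.masking.zero_masking import ZeroMasking  # module path as in the commented imports in later examples
from cxplain.util.test_util import TestUtil  # assumed location of the shared test helper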
Example No. 4
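Unit test: explains an MLPClassifier on fixed-length sequence data with an RNN surrogate; flatten_for_explained_model=True flattens each sample before it is passed to the explained model.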
    def test_time_series_valid(self):
        num_samples = 1024
        fixed_length = 99
        (x_train,
         y_train), (x_test, y_test) = TestUtil.get_random_fixed_length_dataset(
             num_samples=num_samples, fixed_length=fixed_length)

        model_builder = RNNModelBuilder(with_embedding=False,
                                        num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)

        explained_model = MLPClassifier()
        explained_model.fit(x_train.reshape((-1, np.prod(x_train.shape[1:]))),
                            y_train)

        masking_operation = ZeroMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model,
                            model_builder,
                            masking_operation,
                            loss,
                            flatten_for_explained_model=True)

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Example No. 5
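Unit test: sweeps several U-Net surrogate configurations (downsample factors, alternating batch normalisation) on a subsample of MNIST.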
    def test_mnist_unet_valid(self):
        num_subsamples = 100
        (x_train, y_train), (x_test, y_test) = TestUtil.get_mnist(flattened=False, num_subsamples=num_subsamples)

        explained_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                        hidden_layer_sizes=(64, 32), random_state=1)
        explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)
        masking_operation = ZeroMasking()
        loss = categorical_crossentropy

        downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
        with_bns = [i % 2 == 0 for i in range(len(downsample_factors))]
        for downsample_factor, with_bn in zip(downsample_factors, with_bns):
            model_builder = UNetModelBuilder(downsample_factor, num_layers=2, num_units=64, activation="relu",
                                             p_dropout=0.2, verbose=0, batch_size=256, learning_rate=0.001,
                                             num_epochs=2, early_stopping_patience=128, with_bn=with_bn)

            explainer = CXPlain(explained_model, model_builder, masking_operation, loss,
                                downsample_factors=downsample_factor, flatten_for_explained_model=True)

            explainer.fit(x_train, y_train)
            eval_score = explainer.score(x_test, y_test)
            train_score = explainer.get_last_fit_score()
            median = explainer.predict(x_test)
            self.assertTrue(median.shape == x_test.shape)
Example No. 6
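Unit test: the same U-Net sweep, but the explained model is a Keras network (Input, Flatten, MLP stack, softmax Dense) instead of a scikit-learn classifier.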
    def test_mnist_unet_with_shape_valid(self):
        num_subsamples = 100
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_mnist(flattened=False,
                                                 num_subsamples=num_subsamples)

        explained_model_builder = MLPModelBuilder(num_layers=2,
                                                  num_units=64,
                                                  activation="relu",
                                                  p_dropout=0.2,
                                                  verbose=0,
                                                  batch_size=256,
                                                  learning_rate=0.001,
                                                  num_epochs=2,
                                                  early_stopping_patience=128)
        input_shape = x_train.shape[1:]
        input_layer = Input(shape=input_shape)
        last_layer = Flatten()(input_layer)
        last_layer = explained_model_builder.build(last_layer)
        last_layer = Dense(y_train.shape[-1], activation="softmax")(last_layer)
        explained_model = Model(input_layer, last_layer)
        explained_model.compile(loss="categorical_crossentropy",
                                optimizer="adam")
        explained_model.fit(x_train, y_train)
        masking_operation = ZeroMasking()
        loss = categorical_crossentropy

        downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
        with_bns = [i % 2 == 0 for i in range(len(downsample_factors))]
        for downsample_factor, with_bn in zip(downsample_factors, with_bns):
            model_builder = UNetModelBuilder(downsample_factor,
                                             num_layers=2,
                                             num_units=64,
                                             activation="relu",
                                             p_dropout=0.2,
                                             verbose=0,
                                             batch_size=256,
                                             learning_rate=0.001,
                                             num_epochs=2,
                                             early_stopping_patience=128,
                                             with_bn=with_bn)

            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                downsample_factors=downsample_factor)

            explainer.fit(x_train, y_train)
            eval_score = explainer.score(x_test, y_test)
            train_score = explainer.get_last_fit_score()
            median = explainer.predict(x_test)
            self.assertTrue(median.shape == x_test.shape)
Example No. 7
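Unit test: a two-model CXPlain ensemble on MNIST; verifies that the median attributions lie within the returned 95% confidence bounds and that the bounds are non-negative.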
    def test_mnist_valid(self):
        num_subsamples = 100
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_mnist(flattened=False,
                                                 num_subsamples=num_subsamples)

        explained_model = MLPClassifier(solver='lbfgs',
                                        alpha=1e-5,
                                        hidden_layer_sizes=(64, 32),
                                        random_state=1)
        explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=64,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=256,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = categorical_crossentropy

        downsample_factors = [(2, 2), (4, 4), (4, 7), (7, 4), (7, 7)]
        for downsample_factor in downsample_factors:
            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                num_models=2,
                                downsample_factors=downsample_factor,
                                flatten_for_explained_model=True)

            explainer.fit(x_train, y_train)
            eval_score = explainer.score(x_test, y_test)
            train_score = explainer.get_last_fit_score()
            median, confidence = explainer.predict(x_test,
                                                   confidence_level=0.95)

            self.assertTrue(median.shape == x_test.shape)
            self.assertTrue(confidence.shape == x_test.shape[:-1] + (2, ))

            # Flatten predictions for iteration below.
            median = median.reshape((len(x_test), -1))
            confidence = confidence.reshape((len(x_test), -1, 2))

            for sample_idx in range(len(x_test)):
                for feature_idx in range(len(x_test[sample_idx])):
                    self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                    median[sample_idx][feature_idx] <=
                                    confidence[sample_idx][feature_idx][1])
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][0] >= 0)
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][1] >= 0)
Example No. 8
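Unit test: predict must raise ValueError for confidence levels at or beyond the boundaries of the unit interval (0.0, 1.0, 1.01, -0.01).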
    def test_mnist_confidence_levels_valid(self):
        num_subsamples = 100
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_mnist(flattened=False,
                                                 num_subsamples=num_subsamples)

        explained_model = MLPClassifier(solver='lbfgs',
                                        alpha=1e-5,
                                        hidden_layer_sizes=(64, 32),
                                        random_state=1)
        explained_model.fit(x_train.reshape((len(x_train), -1)), y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=64,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=256,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = categorical_crossentropy

        confidence_levels = [0.0, 1.0, 1.01, -0.01]
        for confidence_level in confidence_levels:
            downsample_factor = (2, 2)
            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                num_models=2,
                                downsample_factors=downsample_factor,
                                flatten_for_explained_model=True)

            explainer.fit(x_train, y_train)

            with self.assertRaises(ValueError):
                _ = explainer.predict(x_test,
                                      confidence_level=confidence_level)
Example No. 9
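Unit test: calling predict or score on an explainer that has not been fitted raises AssertionError.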
    def test_boston_housing_no_fit_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error
        explainer = CXPlain(explained_model, model_builder, masking_operation,
                            loss)

        with self.assertRaises(AssertionError):
            explainer.predict(x_test, y_test)

        with self.assertRaises(AssertionError):
            explainer.score(x_test, y_test)
Example No. 10
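Unit test: after fitting, out-of-range confidence levels passed to predict raise ValueError.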
    def test_boston_housing_confidence_level_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        num_models = 2
        explainer = CXPlain(explained_model,
                            model_builder,
                            masking_operation,
                            loss,
                            num_models=num_models)

        explainer.fit(x_train, y_train)

        invalid_confidence_levels = [1.01, -0.5, -0.01]

        for confidence_level in invalid_confidence_levels:
            with self.assertRaises(ValueError):
                explainer.predict(x_test, confidence_level=confidence_level)
Example No. 11
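Unit test: fitting on variable-length, un-padded token sequences raises ValueError.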
    def test_nlp_not_padded_invalid(self):
        num_words = 1024
        (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        with self.assertRaises(ValueError):
            explainer.fit(x_train, y_train)
Example No. 12
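Unit test: the Boston housing setup repeated for ensembles of 2, 5 and 10 surrogate models, with the same median and confidence-interval checks as the MNIST ensemble test above.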
    def test_boston_housing_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=3,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        for num_models in [2, 5, 10]:
            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                num_models=num_models)

            explainer.fit(x_train, y_train)
            eval_score = explainer.score(x_test, y_test)
            train_score = explainer.get_last_fit_score()
            median, confidence = explainer.predict(x_test,
                                                   confidence_level=0.95)

            self.assertTrue(median.shape == x_test.shape)
            self.assertTrue(confidence.shape == x_test.shape + (2, ))

            # Flatten predictions for iteration below.
            median = median.reshape((len(x_test), -1))
            confidence = confidence.reshape((len(x_test), -1, 2))

            for sample_idx in range(len(x_test)):
                for feature_idx in range(len(x_test[sample_idx])):
                    self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                    median[sample_idx][feature_idx] <=
                                    confidence[sample_idx][feature_idx][1])
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][0] >= 0)
                    self.assertTrue(
                        confidence[sample_idx][feature_idx][1] >= 0)
Example No. 13
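Fits CXPlain with a custom grouped MLP surrogate on precomputed masked data and attributes pathway groups on TCGA expression; get_masked_data_for_CXPlain, pathway_matrix, pathway_names, get_ranked_list and res_dir are presumably defined elsewhere in the module.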
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    print('obtaining masked data...')
    # masked_data, list_of_baselines = get_masked_data_for_CXPlain(model, gdsc_expr, pathway_matrix)
    masked_data, list_of_baselines = get_masked_data_for_CXPlain(model, gdsc_expr, pathway_matrix, subtract_mean=True)
    lb = np.concatenate(list_of_baselines).reshape(len(gdsc_expr), -1)
    lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
    lb.to_csv(res_dir + drug + '/baselines.csv')
    # print(lb.shape)
    # exit()
    print('obtained masked data...')

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp import CustomMLPModelBuilder
    # from cxplain.backend.masking.zero_masking import FastZeroMasking
    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2, num_units=512, batch_size=16, learning_rate=0.001, n_feature_groups=n_pathways)
    # masking_operation = FastZeroMasking()

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)
    print("Attributing using CXPlain")

    attr,_ = explainer.explain_groups(test_tcga_expr.values)
    print('attr')

    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=pathway_names)
    borda = get_ranked_list(attr, k=n_pathways)

    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda 
    out['mean'] = attr_mean

    out.to_csv(res_dir + drug + '/pathways.csv', index=False)

    if not os.path.exists(res_dir + drug + '/explainer/'):
        os.mkdir(res_dir + drug + '/explainer/')

    explainer.save(res_dir + drug + '/explainer/', custom_model_saver=None)
Example No. 14
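As above, but with the stock MLPModelBuilder and per-gene attributions; Borda and mean-based gene rankings are written to genes.csv under save_dir.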
def find_genes_CX(drug, model, meta, gdsc_expr, gdsc_dr, test_tcga_expr,
                  save_dir):
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    print('obtaining masked data...')
    masked_data = get_masked_data_for_CXPlain(model, gdsc_expr)
    print('obtained masked data...')
    # get_masked_data_for_CXPlain(model, test_tcga_expr)

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)

    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import MLPModelBuilder, CXPlain
    # from cxplain.backend.masking.zero_masking import FastZeroMasking
    model_builder = MLPModelBuilder(num_layers=2,
                                    num_units=512,
                                    batch_size=8,
                                    learning_rate=0.001)
    # masking_operation = FastZeroMasking()

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    explainer = CXPlain(model, model_builder, None, loss)
    explainer.fit(gdsc_expr.values, gdsc_dr.values, masked_data=masked_data)
    print("Attributing using CXPlain")

    attr = explainer.explain(test_tcga_expr.values)
    attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.hgnc)
    borda = get_ranked_list(attr)

    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(200).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean

    out.to_csv(save_dir + '/genes.csv', index=False)

    if not os.path.exists(save_dir + '/explainer/'):
        os.mkdir(save_dir + '/explainer/')

    explainer.save(save_dir + '/explainer/')
Example No. 15
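Variant that precalculates the omega targets outside CXPlain and trains the surrogate with fit_precalc, a method of this project's CXPlain fork rather than the published package.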
def find_genes_CX(drug, model, gdsc_expr, gdsc_dr, test_tcga_expr):
    print('precalculating omegas...')
    loss = torch.nn.MSELoss(reduction='none')
    # idx  = gdsc_expr.index[:10]
    # gdsc_expr = gdsc_expr.loc[idx]
    # gdsc_dr = gdsc_dr[idx]
    # omegas, lb = precalculate_omegas(model, gdsc_expr, gdsc_dr, pathway_matrix, loss)
    # omegas, lb = precalculate_omegas_scaled(model, gdsc_expr, gdsc_dr, pathway_matrix, loss)
    # omegas, lb = precalculate_omegas(model, gdsc_expr, gdsc_dr, pathway_matrix, loss,mode='scaled-difference')
    # omegas, lb = precalculate_omegas(model, gdsc_expr, gdsc_dr, pathway_matrix, loss, mode='delta-of-delta')
    omegas, lb = precalculate_omegas(model,
                                     gdsc_expr,
                                     gdsc_dr,
                                     pathway_matrix,
                                     loss,
                                     mode='delta-scaled')

    if lb is not None:
        lb = pd.DataFrame(lb, index=gdsc_expr.index, columns=pathway_names)
        lb.to_csv(res_dir + drug + '/baselines.csv')
    print('precalculated omegas...')

    import tensorflow as tf
    tf.compat.v1.disable_v2_behavior()
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    from tensorflow.python.keras.losses import mean_squared_error as loss
    from cxplain import CXPlain
    from cxplain.backend.model_builders.custom_mlp_precalc import CustomMLPModelBuilder
    # from cxplain.backend.masking.zero_masking import FastZeroMasking
    n_pathways = len(pathway_names)
    model_builder = CustomMLPModelBuilder(num_layers=2,
                                          num_units=512,
                                          batch_size=16,
                                          learning_rate=0.001,
                                          n_feature_groups=n_pathways)
    # masking_operation = FastZeroMasking()

    print(gdsc_expr.values.shape, gdsc_dr.values.shape)

    print("Fitting CXPlain model")
    explainer = CXPlain(model, model_builder, None, loss, num_models=3)
    explainer.fit_precalc(gdsc_expr.values, gdsc_dr.values, omega=omegas)
    print("Attributing using CXPlain")

    attr, _ = explainer.explain_groups(test_tcga_expr.values)
    print('attr')

    attr = pd.DataFrame(attr,
                        index=test_tcga_expr.index,
                        columns=pathway_names)
    borda = get_ranked_list(attr, k=n_pathways)

    attr_mean = list(np.abs(attr).mean(axis=0).nlargest(n_pathways).index)
    out = pd.DataFrame(columns=['borda', 'mean'])
    out['borda'] = borda
    out['mean'] = attr_mean

    out.to_csv(res_dir + drug + '/pathways.csv', index=False)

    if not os.path.exists(res_dir + drug + '/explainer/'):
        os.mkdir(res_dir + drug + '/explainer/')

    explainer.save(res_dir + drug + '/explainer/', custom_model_saver=None)
Example No. 16
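A fragment, truncated at the top: the tail of a masked-data helper followed by a script that fits a grouped CXPlain surrogate on synthetic data and inspects the resulting attributions.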
            model(masked_sample).unsqueeze(0).detach().numpy())

    masked_outs = np.concatenate(list_of_masked_outs)
    return (x, y_pred, masked_outs)


model_builder = CustomMLPModelBuilder(num_layers=2,
                                      num_units=32,
                                      batch_size=32,
                                      learning_rate=0.001,
                                      n_feature_groups=10)

# masking_operation = ZeroMasking()
# masking_operation = FastZeroMasking()
k = get_masked_data_for_CXPlain(model, x[:200])
explainer = CXPlain(model, model_builder, None,
                    loss)  #,downsample_factors=(5,))
explainer.fit(x[:200], y[:200], masked_data=k)
attributions, _ = explainer.explain_groups(x[200:])  # explain_groups returns (attributions, confidence), as in the examples above

attr = pd.DataFrame(attributions, index=range(200, 300))
# plt.plot(range(50), np.abs(attr).mean(axis=0), label='attr')
# #plt.plot(range(50), np.abs(mult).mean(axis=0), label='mult')
# plt.show()

print(attr.shape)
print(attr.loc[200])

df = pd.DataFrame(x[200:], index=range(200, 300))
idx = df.loc[np.abs(df[20]) < 0.1].loc[np.abs(df[40]) > 1].index[0]

# plt.plot(range(50), np.abs(attr.loc[idx]), label='attr')
Example No. 17
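Unit test: a save/load round trip must reproduce identical predictions, and saving into an existing directory without overwrite=True raises ValueError.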
    def test_boston_housing_load_save_valid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)

        model_builder = MLPModelBuilder(num_layers=2,
                                        num_units=32,
                                        activation="relu",
                                        p_dropout=0.2,
                                        verbose=0,
                                        batch_size=32,
                                        learning_rate=0.001,
                                        num_epochs=2,
                                        early_stopping_patience=128)
        masking_operation = ZeroMasking()
        loss = mean_squared_error

        num_models_settings = [1, 2]
        for num_models in num_models_settings:
            explainer = CXPlain(explained_model,
                                model_builder,
                                masking_operation,
                                loss,
                                num_models=num_models)

            explainer.fit(x_train, y_train)
            median_1 = explainer.predict(x_test)

            tmp_dir_name = tempfile.mkdtemp()
            explainer.save(tmp_dir_name)

            with self.assertRaises(ValueError):
                explainer.save(tmp_dir_name, overwrite=False)

            explainer.save(tmp_dir_name, overwrite=True)
            explainer.load(tmp_dir_name)
            median_2 = explainer.predict(x_test)

            self.assertTrue(np.array_equal(median_1, median_2))

            shutil.rmtree(tmp_dir_name)  # Cleanup.
Example No. 18
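Unit test: saving raises ValueError whenever any CXPlain artifact file already exists in the target directory, checked both before and after fitting.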
    def test_overwrite_ensemble_model_invalid(self):
        (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()

        model_builder = MLPModelBuilder()
        explained_model = RandomForestRegressor(n_estimators=64,
                                                max_depth=5,
                                                random_state=1)
        explained_model.fit(x_train, y_train)
        masking_operation = ZeroMasking()
        loss = binary_crossentropy
        num_models = 5
        explainer = CXPlain(explained_model,
                            model_builder,
                            masking_operation,
                            loss,
                            num_models=num_models)

        file_names = [
            CXPlain.get_config_file_name(),
            CXPlain.get_explained_model_file_name(".pkl"),
            CXPlain.get_loss_pkl_file_name(),
            CXPlain.get_model_builder_pkl_file_name(),
            CXPlain.get_masking_operation_pkl_file_name()
        ]

        # Test with untrained explanation model.
        for file_name in file_names:
            tmp_dir = TestExplanationModel.make_at_tmp(file_name)
            with self.assertRaises(ValueError):
                explainer.save(tmp_dir, overwrite=False)

        # Test with trained explanation model.
        explainer.fit(x_train, y_train)

        file_names = [
            CXPlain.get_config_file_name(),
            CXPlain.get_explained_model_file_name(".pkl"),
            CXPlain.get_loss_pkl_file_name(),
            CXPlain.get_model_builder_pkl_file_name(),
            CXPlain.get_masking_operation_pkl_file_name()
        ] + [
            CXPlain.get_prediction_model_h5_file_name(i)
            for i in range(num_models)
        ]

        for file_name in file_names:
            tmp_dir = TestExplanationModel.make_at_tmp(file_name)
            with self.assertRaises(ValueError):
                explainer.save(tmp_dir, overwrite=False)
Example No. 19
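Fragment of an analysis script, truncated at both ends: averages attributions over ten seeded explainers per drug and collects them for later plotting and export.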
#     _, _, _, test_tcga_expr = dataset.filter_and_normalize_data(drug)
#     exp = CXPlain.load('gene_finding/results/%s/%s/explainer'%(folder, drug), custom_model_loader=None, relpath=True)
#     attr,_ = exp.explain(test_tcga_expr.values)
#     attr = pd.DataFrame(attr, index=test_tcga_expr.index, columns=dataset.genes)
#     attr_dict[drug]=attr

attr_dict = {}
conf_dict = {}
for i, drug in enumerate(drugs):
    print(drug)
    _, _, _, test_tcga_expr = dataset.filter_and_normalize_data(drug)
    
    attr_all = np.zeros((len(test_tcga_expr.index), len(dataset.genes)))

    for seed in range(1, 11):
        exp = CXPlain.load('gene_finding/results/%s/%s/seed%d/explainer'%(folder, drug, seed), custom_model_loader=None, relpath=True)
        attr = exp.explain(test_tcga_expr.values)
        attr_all += attr

    
    attr = pd.DataFrame(attr_all / 10.0, index=test_tcga_expr.index, columns=dataset.genes)
    attr_dict[drug] = attr

fig, axes = plt.subplots(7, 2, figsize=(14, 35))

writer_a = pd.ExcelWriter('gene_finding/results/%s/top_genes_mean_of_means_aggregation.xlsx'%folder, engine='xlsxwriter')

conv = pd.DataFrame(index=dataset.genes, columns=['hgnc'])
conv['hgnc'] = dataset.hgnc

for i, drug in enumerate(drugs):
Example No. 20
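End-to-end script: evaluates a DenseNet121 classifier on TFRecord test data (per-class AUROC), then fits a three-model CXPlain U-Net ensemble on part of that data and saves attributions with 80% confidence intervals as .npy files.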
def cxpl(model_dir, data_dir, results_subdir, random_seed, resolution):
    np.random.seed(random_seed)
    tf.set_random_seed(np.random.randint(1 << 31))
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    set_session(sess)

    # parser config
    config_file = model_dir+ "/config.ini"
    print("Config File Path:", config_file,flush=True)
    assert os.path.isfile(config_file)
    cp = ConfigParser()
    cp.read(config_file)

    output_dir = os.path.join(results_subdir, "classification_results/test")
    print("Output Directory:", output_dir,flush=True)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)


    # default config
    image_dimension = cp["TRAIN"].getint("image_dimension")
    gan_resolution = resolution
    batch_size = cp["TEST"].getint("batch_size")
    use_best_weights = cp["TEST"].getboolean("use_best_weights")

    if use_best_weights:
        print("** Using BEST weights",flush=True)
        model_weights_path = os.path.join(results_subdir, "classification_results/train/best_weights.h5")
    else:
        print("** Using LAST weights",flush=True)
        model_weights_path = os.path.join(results_subdir, "classification_results/train/weights.h5")

    print("** DenseNet Input Resolution:", image_dimension, flush=True)
    print("** GAN Image Resolution:", gan_resolution, flush=True)

    # get test sample count
    test_dir = os.path.join(results_subdir, "inference/test")
    shutil.copy(test_dir+"/test.csv", output_dir)

    # Get class names 
    class_names = get_class_names(output_dir,"test")

    tfrecord_dir_te = os.path.join(data_dir, "test")
    test_counts, _ = get_sample_counts(output_dir, "test", class_names)
    
    # get indices (all of csv file for validation)
    print("** test counts:", test_counts, flush=True)

    # compute steps
    test_steps = int(np.floor(test_counts / batch_size))
    print("** test_steps:", test_steps, flush=True)

    log2_record = int(np.log2(gan_resolution))
    record_file_ending = "*" + str(log2_record) + ".tfrecords"  # built-in str; np.str was removed in newer NumPy
    print("** resolution ", gan_resolution, " corresponds to ", record_file_ending, " TFRecord file.", flush=True)

    # Get Model
    # ------------------------------------
    input_shape=(image_dimension, image_dimension, 3)
    img_input = Input(shape=input_shape)

    base_model = DenseNet121(
        include_top = False, 
        weights = None,
        input_tensor = img_input,
        input_shape = input_shape,
        pooling = "avg")

    x = base_model.output
    predictions = Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = Model(inputs=img_input, outputs = predictions)

    print(" ** load model from:", model_weights_path, flush=True)
    model.load_weights(model_weights_path)
    # ------------------------------------

    print("** load test generator **", flush=True)
    test_seq = TFWrapper(
            tfrecord_dir=tfrecord_dir_te,
            record_file_endings = record_file_ending,
            batch_size = batch_size,
            model_target_size = (image_dimension, image_dimension),
            steps = None,
            augment=False,
            shuffle=False,
            prefetch=True,
            repeat=False)

    print("** make prediction **", flush=True)
    test_seq.initialise() 
    x_all, y_all = test_seq.get_all_test_data()
    print("X-Test  Shape:", x_all.shape,flush=True)
    print("Y-Test  Shape:", y_all.shape,flush=True)

    print("----------------------------------------", flush=True)
    print("Test Model AUROC", flush=True)
    y_pred = model.predict(x_all)
    current_auroc = []
    for i in range(len(class_names)):
        try:
            score = roc_auc_score(y_all[:, i], y_pred[:, i])
        except ValueError:
            score = 0
        current_auroc.append(score)
        print(i+1,class_names[i],": ", score, flush=True)
    mean_auroc = np.mean(current_auroc)
    print("Mean auroc: ", mean_auroc,flush=True)

    print("----------------------------------------", flush=True)
    downscale_factor  = 8
    num_models_to_use = 3
    num_test_images   = 100
    print("Number of Models to use:", num_models_to_use, flush=True)
    print("Number of Test images:", num_test_images, flush=True)
    x_tr, y_tr = x_all[num_test_images:], y_all[num_test_images:]
    x_te, y_te = x_all[0:num_test_images], y_all[0:num_test_images]

    downsample_factors = (downscale_factor,downscale_factor)
    print("Downsample Factors:", downsample_factors,flush=True)
    model_builder = UNetModelBuilder(downsample_factors, num_layers=2, num_units=8, activation="relu",
                                     p_dropout=0.0, verbose=0, batch_size=32, learning_rate=0.001)
    print("Model build done.",flush=True)
    masking_operation = ZeroMasking()
    loss = categorical_crossentropy

    explainer = CXPlain(model, model_builder, masking_operation, loss, 
                    num_models=num_models_to_use, downsample_factors=downsample_factors, flatten_for_explained_model=False)
    print("Explainer build done.",flush=True)

    explainer.fit(x_tr, y_tr)
    print("Explainer fit done.",flush=True)

    try:
        attr, conf = explainer.explain(x_te, confidence_level=0.80)
        np.save(output_dir+"/x_cxpl.npy", x_te)
        np.save(output_dir+"/y_cxpl.npy", y_te)
        np.save(output_dir+"/attr.npy", attr)
        np.save(output_dir+"/conf.npy", conf)
        print("Explainer explain done and saved.",flush=True)
    except Exception as ef:
        print(ef, flush=True)