def continue_run_rand_nn(matrixA, matrixB, num_gene, num_pathway, layer1,
                         layer2, layer3, path, dir_opt, RNA_seq_filename,
                         input_num, epoch, batch_size, verbose):
    """Rebuild a RandNN model from pickled weights and continue training it.

    Args:
        matrixA, matrixB, num_gene: Forwarded unchanged to
            ``RandNN().keras_rand_nn``.
        num_pathway: Forwarded as the fourth argument of ``keras_rand_nn``
            (the slot other callers in this file name ``layer0``).
        layer1, layer2, layer3: Forwarded to ``keras_rand_nn``.
        path: Directory containing ``layer_bias_list.txt`` and
            ``layer_weight_list.txt`` (pickle files despite the extension).
        dir_opt, RNA_seq_filename: Forwarded to ``LoadData`` and
            ``RunRandNN``.
        input_num, epoch, batch_size, verbose: Forwarded to
            ``RunRandNN.train``.

    Returns:
        ``(model, history, path)`` where all three come from
        ``RunRandNN.train``; note that the returned ``path`` is the one
        produced by ``train``, not the ``path`` argument.
    """
    # RECONSTRUCT THE MODEL TO BE TRAINED
    # The original passed an undefined name ``layer0`` here and never used
    # ``num_pathway``; the argument is now passed explicitly.
    model = RandNN().keras_rand_nn(matrixA, matrixB, num_gene, num_pathway,
                                   layer1, layer2, layer3)
    # NOTE: pickle.load can execute arbitrary code; only load weight files
    # that this project produced.
    with open(path + '/layer_bias_list.txt', 'rb') as filebias:
        layer_bias_list = pickle.load(filebias)
    with open(path + '/layer_weight_list.txt', 'rb') as fileweight:
        layer_weight_list = pickle.load(fileweight)
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mse', 'accuracy'])
    # One throwaway fit call precedes the weight restore.
    # NOTE(review): presumably this forces Keras to build the model before
    # set_weights is called -- confirm before changing.
    xTmp, yTmp = LoadData(dir_opt, RNA_seq_filename).load_train(0, 1)
    model.fit(xTmp, yTmp, epochs=1, validation_split=1, verbose=0)
    # Restore each layer as [weights, bias]; indexing (rather than zip) keeps
    # the IndexError if a pickled list is shorter than the model.
    for i, layer in enumerate(model.layers):
        layer.set_weights([layer_weight_list[i], layer_bias_list[i]])
    # AUTO UPDATE WEIGHT
    model, history, num_layer, path = RunRandNN(
        model, dir_opt, RNA_seq_filename).train(input_num, epoch, batch_size,
                                                verbose)
    return model, history, path
def manual_test_rand_nn(matrixA, matrixB, num_gene, num_pathway, layer1,
                        layer2, layer3, path, dir_opt, RNA_seq_filename):
    """Rebuild a RandNN model from pickled weights and run the test step.

    Args:
        matrixA, matrixB, num_gene: Forwarded unchanged to
            ``RandNN().keras_rand_nn``.
        num_pathway: Forwarded as the fourth argument of ``keras_rand_nn``
            (the slot other callers in this file name ``layer0``).
        layer1, layer2, layer3: Forwarded to ``keras_rand_nn``.
        path: Directory containing ``layer_bias_list.txt`` and
            ``layer_weight_list.txt`` (pickle files despite the extension);
            also forwarded to ``RunRandNN.test``.
        dir_opt, RNA_seq_filename: Forwarded to ``LoadData`` and
            ``RunRandNN``.

    Returns:
        ``(y_pred, score)`` exactly as returned by ``RunRandNN.test``.
        (Earlier versions computed these values but returned ``None``.)
    """
    # RECONSTRUCT TEST MODEL
    # The original passed an undefined name ``layer0`` here and never used
    # ``num_pathway``; the argument is now passed explicitly.
    model = RandNN().keras_rand_nn(matrixA, matrixB, num_gene, num_pathway,
                                   layer1, layer2, layer3)
    # NOTE: pickle.load can execute arbitrary code; only load weight files
    # that this project produced.
    with open(path + '/layer_bias_list.txt', 'rb') as filebias:
        layer_bias_list = pickle.load(filebias)
    with open(path + '/layer_weight_list.txt', 'rb') as fileweight:
        layer_weight_list = pickle.load(fileweight)
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mse', 'accuracy'])
    # One throwaway fit call precedes the weight restore.
    # NOTE(review): presumably this forces Keras to build the model before
    # set_weights is called -- confirm before changing.
    xTmp, yTmp = LoadData(dir_opt, RNA_seq_filename).load_train(0, 1)
    model.fit(xTmp, yTmp, epochs=1, validation_split=1, verbose=0)
    # Restore each layer as [weights, bias]; indexing (rather than zip) keeps
    # the IndexError if a pickled list is shorter than the model.
    for i, layer in enumerate(model.layers):
        layer.set_weights([layer_weight_list[i], layer_bias_list[i]])
    # PREDICT MODEL USING [xTe, yTe]
    verbose = 1
    y_pred, score = RunRandNN(model, dir_opt,
                              RNA_seq_filename).test(verbose, path)
    return y_pred, score
# Example #3
# 0
 def gene_analysis(self, RNA_seq_filename, pathway_filename, path,
                   post_data_path, epoch_time):
     """Score gene importance with SmoothGrad and plot the top 50 genes.

     The decomposed model is rebuilt, its per-layer weights are restored
     from ``path + '/layer_list.txt'``, and the same analysis is then run
     on the training inputs (``xTr.npy``) and the test inputs (``xTe.npy``)
     found under ``post_data_path``.  For each split the indices of the 50
     largest mean absolute scores are saved as a ``.npy`` file under
     ``path`` and a bar plot is written below ``'.' + dir_opt + '/plot/'``
     (a numeric suffix is appended if the file name is already taken).

     Args:
         RNA_seq_filename: Name handed to ``LoadData`` and ``GenMatrix``;
             also the base name of the CSV under ``filtered_data`` whose
             ``geneSymbol`` column supplies the bar labels.
         pathway_filename: Name handed to ``GenMatrix``.
         path: Directory holding ``layer_list.txt``; the top-gene index
             files are written here.
         post_data_path: Directory holding ``xTr.npy`` and ``xTe.npy``.
         epoch_time: String spliced into the output file names.

     Returns:
         None. Results are written to disk and shapes are printed.
     """
     # ANALYSE PATHWAY GENE AND DISTRIBUTION
     dir_opt = self.dir_opt
     # Only the feature / gene / pathway counts are used from this tuple.
     (_, _, num_feature, _, num_gene,
      num_pathway) = LoadData(dir_opt, RNA_seq_filename).pre_load_train()
     layer1 = 256
     layer2 = 128
     layer3 = 32
     matrixA = GenMatrix(dir_opt, RNA_seq_filename,
                         pathway_filename).feature_gene_matrix(
                             num_feature, num_gene)
     matrixB = GenMatrix(dir_opt, RNA_seq_filename,
                         pathway_filename).gene_pathway_matrix(num_pathway)
     # RECONSTRUCT THE DECOMPOSED MODEL (the pathway sub-model is not used)
     input_model, gene_model, _, model = RandNN().keras_rand_nn(
         matrixA, matrixB, num_gene, num_pathway, layer1, layer2, layer3)
     # MANUAL REBUILD THE MODEL
     # NOTE: pickle.load can execute arbitrary code; only load files that
     # this project produced.
     with open(path + '/layer_list.txt', 'rb') as filelayer:
         layer_list = pickle.load(filelayer)
     model.compile(loss='mean_squared_error',
                   optimizer='adam',
                   metrics=['mse', 'accuracy'])
     # One throwaway fit call precedes the weight restore.
     # NOTE(review): presumably this forces Keras to build the model before
     # set_weights is called -- confirm before changing.
     xTmp, yTmp = LoadData(dir_opt, RNA_seq_filename).load_train(0, 1)
     model.fit(xTmp, yTmp, epochs=1, validation_split=1, verbose=0)
     for i in range(len(model.layers)):
         model.get_layer(index=i).set_weights(layer_list[i])
     # Row position -> gene symbol, read once and shared by both splits.
     cellline_df = pd.read_csv('.' + dir_opt + '/filtered_data/' +
                               RNA_seq_filename + '.csv')
     gene_name_dict = dict(enumerate(cellline_df['geneSymbol']))
     top = 50
     # (file-name tag, input file, plot title) for each analysed split.
     # NOTE(review): the titles hard-code "50" and "1684"; they are kept
     # verbatim and are not derived from the data.
     splits = (
         ('train', '/xTr.npy',
          'Top 50 Genes With Largest Absolute Importance Scores of 1684 Genes On Training Dataset'),
         ('test', '/xTe.npy',
          'Top 50 Genes With Largest Absolute Importance Scores of 1684 Genes On Test Dataset'),
     )
     for tag, data_file, title in splits:
         # GENE IMPORTANCE ANALYSIS ON THIS SPLIT
         x_split = np.load(post_data_path + data_file)
         gene_x = np.array(input_model.predict(x_split))
         gene_analyzer = innvestigate.create_analyzer(
             'smoothgrad',
             gene_model,
             noise_scale=(np.max(gene_x) - np.min(gene_x)) * 0.1)
         analysis = gene_analyzer.analyze(gene_x)
         mean_analysis = np.mean(analysis, axis=0)
         print('Gene Analysis')
         print(mean_analysis.shape)
         mean_analysis = np.absolute(mean_analysis)
         # GET AND SAVE TOP GENE INDEX
         top_gene_index = heapq.nlargest(top, range(len(mean_analysis)),
                                         mean_analysis.take)
         np.save(path + '/' + tag + '_top_gene_index' + epoch_time + '.npy',
                 top_gene_index)
         top_gene_value = mean_analysis[top_gene_index]
         top_gene_name = [gene_name_dict.get(key) for key in top_gene_index]
         top_gene_name_value_dict = dict(zip(top_gene_name, top_gene_value))
         # BARPLOT ON GENES AND SAVE
         plt.figure(figsize=(16, 9))
         plt.bar(range(len(top_gene_name_value_dict)),
                 list(top_gene_name_value_dict.values()))
         plt.xticks(range(len(top_gene_name_value_dict)),
                    list(top_gene_name_value_dict.keys()),
                    rotation=30,
                    fontsize=10,
                    ha='right')
         plt.title(title, fontsize=16)
         plt.tight_layout()
         # Pick a plot file name that does not overwrite an earlier run.
         file_name = 'epoch_' + epoch_time + '_' + tag + '_gene_barplot'
         plot_path = '.' + dir_opt + '/plot/%s' % (file_name) + '.png'
         unit = 1
         while os.path.exists(plot_path):
             plot_path = '.' + dir_opt + '/plot/%s_%d' % (file_name,
                                                          unit) + '.png'
             unit += 1
         plt.savefig(plot_path, dpi=300)
         # Release the figure; the original left both figures open.
         plt.close()
# Example #4
# 0
    def pathway_analysis(self, RNA_seq_filename, pathway_filename, path,
                         post_data_path, epoch_time):
        """Run SmoothGrad on the pathway sub-model and save two PDF reports.

        The model is rebuilt and its per-layer weights are restored from
        ``path + '/layer_list.txt'``.  The test inputs (``xTe.npy`` under
        ``post_data_path``) are pushed through ``model.layers[1]`` and
        ``model.layers[2]``, and ``model.layers[3]`` is analysed on the
        result.  Two PDFs are written below ``'.' + dir_opt + '/plot/'``:
        a heat map of the analysis array and one density plot per pathway.
        A numeric suffix is appended when a file name is already taken.

        Args:
            RNA_seq_filename: Not referenced in this method.
            pathway_filename: Base name of the tab-separated file under
                ``init_data``; its column names after the first are used
                as subplot titles.
            path: Directory holding ``layer_list.txt``.
            post_data_path: Directory holding ``xTe.npy``.
            epoch_time: String spliced into the output file names.

        Returns:
            None. Results are written to disk.
        """

        def _unused_pdf_path(file_name):
            # Return a '.pdf' path in the plot directory that does not exist.
            candidate = '.' + dir_opt + '/plot/%s' % (file_name) + '.pdf'
            unit = 1
            while os.path.exists(candidate):
                candidate = '.' + dir_opt + '/plot/%s_%d' % (file_name,
                                                             unit) + '.pdf'
                unit += 1
            return candidate

        # ANALYSE PATHWAY GENE AND DISTRIBUTION
        dir_opt = self.dir_opt
        # Only the feature / gene / pathway counts are used from this tuple.
        (_, _, num_feature, _, _, num_gene,
         num_pathway) = LoadData(dir_opt).pre_load_train()
        layer1 = 256
        layer2 = 128
        layer3 = 32
        matrixA = GenMatrix.feature_gene_matrix(num_feature, num_gene)
        matrixB = GenMatrix.gene_pathway_matrix()
        # RECONSTRUCT THE DECOMPOSED MODEL
        input_model, gene_model, pathway_model, model = RandNN().keras_rand_nn(
            matrixA, matrixB, num_gene, num_pathway, layer1, layer2, layer3)
        # MANUAL REBUILD THE MODEL
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this project produced.
        with open(path + '/layer_list.txt', 'rb') as filelayer:
            layer_list = pickle.load(filelayer)
        model.compile(loss='mean_squared_error',
                      optimizer='adam',
                      metrics=['mse', 'accuracy'])
        # One throwaway fit call precedes the weight restore.
        # NOTE(review): presumably this forces Keras to build the model
        # before set_weights is called -- confirm before changing.
        xTmp, yTmp = LoadData(dir_opt).load_train(0, 1)
        model.fit(xTmp, yTmp, epochs=1, validation_split=1, verbose=0)
        for i in range(len(model.layers)):
            model.get_layer(index=i).set_weights(layer_list[i])
        # RE-LOAD EACH SUB-MODEL FROM THE RESTORED MODEL
        input_model = model.layers[1]
        gene_model = model.layers[2]
        pathway_model = model.layers[3]
        # ON TEST DATA ONLY
        xTe = np.load(post_data_path + '/xTe.npy')
        gene_x = np.array(input_model.predict(xTe))
        # PATHWAY IMPORTANCE ANALYSIS ON TEST INDEX
        pathway_x = gene_model.predict(gene_x)
        gene_pathway_df = pd.read_table('.' + dir_opt + '/init_data/' +
                                        pathway_filename + '.txt')
        pathway_name_list = list(gene_pathway_df.columns[1:])
        pathway_analyzer = innvestigate.create_analyzer(
            "smoothgrad",
            pathway_model,
            noise_scale=(np.max(pathway_x) - np.min(pathway_x)) * 0.1)
        analysis = pathway_analyzer.analyze(pathway_x)

        print('PATHWAY IMPORTANCE ANALYSIS...')
        analysis_path = _unused_pdf_path('epoch_' + epoch_time +
                                         '_pathway_analysis')
        # The context manager closes the PDF even if plotting raises.
        with PdfPages(analysis_path) as pdf:
            plt.figure(figsize=(8, 6))
            heatmap = plt.imshow(analysis.squeeze(),
                                 cmap='seismic',
                                 interpolation='nearest',
                                 aspect="auto")
            colorbar = plt.colorbar(heatmap)
            colorbar.ax.tick_params(labelsize=8)
            plt.ylabel('Sample Index')
            plt.xlabel('Pathway')
            plt.xticks(rotation=45)
            plt.tick_params(labelsize=8)
            pdf.savefig()
            plt.close()

        # PATHWAY DISTRIBUTION ANALYSIS ON TEST INDEX
        print('PATHWAY DISTRIBUTION ANALYSIS...')
        distribution_path = _unused_pdf_path('epoch_' + epoch_time +
                                             '_pathway_distribution')
        # Keep the original 11 x 5 grid for up to 55 pathways; add rows only
        # when more are needed, since add_subplot rejects an index past the
        # grid size.
        num_col = 5
        num_row = max(11, -(-num_pathway // num_col))
        with PdfPages(distribution_path) as pdf:
            fig = plt.figure(figsize=(15, 20))
            for i in range(num_pathway):
                fig.add_subplot(num_row, num_col, i + 1)
                plt.xlim([-0.1, 0.1])
                sns.kdeplot(analysis[:, i], shade=True)
                plt.title(pathway_name_list[i],
                          fontsize='small',
                          fontweight='bold')
            # Lay out once after all axes exist instead of on every
            # iteration.
            plt.tight_layout()
            plt.subplots_adjust(wspace=0.5, hspace=0.5)
            pdf.savefig()
            plt.close()
# Example #5
# 0
def build_rand_nn(matrixA, matrixB, num_gene, num_pathway, layer1, layer2,
                  layer3):
    """Build the RandNN network and return its four model objects.

    All arguments are forwarded positionally, in order, to
    ``RandNN().keras_rand_nn``.

    Returns:
        ``(input_model, gene_model, pathway_model, model)`` as produced by
        ``keras_rand_nn``.

    NOTE(review): a second ``build_rand_nn`` with a different return shape
    appears directly after this one in the file; if both live in the same
    module, the later definition replaces this one.
    """
    layer_sizes = (num_pathway, layer1, layer2, layer3)
    built = RandNN().keras_rand_nn(matrixA, matrixB, num_gene, *layer_sizes)
    # Unpack explicitly so a result that is not a 4-tuple fails here.
    input_model, gene_model, pathway_model, model = built
    return input_model, gene_model, pathway_model, model
def build_rand_nn(matrixA, matrixB, num_gene, layer0, layer1, layer2, layer3):
    """Build the RandNN network and return whatever ``keras_rand_nn`` yields.

    All arguments are forwarded positionally, in order, to
    ``RandNN().keras_rand_nn``; its result is returned unchanged.

    NOTE(review): another ``build_rand_nn`` that returns four models is
    defined directly before this one in the file; if both live in the same
    module, this definition is the one that takes effect.
    """
    network = RandNN()
    return network.keras_rand_nn(matrixA, matrixB, num_gene, layer0, layer1,
                                 layer2, layer3)