def run_model(data_set, kmer_size, norm_input, encoding_dim,
              encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
              batch_size, n_splits, n_repeats, compute_informative_features,
              plot_iteration, graph_dir, outFile):
    """Train and evaluate a supervised model over every fold of one data set.

    The function loads k-mer data for ``data_set``, then for each
    train/test split builds a fresh model, fits it, predicts on the
    held-out samples and records summary statistics.  After all splits are
    processed, the per-split statistics are aggregated and written out via
    ``stats_utils.aggregate_statistics_across_folds``.

    Args:
        data_set: Identifier forwarded to
            ``load_kmer_cnts_jf.load_single_disease``.
        kmer_size: K-mer size forwarded to the loader.
        norm_input: When truthy, train/test data are passed through
            ``stats_utils.standardize_data`` before fitting.
        encoding_dim: Forwarded to
            ``deep_learning_models.create_supervised_model``.
        encoded_activation: Forwarded to the model factory.
        input_dropout_pct: Forwarded to the model factory.
        dropout_pct: Forwarded to the model factory.
        num_epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        n_splits: Forwarded to the loader and to the aggregation step.
        n_repeats: Forwarded to the loader.
        compute_informative_features: When truthy, SHAP values are computed
            for every split and aggregated at the end.
        plot_iteration: Only used when formatting the run's parameter
            strings.
        graph_dir: Not used inside this function.
        outFile: Forwarded to
            ``stats_utils.aggregate_statistics_across_folds``.

    Returns:
        None.
    """
    # Strings describing the parameters of this run, used when reporting.
    run_summary, plot_label = stats_utils.format_input_parameters_printing(
        data_set, kmer_size, norm_input, encoding_dim, encoded_activation,
        input_dropout_pct, dropout_pct, num_epochs, batch_size, n_splits,
        n_repeats, compute_informative_features, plot_iteration)
    header_line = 'data_set\tkmer_size\tnorm_input\tencoding_dim\tencoded_activation\tinput_dropout_pct\tdropout_pct\tnum_epochs\tbatch_size\tn_splits\tn_repeats\t'

    # ---------------------------------------------------------------
    # Load the data
    # ---------------------------------------------------------------
    print('Loading data...')
    # rskf = repeated stratified k-fold.  As used below, rskf[0] holds the
    # train indices and rskf[1] the test indices, one entry per
    # fold-by-iteration combination.
    data_normalized, labels, rskf = load_kmer_cnts_jf.load_single_disease(
        data_set, kmer_size, n_splits, n_repeats, precomputed_kfolds=False)

    # ---------------------------------------------------------------
    # Walk through every fold/iteration combination
    # ---------------------------------------------------------------
    # key = split number, value = dictionary of statistics for that split
    fold_stats = {}
    for n_repeat, train_idx in enumerate(rskf[0]):
        print('Iteration %s...' % n_repeat)
        fold_stats[n_repeat] = {}

        test_idx = rskf[1][n_repeat]
        x_train = data_normalized[train_idx]
        y_train = labels[train_idx]
        x_test = data_normalized[test_idx]
        y_test = labels[test_idx]

        # Standardize the data (mean=0, std=1) when requested.
        if norm_input:
            x_train, x_test = stats_utils.standardize_data(x_train, x_test)

        # A new model must be instantiated for each split; otherwise the
        # weights would not start from scratch.
        input_dim = len(data_normalized[0])  # number of input k-mers
        net = deep_learning_models.create_supervised_model(
            input_dim, encoding_dim, encoded_activation, input_dropout_pct,
            dropout_pct)

        # Fit on this split's training data.  The History callback exposes
        # its recorded values through ``history.history``.
        fit_history = History()
        net.fit(
            x_train,
            y_train,
            epochs=num_epochs,
            batch_size=batch_size,
            shuffle=True,
            validation_data=(x_test, y_test),
            verbose=0,
            callbacks=[fit_history],
        )

        # Predict on the held-out data.
        predictions = net.predict(x_test)
        # TODO: save the weights of this model.

        # Record this split's results.  Plotting every split individually
        # (ROC, accuracy/loss vs. epoch, confusion matrix, ...) is possible
        # but would produce a large number of graphs.
        fold_stats = stats_utils.compute_summary_statistics(
            y_test, predictions, fit_history, fold_stats, n_repeat)

        if compute_informative_features:
            shap_values, shap_values_summed = (
                stats_utils.compute_shap_values_deeplearning(
                    input_dim, net, x_test))
            split_entry = fold_stats[n_repeat]
            split_entry['shap_values_summed'] = shap_values_summed
            split_entry['shap_values'] = shap_values

    # ---------------------------------------------------------------
    # Aggregate the results from all splits; print and plot
    # ---------------------------------------------------------------
    print('Aggregating statistics across iterations and printing/plotting...')
    stats_utils.aggregate_statistics_across_folds(
        fold_stats, rskf, n_splits, outFile, run_summary, plot_label,
        header_line)

    # ---------------------------------------------------------------
    # Aggregate SHAP values
    # ---------------------------------------------------------------
    if compute_informative_features:
        print('Computing informative features with Shap...')
        stats_utils.aggregate_shap(fold_stats, rskf)
#encoding_dims = [] encoding_dims = [8, 300, 4000] for encoding_dim in encoding_dims: input_dim = len(data_normalized[0]) # this is the number of input kmers encoded_activation = 'relu' #encoded_activation = 'sigmoid' #encoded_activation = 'linear' #decoded_activation = 'softmax' decoded_activation = 'sigmoid' loss = 'binary_crossentropy' model = deep_learning_models.create_supervised_model( input_dim, encoding_dim, encoded_activation, decoded_activation) ################# # Fit the model # ################# numEpochs = 1000 batchSize = 32 history = History() model.fit(data_normalized, labels, epochs=numEpochs, validation_split=0.2, batch_size=batchSize,
#standardize the data, mean=0, std=1 norm_input = True if norm_input: x_train, x_test = stats_utils.standardize_data_bootstrap( data_normalized[train_idx], x_test, x_train) # ########################################### # set up a model (supervised learning) # ########################################### # note that the model has to be instantiated each time a new fold is started otherwise the weights will not start from scratch. # input_dim = len( data_normalized[0]) # this is the number of input kmers # model = deep_learning_models.create_supervised_model( input_dim, encoding_dim, encoded_activation, input_dropout_pct, dropout_pct) # ################################################## # Fit the model with the train data of this fold # ################################################## history = History() # history is a dictionary. To get the keys, type print(history.history.keys()) # model.fit(x_train, y_train, epochs=num_epochs, batch_size=len(x_train), shuffle=True, validation_data=(x_test, y_test), verbose=1,