import pickle

from keras.callbacks import History

# Local helper modules from this repository:
import config_file
import deep_learning_models
import load_kmer_cnts_jf
import stats_utils


def run_model(data_set, kmer_size, norm_input, encoding_dim_1, encoding_dim_2,
              encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
              batch_size, n_splits, n_repeats, compute_informative_features,
              plot_iteration, graph_dir, outFile):

    # Format strings for outputting the parameters associated with this run:
    summary_string, plotting_string = stats_utils.format_input_parameters_printing_2layers(
        data_set, kmer_size, norm_input, encoding_dim_1, encoding_dim_2,
        encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
        batch_size, n_splits, n_repeats, compute_informative_features,
        plot_iteration)

    outFile_header = ('data_set\tkmer_size\tnorm_input\tencoding_dim_1\t'
                      'encoding_dim_2\tencoded_activation\tinput_dropout_pct\t'
                      'dropout_pct\tnum_epochs\tbatch_size\tn_splits\tn_repeats\t')

    #################
    # Load the data #
    #################
    print('Loading data...')

    data_normalized, labels, rskf = load_kmer_cnts_jf.load_single_disease(
        data_set, kmer_size, n_splits, n_repeats, precomputed_kfolds=False)
    # rskf = repeated stratified k-fold. This contains all the
    # kfold-by-iteration combos.

    ##################################################
    # Iterate through the data kfolds and iterations #
    ##################################################

    # Dictionary to store the metrics of each fold.
    aggregated_statistics = {}  # key = n_repeat, value = dictionary with stats

    for n_repeat in range(len(rskf[0])):
        print('Iteration %s...' % n_repeat)
        aggregated_statistics[n_repeat] = {}

        train_idx = rskf[0][n_repeat]
        test_idx = rskf[1][n_repeat]

        x_train, y_train = data_normalized[train_idx], labels[train_idx]
        x_test, y_test = data_normalized[test_idx], labels[test_idx]

        # Standardize the data: mean = 0, std = 1.
        if norm_input:
            x_train, x_test = stats_utils.standardize_data(x_train, x_test)

        ########################################
        # Set up a model (supervised learning) #
        ########################################
        # Note that the model has to be instantiated each time a new fold is
        # started; otherwise the weights will not start from scratch.
        input_dim = len(data_normalized[0])  # the number of input kmers

        model = deep_learning_models.create_supervised_model_2layers(
            input_dim, encoding_dim_1, encoding_dim_2, encoded_activation,
            input_dropout_pct, dropout_pct)

        #weightFile = os.environ['HOME'] + '/deep_learning_microbiome/data/weights.txt'

        ##################################################
        # Fit the model with the train data of this fold #
        ##################################################
        history = History()
        # history.history is a dictionary; to see its keys, use
        # print(history.history.keys()).

        # NB: batch_size=len(x_train) performs full-batch training (one
        # gradient update per epoch), so the batch_size argument passed to
        # run_model is not used here.
        model.fit(x_train, y_train,
                  epochs=num_epochs,
                  batch_size=len(x_train),
                  shuffle=True,
                  validation_data=(x_test, y_test),
                  verbose=0,
                  callbacks=[history])

        # Predict using the held-out data.
        y_pred = model.predict(x_test)

        # Save the weights of this model. TODO

        ##############################
        # Compute summary statistics #
        ##############################
        # Store the results of this fold in aggregated_statistics. We could
        # plot everything (ROC, accuracy vs epoch, loss vs epoch, confusion
        # matrix, precision-recall) for each fold, but this would produce a
        # lot of graphs.
        aggregated_statistics = stats_utils.compute_summary_statistics(
            y_test, y_pred, history, aggregated_statistics, n_repeat)
        if compute_informative_features:
            shap_values, shap_values_summed = stats_utils.compute_shap_values_deeplearning(
                input_dim, model, x_test)
            aggregated_statistics[n_repeat]['shap_values_summed'] = shap_values_summed
            aggregated_statistics[n_repeat]['shap_values'] = shap_values
            # Could also plot:
            #shap.summary_plot(shap_values, X, plot_type="bar")
            #shap.summary_plot(shap_values, X)

    ##############################################
    # Aggregate the results from all the k-folds #
    # Print and Plot                             #
    ##############################################
    print('Aggregating statistics across iterations and printing/plotting...')
    stats_utils.aggregate_statistics_across_folds(
        aggregated_statistics, rskf, n_splits, outFile, summary_string,
        plotting_string, outFile_header)

    ###################
    # Aggregate shap: #
    ###################
    if compute_informative_features:
        print('Computing informative features with Shap...')
        stats_utils.aggregate_shap(aggregated_statistics, rskf)
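
# Example invocation of run_model (a minimal sketch; the parameter values
# below are illustrative assumptions, not the settings used in the original
# experiments):
#
# run_model(data_set='Qin_et_al', kmer_size=5, norm_input=True,
#           encoding_dim_1=8, encoding_dim_2=8, encoded_activation='relu',
#           input_dropout_pct=0.25, dropout_pct=0.25, num_epochs=400,
#           batch_size=32, n_splits=10, n_repeats=10,
#           compute_informative_features=True, plot_iteration=0,
#           graph_dir='analysis/graphs/', outFile='analysis/results.txt')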
tmp_intermediate_directory = config_file.tmp_intermediate_directory

for kmer_size in [5, 6, 7, 8, 10]:
    print(kmer_size)

    #################
    # Load the data #
    #################
    print('Loading data...')
    data_set = 'Qin_et_al'
    data_normalized, kmer_cnts, labels, rskf = load_kmer_cnts_jf.load_single_disease(
        data_set, kmer_size, n_splits, n_repeats, precomputed_kfolds=False,
        bootstrap=True)

    num_replicates = 100
    num_kmers = 100000
    bootstrapped_data = stats_utils.bootstrap_data(
        data_normalized, kmer_cnts, num_replicates, num_kmers)

    pickle.dump(
        bootstrapped_data,
        open("%skmer_size_%s_Qin_bootstrap.p"
             % (tmp_intermediate_directory, kmer_size), "wb"))
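
# For reference, a minimal sketch of what a multinomial bootstrap over k-mer
# counts could look like. This is an assumption about what
# stats_utils.bootstrap_data does (its implementation is not shown here), not
# its actual code: each replicate redraws num_kmers k-mers per sample with
# probabilities proportional to the observed counts, then renormalizes to
# relative abundances.

import numpy as np


def bootstrap_data_sketch(kmer_cnts, num_replicates, num_kmers):
    """Hypothetical helper: multinomial resampling of raw k-mer counts."""
    replicates = []
    for _ in range(num_replicates):
        resampled = np.zeros(np.shape(kmer_cnts))
        for i, counts in enumerate(kmer_cnts):
            # Resampling probabilities proportional to the observed counts.
            p = np.asarray(counts, dtype=float) / np.sum(counts)
            resampled[i] = np.random.multinomial(num_kmers, p)
        # Renormalize so each sample sums to 1 (relative abundances).
        replicates.append(resampled / float(num_kmers))
    return replicates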