def run_model(data_set, kmer_size, norm_input, encoding_dim,
              encoded_activation, input_dropout_pct, dropout_pct, num_epochs,
              batch_size, n_splits, n_repeats, compute_informative_features,
              plot_iteration, graph_dir, outFile):

    # format strings for outputting the parameters associated with this run:
    summary_string, plotting_string = stats_utils.format_input_parameters_printing(
        data_set, kmer_size, norm_input, encoding_dim, encoded_activation,
        input_dropout_pct, dropout_pct, num_epochs, batch_size, n_splits,
        n_repeats, compute_informative_features, plot_iteration)

    outFile_header = 'data_set\tkmer_size\tnorm_input\tencoding_dim\tencoded_activation\tinput_dropout_pct\tdropout_pct\tnum_epochs\tbatch_size\tn_splits\tn_repeats\t'

    #################
    # Load the data #
    #################
    print('Loading data...')

    data_normalized, labels, rskf = load_kmer_cnts_jf.load_single_disease(
        data_set, kmer_size, n_splits, n_repeats, precomputed_kfolds=False)

    # rskf = repeated stratified k-fold; it contains the train/test indices
    # for every fold-by-repeat combination.
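    # for reference, a minimal sketch of how such index lists could be built,
    # assuming sklearn's RepeatedStratifiedKFold (load_kmer_cnts_jf may differ):
    #
    #   from sklearn.model_selection import RepeatedStratifiedKFold
    #   splitter = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
    #   train_idxs, test_idxs = [], []
    #   for train_idx, test_idx in splitter.split(data_normalized, labels):
    #       train_idxs.append(train_idx)
    #       test_idxs.append(test_idx)
    #   rskf = (train_idxs, test_idxs)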

    ###################################################
    # iterate through the data kfolds and iterations #
    ###################################################

    # Create a dictionary to store the metrics of each fold
    aggregated_statistics = {}  # key=n_repeat, values= dictionary with stats

    for n_repeat in range(len(rskf[0])):

        print('Iteration %s...' % n_repeat)

        aggregated_statistics[n_repeat] = {}

        train_idx = rskf[0][n_repeat]
        test_idx = rskf[1][n_repeat]
        x_train, y_train = data_normalized[train_idx], labels[train_idx]
        x_test, y_test = data_normalized[test_idx], labels[test_idx]

        # standardize the data: mean=0, std=1
        if norm_input:
            x_train, x_test = stats_utils.standardize_data(x_train, x_test)
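        # standardize_data presumably fits the scaling on the training fold only
        # and applies it to the test fold, e.g. this sketch (assuming sklearn):
        #
        #   from sklearn.preprocessing import StandardScaler
        #   scaler = StandardScaler().fit(x_train)
        #   x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)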

        ###########################################
        # set up a model (supervised learning)    #
        ###########################################
        # note that the model has to be instantiated each time a new fold is
        # started; otherwise the weights will not start from scratch.

        input_dim = len(
            data_normalized[0])  # this is the number of input kmers

        model = deep_learning_models.create_supervised_model(
            input_dim, encoding_dim, encoded_activation, input_dropout_pct,
            dropout_pct)
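        # the real definition lives in deep_learning_models; a plausible sketch
        # (the layer layout and compile settings here are assumptions):
        #
        #   from keras.models import Sequential
        #   from keras.layers import Dense, Dropout
        #
        #   def create_supervised_model(input_dim, encoding_dim,
        #                               encoded_activation, input_dropout_pct,
        #                               dropout_pct):
        #       model = Sequential([
        #           Dropout(input_dropout_pct, input_shape=(input_dim,)),
        #           Dense(encoding_dim, activation=encoded_activation),
        #           Dropout(dropout_pct),
        #           Dense(1, activation='sigmoid')])
        #       model.compile(optimizer='adam', loss='binary_crossentropy',
        #                     metrics=['accuracy'])
        #       return model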

        #weightFile = os.environ['HOME'] + '/deep_learning_microbiome/data/weights.txt'

        ##################################################
        # Fit the model with the train data of this fold #
        ##################################################
        history = History()
        # history.history is a dictionary; to see its keys, print(history.history.keys())

        model.fit(x_train,
                  y_train,
                  epochs=num_epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  validation_data=(x_test, y_test),
                  verbose=0,
                  callbacks=[history])

        # predict using the held out data
        y_pred = model.predict(x_test)
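        # y_pred holds per-sample probabilities in [0, 1]; a hard class call
        # (the 0.5 threshold is an assumption) would be:
        #
        #   y_pred_class = (y_pred > 0.5).astype(int)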

        # save the weights of this model. TODO

        ################################################################
        # Compute summary statistics                                   #
        ################################################################
        # Store the results of this fold in aggregated_statistics
        aggregated_statistics = stats_utils.compute_summary_statistics(
            y_test, y_pred, history, aggregated_statistics, n_repeat)

        # we could plot everything (ROC, accuracy vs. epoch, loss vs. epoch,
        # confusion matrix, precision-recall) for each fold, but this would
        # produce a lot of graphs.
        if compute_informative_features:
            shap_values, shap_values_summed = stats_utils.compute_shap_values_deeplearning(
                input_dim, model, x_test)
            aggregated_statistics[n_repeat][
                'shap_values_summed'] = shap_values_summed
            aggregated_statistics[n_repeat]['shap_values'] = shap_values

        # also plot:
        #shap.summary_plot(shap_values, X, plot_type="bar")
        #shap.summary_plot(shap_values, X)
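        # a sketch of what compute_shap_values_deeplearning might do, assuming
        # the shap package's DeepExplainer (the background data choice is an
        # assumption):
        #
        #   import numpy as np
        #   import shap
        #   explainer = shap.DeepExplainer(model, x_train)
        #   shap_values = explainer.shap_values(x_test)[0]  # single output node
        #   shap_values_summed = np.abs(shap_values).sum(axis=0)  # per-kmer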

    ##############################################
    # aggregate the results from all the k-folds #
    # Print and Plot                             #
    ##############################################
    print('Aggregating statistics across iterations and printing/plotting...')

    stats_utils.aggregate_statistics_across_folds(aggregated_statistics, rskf,
                                                  n_splits, outFile,
                                                  summary_string,
                                                  plotting_string,
                                                  outFile_header)
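    # aggregation presumably reduces each stored metric to a mean/std across
    # folds, along the lines of this sketch (the 'auc' key is hypothetical):
    #
    #   import numpy as np
    #   aucs = [aggregated_statistics[i]['auc'] for i in aggregated_statistics]
    #   print('AUC: %0.3f +/- %0.3f' % (np.mean(aucs), np.std(aucs)))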

    ###################
    # Aggregate shap: #
    ###################

    if compute_informative_features:
        print('Computing informative features with Shap...')
        stats_utils.aggregate_shap(aggregated_statistics, rskf)
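        # a sketch of the cross-fold shap aggregation, using the keys stored
        # above (the top-20 cutoff is an arbitrary choice for illustration):
        #
        #   import numpy as np
        #   mean_shap = np.mean([aggregated_statistics[i]['shap_values_summed']
        #                        for i in aggregated_statistics], axis=0)
        #   top_kmers = np.argsort(mean_shap)[::-1][:20]  # most informative kmers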

#############
# Example 2 #
#############

#encoding_dims = []
encoding_dims = [8, 300, 4000]

for encoding_dim in encoding_dims:
    input_dim = len(data_normalized[0])  # this is the number of input kmers

    encoded_activation = 'relu'
    #encoded_activation = 'sigmoid'
    #encoded_activation = 'linear'
    #decoded_activation = 'softmax'
    decoded_activation = 'sigmoid'

    loss = 'binary_crossentropy'

    model = deep_learning_models.create_supervised_model(
        input_dim, encoding_dim, encoded_activation, decoded_activation)
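    # this variant of create_supervised_model takes an output activation instead
    # of dropout rates; a plausible sketch (the layer layout is an assumption,
    # and the loss defined above is presumably what gets compiled in):
    #
    #   from keras.models import Sequential
    #   from keras.layers import Dense
    #
    #   def create_supervised_model(input_dim, encoding_dim, encoded_activation,
    #                               decoded_activation):
    #       model = Sequential([
    #           Dense(encoding_dim, activation=encoded_activation,
    #                 input_shape=(input_dim,)),
    #           Dense(1, activation=decoded_activation)])
    #       model.compile(optimizer='adam', loss='binary_crossentropy',
    #                     metrics=['accuracy'])
    #       return model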

    #################
    # Fit the model #
    #################

    numEpochs = 1000
    batchSize = 32

    history = History()

    model.fit(data_normalized,
              labels,
              epochs=numEpochs,
              validation_split=0.2,
              batch_size=batchSize,
              callbacks=[history])
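    # with the History callback attached, the training curves can be inspected
    # after fitting, e.g. this sketch:
    #
    #   import matplotlib.pyplot as plt
    #   plt.plot(history.history['loss'], label='train')
    #   plt.plot(history.history['val_loss'], label='validation')
    #   plt.legend()
    #   plt.show()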

#############
# Example 3 #
#############

# standardize the data: mean=0, std=1
norm_input = True
if norm_input:
    x_train, x_test = stats_utils.standardize_data_bootstrap(
        data_normalized[train_idx], x_test, x_train)

###########################################
# set up a model (supervised learning)    #
###########################################
# note that the model has to be instantiated each time a new fold is
# started; otherwise the weights will not start from scratch.

input_dim = len(
    data_normalized[0])  # this is the number of input kmers

model = deep_learning_models.create_supervised_model(
    input_dim, encoding_dim, encoded_activation, input_dropout_pct,
    dropout_pct)

##################################################
# Fit the model with the train data of this fold #
##################################################
history = History()
# history.history is a dictionary; to see its keys, print(history.history.keys())

model.fit(x_train,
          y_train,
          epochs=num_epochs,
          batch_size=len(x_train),  # full batch: one gradient update per epoch
          shuffle=True,
          validation_data=(x_test, y_test),
          verbose=1,
          callbacks=[history])