Example #1
def train_nn_models(df: pd.DataFrame, opts: options.TrainOptions) -> None:
    """
    Train individual models for all targets (columns) present in the provided target data (y) and a multi-label
    model that classifies all targets at once. For each individual target the data is first subset to exclude NA
    values (for target associations). A random sample of the remaining data (size is the split fraction) is used for
    training and the remaining data for validation.

    :param opts: The command line arguments in the options class
    :param df: The dataframe containing x matrix and at least one column for a y target.
    """

    # find target columns
    names_y = [c for c in df.columns if c not in ['cid', 'id', 'mol_id', 'smiles', 'fp', 'inchi', 'fpcompressed']]

    # For each individual target train a model
    for target in names_y:
        x, y, opts = prepare_nn_training_data(df, target, opts)
        if x is None:
            continue

        logging.info(f"X training matrix of shape {x.shape} and type {x.dtype}")
        logging.info(f"Y training matrix of shape {y.shape} and type {y.dtype}")

        # from keras.wrappers.scikit_learn import KerasClassifier
        # from sklearn.model_selection import cross_val_score
        # from sklearn.model_selection import StratifiedKFold
        # estimator = KerasClassifier(build_fn=define_nn_model,
        #                             input_size=x.shape[1],
        #                             epochs=100,
        #                             batch_size=5,
        #                             verbose=0)
        # kfold = StratifiedKFold(n_splits=5, shuffle=True)
        # results = cross_val_score(estimator, x, y, cv=kfold, verbose=2, n_jobs=5)
        # print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

        # do a kfold cross validation for the FNN training
        kfold_c_validator = KFold(n_splits=opts.kFolds,
                                  shuffle=True,
                                  random_state=42)

        # store acc and loss for each fold
        all_scores = pd.DataFrame(columns=["fold_no",  # fold number of k-fold CV
                                           "loss", "val_loss", "acc", "val_acc",  # FNN training
                                           "loss_test", "acc_test"])  # FNN test data

        fold_no = 1

        # split the data
        for train, test in kfold_c_validator.split(x, y):

            logging.info("Training of fold number:" + str(fold_no))

            logging.info(f"The distribution of 0 and 1 values is:")
            logging.info(f"\ttrain data:\t{pd.DataFrame(y[train])[0].value_counts().to_list()}")
            logging.info(f"\ttest  data:\t{pd.DataFrame(y[test])[0].value_counts().to_list()}")

            model_name = target + "_compressed-" + str(opts.compressFeatures) + "_sampled-" + \
                         str(opts.sampleFractionOnes)

            # define all the output file/path names
            (model_file_path_weights, model_file_path_json, model_hist_path,
             model_validation, model_auc_file,
             model_auc_file_data, outfile_path, checkpoint_path,
             model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
                                                                       target=model_name,
                                                                       fold=fold_no)

            model = define_nn_model(input_size=x[train].shape[1])

            callback_list = nn_callback(checkpoint_path=checkpoint_path)

            # measure the training time
            start = time()
            # train and validate
            hist = model.fit(x[train], y[train],
                             callbacks=callback_list,
                             epochs=opts.epochs,
                             batch_size=256,
                             verbose=opts.verbose,
                             validation_split=opts.testingFraction)
            #                             validation_data=(x_test, y_test))  # this overwrites val_split!
            trainTime = str(round((time() - start) / 60, ndigits=2))

            logging.info("Computation time for training the single-label FNN:" + trainTime + "min")

            ht.store_and_plot_history(base_file_name=model_hist_path,
                                      hist=hist)

            # pd.DataFrame(hist.history).to_csv(model_hist_csv_path)

            # validate model on test data set (x_test, y_test)
            scores = validate_model_on_test_data(x[test], checkpoint_path, y[test],
                                                 "FNN", model_validation, target,
                                                 model_auc_file_data, model_auc_file)

            idx = hist.history['val_loss'].index(min(hist.history['val_loss']))

            row_df = pd.DataFrame([[fold_no,
                                    hist.history['loss'][idx], hist.history['val_loss'][idx],
                                    hist.history['my_acc'][idx], hist.history['val_my_acc'][idx],
                                    scores[0], scores[1], scores[2]]],
                                  columns=["fold_no",  # fold number of k-fold CV
                                           "loss", "val_loss", "acc", "val_acc",  # FNN training
                                           "loss_test", "acc_test", "mcc_test"]
                                  )
            logging.info(row_df)
            all_scores = pd.concat([all_scores, row_df], ignore_index=True)
            fold_no += 1
            del model
            # now next fold

        logging.info(all_scores)

        # finalize model
        # 1. provide best performing fold variant
        # select best model based on MCC
        idx2 = all_scores[['mcc_test']].idxmax().ravel()[0]
        fold_no = all_scores.iloc[idx2]['fold_no']

        model_name = target + "_compressed-" + str(opts.compressFeatures) + "_sampled-" + str(
            opts.sampleFractionOnes) + '.Fold-' + str(fold_no)
        checkpoint_path = str(opts.outputDir) + "/" + model_name + '.checkpoint.model.hdf5'

        best_model_file = checkpoint_path.replace("Fold-" + str(fold_no) + ".checkpoint", "best.FNN")

        # store all scores
        file = re.sub(".hdf5", "scores.csv", re.sub("Fold-..checkpoint", "Fold-All", checkpoint_path))
        all_scores.to_csv(file)

        # copy best DNN model
        shutil.copyfile(checkpoint_path, best_model_file)
        logging.info("Best model for FNN is saved: " + best_model_file)

        # AND retrain with full data set
        full_model_file = checkpoint_path.replace("Fold-" + str(fold_no) + ".checkpoint", "full.FNN")
        (model_file_path_weights, model_file_path_json, model_hist_path,
         model_validation, model_auc_file,
         model_auc_file_data, out_file_path, checkpoint_path,
         model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
                                                                   target=target + "_compressed-" + str(
                                                                       opts.compressFeatures) + "_sampled-" + str(
                                                                       opts.sampleFractionOnes))
        # measure the training time
        start = time()

        model = define_nn_model(input_size=x.shape[1])
        callback_list = nn_callback(checkpoint_path=full_model_file)

        # train and validate
        hist = model.fit(x, y,
                         callbacks=callback_list,
                         epochs=opts.epochs,
                         batch_size=256,
                         verbose=opts.verbose,
                         validation_split=opts.testingFraction)

        trainTime = str(round((time() - start) / 60,
                              ndigits=2))

        logging.info("Computation time for training the full classification FNN: " + trainTime + "min")

        model_hist_path = full_model_file.replace(".hdf5", "")
        ht.store_and_plot_history(base_file_name=model_hist_path,
                                  hist=hist)

        # pd.DataFrame(hist.history).to_csv(full_model_file.replace(".hdf5", ".history.csv"))

        del model
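
The per-fold bookkeeping in Example #1 (pick the epoch with the lowest validation loss, then pick the fold with the highest test MCC) can be reproduced in isolation. The following is a minimal sketch with made-up numbers, assuming only pandas; the history keys and column names mirror the ones used above:

import pandas as pd

# Toy Keras-style history for one fold: epoch 2 has the lowest validation loss.
history = {"loss": [0.70, 0.45, 0.30, 0.28],
           "val_loss": [0.68, 0.50, 0.41, 0.44],
           "my_acc": [0.55, 0.70, 0.82, 0.85],
           "val_my_acc": [0.52, 0.68, 0.78, 0.76]}

# Index of the best epoch = epoch with minimal validation loss (as in the code above).
idx = history["val_loss"].index(min(history["val_loss"]))
print(idx)  # 2

# Toy cross-validation summary: one row per fold, scored on the held-out test split.
all_scores = pd.DataFrame({"fold_no": [1, 2, 3],
                           "loss_test": [0.40, 0.35, 0.44],
                           "acc_test": [0.80, 0.83, 0.79],
                           "mcc_test": [0.55, 0.62, 0.51]})

# The fold with the highest Matthews correlation coefficient provides the "best" model.
best_idx = all_scores["mcc_test"].idxmax()
best_fold = all_scores.loc[best_idx, "fold_no"]
print(best_fold)  # 2
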
Example #2
def train_nn_models_multi(df: pd.DataFrame, opts: options.TrainOptions) -> None:
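    """
    Train a single multi-label model that classifies all targets (columns) at once. Rows with an
    NA value in any target column are excluded. The model is evaluated with k-fold cross
    validation, the best-performing fold (highest F1 score on the test split) is kept, and the
    model is finally retrained on the full data set.

    :param opts: The command line arguments in the options class
    :param df: The dataframe containing the x matrix and the y target columns.
    """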
    # find target columns
    names_y = [c for c in df.columns if c not in ['id', 'smiles', 'fp', 'inchi', 'fpcompressed']]
    selector = df[names_y].notna().apply(np.logical_and.reduce, axis=1)

    if opts.compressFeatures:
        # get compressed fingerprints as numpy array
        fpMatrix = np.array(
            df[df['fpcompressed'].notnull() & selector]['fpcompressed'].to_list(),
            dtype=settings.nn_multi_fp_compressed_numpy_type,
            copy=settings.numpy_copy_values)
        y = np.array(
            df[df['fpcompressed'].notnull() & selector][names_y],
            dtype=settings.nn_multi_target_numpy_type,
            copy=settings.numpy_copy_values)
    else:
        # get fingerprints as numpy array
        fpMatrix = np.array(
            df[df['fp'].notnull() & selector]['fp'].to_list(),
            dtype=settings.nn_multi_fp_numpy_type,
            copy=settings.numpy_copy_values)

        y = np.array(
            df[df['fp'].notnull() & selector][names_y],
            dtype=settings.nn_multi_target_numpy_type,
            copy=settings.numpy_copy_values)

    # do a kfold cross validation for the multi-label FNN training
    kfold_c_validator = KFold(n_splits=opts.kFolds,
                              shuffle=True,
                              random_state=42)

    # store acc and loss for each fold
    all_scores = pd.DataFrame(columns=["fold_no",  # fold number of k-fold CV
                                       "loss", "val_loss", "acc", "val_acc",  # FNN training
                                       "f1_random", "f1_trained"])  # F1 scores of predictions

    fold_no = 1

    # split the data
    for train, test in kfold_c_validator.split(fpMatrix, y):

        (model_file_path_weights, model_file_path_json, model_hist_path,
         model_validation, model_auc_file,
         model_auc_file_data, out_file_path, checkpoint_path,
         model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
                                                                   target="multi" + "_compressed-" + str(
                                                                       opts.compressFeatures),
                                                                   fold=fold_no)

        # use a dnn for multi-label prediction
        model = define_nn_model_multi(input_size=fpMatrix[train].shape[1],
                                      output_size=y.shape[1])

        callback_list = nn_callback(checkpoint_path=checkpoint_path)
        # measure the training time
        start = time()

        # train and validate
        hist = model.fit(fpMatrix[train], y[train],
                         callbacks=callback_list,
                         epochs=opts.epochs,
                         batch_size=256,
                         verbose=opts.verbose,
                         validation_split=opts.testingFraction)

        trainTime = str(round((time() - start) / 60, ndigits=2))

        if opts.verbose > 0:
            logging.info("Computation time for training the multi-label FNN: " + trainTime + " min")

        ht.store_and_plot_history(base_file_name=model_hist_path,
                                  hist=hist)
        # pd.DataFrame(hist.history).to_csv(model_hist_csv_path)

        # validate model on test data set (fpMatrix_test, y_test)
        scores = validate_multi_model_on_test_data(x_test=fpMatrix[test],
                                                   checkpoint_path=checkpoint_path,
                                                   y_test=y[test],
                                                   col_names=names_y,
                                                   result_file=out_file_path.replace("trainingResults.txt",
                                                                                     "predictionResults.csv"))

        idx = hist.history['val_loss'].index(min(hist.history['val_loss']))
        row_df = pd.DataFrame([[fold_no,
                                hist.history['loss'][idx], hist.history['val_loss'][idx],
                                hist.history['accuracy'][idx], hist.history['val_accuracy'][idx],
                                scores[0], scores[1]]],
                              columns=["fold_no",  # fold number of k-fold CV
                                       "loss", "val_loss", "acc", "val_acc", "f1_random", "f1_trained"]
                              )

        logging.info(row_df)
        all_scores = pd.concat([all_scores, row_df], ignore_index=True)

        fold_no += 1
        del model

    logging.info(all_scores)

    # finalize model
    # 1. provide best performing fold variant
    # select best model based on the F1 score of the trained model
    idx2 = all_scores[['f1_trained']].idxmax().ravel()[0]
    fold_no = all_scores.iloc[idx2]['fold_no']

    model_name = "multi" + "_compressed-" + str(opts.compressFeatures) + '.Fold-' + str(fold_no)
    checkpoint_path = opts.outputDir + '/' + model_name + '.checkpoint.model.hdf5'
    best_model_file = checkpoint_path.replace("Fold-" + str(fold_no) + ".checkpoint.", "best.FNN-")

    file = re.sub(".hdf5", "scores.csv", re.sub("Fold-..checkpoint", "Fold-All", checkpoint_path))
    all_scores.to_csv(file)

    # copy best DNN model
    shutil.copyfile(checkpoint_path, best_model_file)
    logging.info("Best models for FNN is saved:\n" + best_model_file)

    # AND retrain with full data set
    full_model_file = checkpoint_path.replace("Fold-" + str(fold_no) + ".checkpoint", "full.FNN-")

    (model_file_path_weights, model_file_path_json, model_hist_path,
     model_validation, model_auc_file,
     model_auc_file_data, out_file_path, checkpoint_path,
     model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
                                                               target="multi" + "_compressed-" + str(
                                                                   opts.compressFeatures))

    # measure the training time
    start = time()

    model = define_nn_model_multi(input_size=fpMatrix.shape[1],
                                  output_size=y.shape[1])
    callback_list = nn_callback(checkpoint_path=full_model_file)
    # train and validate
    hist = model.fit(fpMatrix, y,
                     callbacks=callback_list,
                     epochs=opts.epochs,
                     batch_size=256,
                     verbose=opts.verbose,
                     validation_split=opts.testingFraction)

    trainTime = str(round((time() - start) / 60,
                          ndigits=2))

    logging.info("Computation time for training the full multi-label FNN: " + trainTime + " min")
    ht.store_and_plot_history(base_file_name=model_hist_path, hist=hist)
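
The row filter at the top of Example #2, `df[names_y].notna().apply(np.logical_and.reduce, axis=1)`, keeps only molecules that have a measured value for every target. A self-contained sketch on toy data; np.float32 is an assumed stand-in for the dtypes taken from `settings` in the original:

import numpy as np
import pandas as pd

# Toy data: two target columns, the last row has a missing target value.
df = pd.DataFrame({"smiles": ["C", "CC", "CCC"],
                   "fp": [[0, 1], [1, 1], [1, 0]],
                   "AR": [1.0, 0.0, np.nan],
                   "ER": [0.0, 1.0, 1.0]})

names_y = [c for c in df.columns if c not in ["id", "smiles", "fp", "inchi", "fpcompressed"]]

# True only where *all* target columns are non-NA (row-wise logical AND).
selector = df[names_y].notna().apply(np.logical_and.reduce, axis=1)

# Fingerprints and targets of the complete rows, as dense numpy arrays.
fpMatrix = np.array(df[df["fp"].notnull() & selector]["fp"].to_list(), dtype=np.float32)
y = np.array(df[df["fp"].notnull() & selector][names_y], dtype=np.float32)
print(fpMatrix.shape, y.shape)  # (2, 2) (2, 2)
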
Example #3
def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
    """
    Train an autoencoder on the given feature matrix X. Response matrix is only used to
    split meaningfully in test and train data set.

    :param opts: Command line arguments as defined in options.py
    :param df: Pandas dataframe that contains the smiles/inchi data for training the autoencoder
    :return: The encoder model of the trained autoencoder
    """

    # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!)
    (autoencoder, encoder) = define_ac_model(input_size=opts.fpSize,
                                             encoding_dim=opts.encFPSize)

    # define output file for autoencoder and encoder weights
    if opts.ecWeightsFile == "":
        logging.info("No AC encoder weights file specified")
        base_file_name = os.path.splitext(basename(opts.inputFile))[0]
        ac_weights_file = os.path.join(opts.outputDir,
                                       base_file_name + ".autoencoder.hdf5")
        ec_weights_file = os.path.join(opts.outputDir,
                                       base_file_name + ".encoder.hdf5")
    else:
        logging.info(f"AC encoder will be saved")
        base_file_name = os.path.splitext(basename(opts.ecWeightsFile))[0]
        ac_weights_file = os.path.join(opts.outputDir,
                                       base_file_name + ".autoencoder.hdf5")
        ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)

    # collect the callbacks for training
    callback_list = autoencoder_callback(checkpoint_path=ac_weights_file)

    # Select all fps that are valid and turn them into a numpy array
    # This step is crucial for speed!!!
    fp_matrix = np.array(df[df["fp"].notnull()]["fp"].to_list(),
                         dtype=settings.ac_fp_numpy_type,
                         copy=settings.numpy_copy_values)
    logging.info(
        f"Training AC on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}"
    )

    # split data into test and training data
    x_train, x_test = train_test_split(fp_matrix,
                                       test_size=0.2,
                                       random_state=42)
    logging.info(
        f"AC train data shape {x_train.shape} with type {x_train.dtype}")
    logging.info(f"AC test data shape {x_test.shape} with type {x_test.dtype}")

    auto_hist = autoencoder.fit(x_train,
                                x_train,
                                callbacks=callback_list,
                                epochs=opts.epochs,
                                batch_size=256,
                                verbose=opts.verbose,
                                validation_data=(x_test, x_test))
    logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")

    ht.store_and_plot_history(base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"),
                              hist=auto_hist)

    encoder.save_weights(ec_weights_file)
    logging.info(f"Encoder weights stored in file: {ec_weights_file}")

    return encoder
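
The input preparation in Example #3 (convert the list-valued `fp` column into one dense numpy matrix, then split it, with the matrix serving as both input and reconstruction target) can be sketched on toy data. A minimal sketch; `np.float32` is an assumed stand-in for `settings.ac_fp_numpy_type`:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataframe: one row has no fingerprint and must be dropped first.
df = pd.DataFrame({"smiles": ["C", "CC", "CCC", "CCO", "CO"],
                   "fp": [[0, 1, 1, 0], [1, 1, 0, 0], None, [0, 0, 1, 1], [1, 0, 1, 0]]})

# Converting the list-valued column into one dense numpy array up front is what
# makes the subsequent Keras training fast.
fp_matrix = np.array(df[df["fp"].notnull()]["fp"].to_list(), dtype=np.float32)
print(fp_matrix.shape)  # (4, 4)

# 80/20 split; for the autoencoder the input is also the reconstruction target,
# i.e. autoencoder.fit(x_train, x_train, validation_data=(x_test, x_test)).
x_train, x_test = train_test_split(fp_matrix, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape)  # (3, 4) (1, 4)
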