def train_nn_models(df: pd.DataFrame, opts: options.TrainOptions) -> None:
    """
    Train one single-label feed-forward network (FNN) for every target column of `df`.

    Every column that is not one of the known identifier/structure/fingerprint columns is treated
    as a target. For each target the training matrices are obtained from
    `prepare_nn_training_data`; targets for which that helper returns `None` as feature matrix are
    skipped. The model is then trained in a k-fold cross validation (`opts.kFolds` folds, fixed
    random state). Within each fold, Keras holds out the fraction `opts.testingFraction` of the
    fold's training part for validation; the fold's test part is passed together with the fold's
    checkpoint path to `validate_model_on_test_data`, whose first three return values are recorded
    as `loss_test`, `acc_test` and `mcc_test`.

    After the cross validation, per target:

    1. the scores of all folds are written to a CSV file,
    2. the checkpoint of the fold with the highest `mcc_test` is copied to a "best.FNN" file,
    3. a fresh model is trained on the complete data of the target and checkpointed to a
       "full.FNN" file.

    :param df: The dataframe containing the feature columns and at least one target column
    :param opts: The command line arguments in the options class
    """
    # every column that is not listed here is interpreted as a target
    non_target_columns = ('cid', 'id', 'mol_id', 'smiles', 'fp', 'inchi', 'fpcompressed')
    names_y = [c for c in df.columns if c not in non_target_columns]

    score_columns = ["fold_no",  # fold number of k-fold CV
                     "loss", "val_loss", "acc", "val_acc",  # FNN training
                     "loss_test", "acc_test", "mcc_test"]  # FNN test data

    # For each individual target train a model
    for target in names_y:
        # NOTE: the helper hands back an options object which replaces `opts` for all
        # following targets as well (behaviour kept from the original implementation)
        x, y, opts = prepare_nn_training_data(df, target, opts)
        if x is None:
            continue

        logging.info(f"X training matrix of shape {x.shape} and type {x.dtype}")
        logging.info(f"Y training matrix of shape {y.shape} and type {y.dtype}")

        # base name of all output files of this target (identical for every fold)
        model_name = target + "_compressed-" + str(opts.compressFeatures) + "_sampled-" + \
            str(opts.sampleFractionOnes)
        input_size = x.shape[1]

        # do a kfold cross validation for the FNN training
        kfold_c_validator = KFold(n_splits=opts.kFolds, shuffle=True, random_state=42)

        # one single-row data frame with acc and loss per fold; joined after the loop because
        # DataFrame.append is not available anymore in pandas >= 2.0
        fold_rows = []

        for fold_no, (train, test) in enumerate(kfold_c_validator.split(x, y), start=1):
            logging.info("Training of fold number:" + str(fold_no))
            logging.info(f"The distribution of 0 and 1 values is:")
            logging.info(f"\ttrain data:\t{pd.DataFrame(y[train])[0].value_counts().to_list()}")
            logging.info(f"\ttest data:\t{pd.DataFrame(y[test])[0].value_counts().to_list()}")

            # define all the output file/path names; only the ones needed below are kept
            (_, _, model_hist_path, model_validation, model_auc_file,
             model_auc_file_data, _, checkpoint_path,
             _, _) = define_out_file_names(path_prefix=opts.outputDir,
                                           target=model_name,
                                           fold=fold_no)

            model = define_nn_model(input_size=input_size)
            callback_list = nn_callback(checkpoint_path=checkpoint_path)

            # measure the training time
            start = time()
            # train and validate; passing validation_data here would override validation_split
            hist = model.fit(x[train], y[train],
                             callbacks=callback_list,
                             epochs=opts.epochs,
                             batch_size=256,
                             verbose=opts.verbose,
                             validation_split=opts.testingFraction)
            train_time = str(round((time() - start) / 60, ndigits=2))
            logging.info("Computation time for training the single-label FNN:" + train_time + "min")

            ht.store_and_plot_history(base_file_name=model_hist_path, hist=hist)

            # validate model on the test part of this fold
            scores = validate_model_on_test_data(x[test], checkpoint_path, y[test], "FNN",
                                                 model_validation, target,
                                                 model_auc_file_data, model_auc_file)

            # report the training metrics of the epoch with the lowest validation loss
            idx = hist.history['val_loss'].index(min(hist.history['val_loss']))

            row_df = pd.DataFrame([[fold_no,
                                    hist.history['loss'][idx], hist.history['val_loss'][idx],
                                    hist.history['my_acc'][idx], hist.history['val_my_acc'][idx],
                                    scores[0], scores[1], scores[2]]],
                                  columns=score_columns)
            logging.info(row_df)
            fold_rows.append(row_df)

            # release the model before the next fold builds a new one
            del model

        all_scores = pd.concat(fold_rows, ignore_index=True)
        logging.info(all_scores)

        # finalize model
        # 1. provide best performing fold variant, selected by the MCC on the test data.
        # The explicit int() keeps the fold number free of a trailing ".0" in the file name.
        best_row = all_scores['mcc_test'].idxmax()
        best_fold = int(all_scores.loc[best_row, 'fold_no'])

        # rebuild the checkpoint path of the best fold
        # NOTE(review): assumes define_out_file_names uses the same naming scheme - confirm
        checkpoint_path = (str(opts.outputDir) + "/" + model_name + '.Fold-' + str(best_fold) +
                           '.checkpoint.model.hdf5')
        fold_tag = "Fold-" + str(best_fold) + ".checkpoint"
        best_model_file = checkpoint_path.replace(fold_tag, "best.FNN")

        # store all scores; the pattern accepts fold numbers with any number of digits
        scores_file = re.sub(r"\.hdf5$", "scores.csv",
                             re.sub(r"Fold-\d+\.checkpoint", "Fold-All", checkpoint_path))
        all_scores.to_csv(scores_file)

        # copy best FNN model
        shutil.copyfile(checkpoint_path, best_model_file)
        logging.info("Best model for FNN is saved: " + best_model_file)

        # 2. AND retrain with full data set
        full_model_file = checkpoint_path.replace(fold_tag, "full.FNN")

        # NOTE(review): none of the returned paths is needed for the full model; the call is
        # kept only in case the helper has side effects - confirm and drop it if it is pure
        define_out_file_names(path_prefix=opts.outputDir, target=model_name)

        # measure the training time
        start = time()

        model = define_nn_model(input_size=input_size)
        callback_list = nn_callback(checkpoint_path=full_model_file)

        # train and validate
        hist = model.fit(x, y,
                         callbacks=callback_list,
                         epochs=opts.epochs,
                         batch_size=256,
                         verbose=opts.verbose,
                         validation_split=opts.testingFraction)

        train_time = str(round((time() - start) / 60, ndigits=2))
        logging.info("Computation time for training the full classification FNN: " + train_time + "min")

        # the history files of the full model share the base name of its checkpoint file
        ht.store_and_plot_history(base_file_name=full_model_file.replace(".hdf5", ""), hist=hist)

        del model
def train_nn_models_multi(df: pd.DataFrame, opts: options.TrainOptions) -> None:
    """
    Train a multi-label feed-forward network (FNN) that predicts all target columns at once.

    Every column that is not one of the known identifier/structure/fingerprint columns is treated
    as a target. Only rows that have a fingerprint (compressed or uncompressed, depending on
    `opts.compressFeatures`) and a value for every target are used. The model is trained in a
    k-fold cross validation (`opts.kFolds` folds, fixed random state). Within each fold, Keras
    holds out the fraction `opts.testingFraction` of the fold's training part for validation; the
    fold's test part is passed together with the fold's checkpoint path to
    `validate_multi_model_on_test_data`, whose first two return values are recorded as
    `f1_random` and `f1_trained`.

    After the cross validation:

    1. the scores of all folds are written to a CSV file,
    2. the checkpoint of the fold with the highest `f1_trained` is copied to a "best.FNN-" file,
    3. a fresh model is trained on the complete data and checkpointed to a "full.FNN-" file.

    :param df: The dataframe containing the fingerprint column and the target columns
    :param opts: The command line arguments in the options class
    """
    # every column that is not listed here is interpreted as a target; the list matches the one
    # used for the single-label models so that identifier columns never end up as labels
    non_target_columns = ('cid', 'id', 'mol_id', 'smiles', 'fp', 'inchi', 'fpcompressed')
    names_y = [c for c in df.columns if c not in non_target_columns]

    # rows that have a value for every target
    selector = df[names_y].notna().all(axis=1)

    # pick fingerprint column and numpy type depending on whether compressed features are used
    if opts.compressFeatures:
        fp_column = 'fpcompressed'
        fp_dtype = settings.nn_multi_fp_compressed_numpy_type
    else:
        fp_column = 'fp'
        fp_dtype = settings.nn_multi_fp_numpy_type

    usable_rows = df[df[fp_column].notnull() & selector]
    fp_matrix = np.array(usable_rows[fp_column].to_list(),
                         dtype=fp_dtype,
                         copy=settings.numpy_copy_values)
    y = np.array(usable_rows[names_y],
                 dtype=settings.nn_multi_target_numpy_type,
                 copy=settings.numpy_copy_values)

    # base name of all output files (identical for every fold)
    model_name = "multi" + "_compressed-" + str(opts.compressFeatures)
    input_size = fp_matrix.shape[1]
    output_size = y.shape[1]

    score_columns = ["fold_no",  # fold number of k-fold CV
                     "loss", "val_loss", "acc", "val_acc",  # FNN training
                     "f1_random", "f1_trained"]  # F1 scores of predictions

    # do a kfold cross validation for the FNN training
    kfold_c_validator = KFold(n_splits=opts.kFolds, shuffle=True, random_state=42)

    # one single-row data frame with acc and loss per fold; joined after the loop because
    # DataFrame.append is not available anymore in pandas >= 2.0
    fold_rows = []

    for fold_no, (train, test) in enumerate(kfold_c_validator.split(fp_matrix, y), start=1):
        # define all the output file/path names; only the ones needed below are kept
        (_, _, model_hist_path, _, _,
         _, out_file_path, checkpoint_path,
         _, _) = define_out_file_names(path_prefix=opts.outputDir,
                                       target=model_name,
                                       fold=fold_no)

        # use a dnn for multi-label prediction
        model = define_nn_model_multi(input_size=input_size, output_size=output_size)
        callback_list = nn_callback(checkpoint_path=checkpoint_path)

        # measure the training time
        start = time()
        # train and validate
        hist = model.fit(fp_matrix[train], y[train],
                         callbacks=callback_list,
                         epochs=opts.epochs,
                         batch_size=256,
                         verbose=opts.verbose,
                         validation_split=opts.testingFraction)
        train_time = str(round((time() - start) / 60, ndigits=2))

        if opts.verbose > 0:
            logging.info("Computation time for training the multi-label FNN: " + train_time + " min")

        ht.store_and_plot_history(base_file_name=model_hist_path, hist=hist)

        # validate model on the test part of this fold
        scores = validate_multi_model_on_test_data(
            x_test=fp_matrix[test],
            checkpoint_path=checkpoint_path,
            y_test=y[test],
            col_names=names_y,
            result_file=out_file_path.replace("trainingResults.txt", "predictionResults.csv"))

        # report the training metrics of the epoch with the lowest validation loss
        idx = hist.history['val_loss'].index(min(hist.history['val_loss']))
        row_df = pd.DataFrame([[fold_no,
                                hist.history['loss'][idx], hist.history['val_loss'][idx],
                                hist.history['accuracy'][idx], hist.history['val_accuracy'][idx],
                                scores[0], scores[1]]],
                              columns=score_columns)
        logging.info(row_df)
        fold_rows.append(row_df)

        # release the model before the next fold builds a new one
        del model

    all_scores = pd.concat(fold_rows, ignore_index=True)
    logging.info(all_scores)

    # finalize model
    # 1. provide best performing fold variant, selected by the F1 score of the trained model.
    # The explicit int() keeps the fold number free of a trailing ".0" in the file name.
    best_row = all_scores['f1_trained'].idxmax()
    best_fold = int(all_scores.loc[best_row, 'fold_no'])

    # rebuild the checkpoint path of the best fold
    # NOTE(review): assumes define_out_file_names uses the same naming scheme - confirm
    checkpoint_path = (str(opts.outputDir) + '/' + model_name + '.Fold-' + str(best_fold) +
                       '.checkpoint.model.hdf5')
    fold_tag = "Fold-" + str(best_fold) + ".checkpoint"

    best_model_file = checkpoint_path.replace(fold_tag + ".", "best.FNN-")

    # store all scores; the pattern accepts fold numbers with any number of digits
    scores_file = re.sub(r"\.hdf5$", "scores.csv",
                         re.sub(r"Fold-\d+\.checkpoint", "Fold-All", checkpoint_path))
    all_scores.to_csv(scores_file)

    # copy best FNN model
    shutil.copyfile(checkpoint_path, best_model_file)
    logging.info("Best models for FNN is saved:\n" + best_model_file)

    # 2. AND retrain with full data set
    full_model_file = checkpoint_path.replace(fold_tag, "full.FNN-")

    # output names without a fold number; only the history base name is used below
    (_, _, model_hist_path, _, _,
     _, _, _,
     _, _) = define_out_file_names(path_prefix=opts.outputDir,
                                   target=model_name)

    # measure the training time
    start = time()

    model = define_nn_model_multi(input_size=input_size, output_size=output_size)
    callback_list = nn_callback(checkpoint_path=full_model_file)

    # train and validate
    hist = model.fit(fp_matrix, y,
                     callbacks=callback_list,
                     epochs=opts.epochs,
                     batch_size=256,
                     verbose=opts.verbose,
                     validation_split=opts.testingFraction)

    train_time = str(round((time() - start) / 60, ndigits=2))
    logging.info("Computation time for training the full multi-label FNN: " + train_time + " min")

    ht.store_and_plot_history(base_file_name=model_hist_path, hist=hist)
def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
    """
    Fit the autoencoder on all fingerprints available in `df` and hand back its encoder.

    The rows of `df` that have a value in the "fp" column are turned into one numpy matrix,
    which is split 80/20 (fixed random state) into a training and a validation part. The
    autoencoder is trained to reproduce its input; the callbacks receive the autoencoder
    weights file as checkpoint path. After training, the encoder weights are saved to their
    own file.

    Output file names are derived from `opts.ecWeightsFile` if it is set, otherwise from the
    base name of `opts.inputFile`; all files are placed in `opts.outputDir`.

    :param opts: Command line arguments as defined in options.py
    :param df: Pandas dataframe that contains a fingerprint column "fp" for training the autoencoder
    :return: The encoder model of the trained autoencoder
    """

    # Build autoencoder and encoder for the configured input size and bottle neck dimension
    autoencoder, encoder = define_ac_model(input_size=opts.fpSize,
                                           encoding_dim=opts.encFPSize)

    # Decide where the autoencoder and encoder weights go
    target_dir = opts.outputDir
    if opts.ecWeightsFile != "":
        logging.info("AC encoder will be saved")
        base_file_name = os.path.splitext(basename(opts.ecWeightsFile))[0]
        ec_weights_file = os.path.join(target_dir, opts.ecWeightsFile)
    else:
        logging.info("No AC encoder weights file specified")
        base_file_name = os.path.splitext(basename(opts.inputFile))[0]
        ec_weights_file = os.path.join(target_dir, base_file_name + ".encoder.hdf5")
    ac_weights_file = os.path.join(target_dir, base_file_name + ".autoencoder.hdf5")

    # Callbacks used during training
    training_callbacks = autoencoder_callback(checkpoint_path=ac_weights_file)

    # Gather every available fingerprint in a single numpy array up front
    # (the original author marks this conversion as crucial for speed)
    rows_with_fp = df[df["fp"].notnull()]
    fp_matrix = np.array(rows_with_fp["fp"].to_list(),
                         dtype=settings.ac_fp_numpy_type,
                         copy=settings.numpy_copy_values)
    logging.info(f"Training AC on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}")

    # Hold out 20% of the fingerprints for validation
    x_train, x_test = train_test_split(fp_matrix, test_size=0.2, random_state=42)
    logging.info(f"AC train data shape {x_train.shape} with type {x_train.dtype}")
    logging.info(f"AC test data shape {x_test.shape} with type {x_test.dtype}")

    # Input and expected output are the same matrix: the network learns to reconstruct it
    auto_hist = autoencoder.fit(x_train, x_train,
                                callbacks=training_callbacks,
                                epochs=opts.epochs,
                                batch_size=256,
                                verbose=opts.verbose,
                                validation_data=(x_test, x_test))
    logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")

    history_base_name = os.path.join(opts.outputDir, base_file_name + ".AC")
    ht.store_and_plot_history(base_file_name=history_base_name, hist=auto_hist)

    encoder.save_weights(ec_weights_file)
    logging.info(f"Encoder weights stored in file: {ec_weights_file}")

    return encoder