Example #1
            def create_mod(params):
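                # NB: this snippet is a nested function; data_name, n_gpus,
                # bridge_type, max_length, vocab_size, tokens, the x_*_enum_tokens /
                # y_*_enum sets and the bayopt_* settings are closed over from the
                # enclosing Main() shown in Example #2.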
                print('Model: {}'.format(params))

                model_tag = data_name

                K.clear_session()

                if n_gpus > 1:
                    if bridge_type == 'NVLink':
                        model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                              vocabsize = vocab_size,
                                                              lstmunits = int(params[:,0][0]),
                                                              denseunits = int(params[:,1][0]),
                                                              embedding = int(params[:,2][0]))
                    else:
                        with tf.device('/cpu'): # necessary for multi-GPU scaling
                            model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                                  vocabsize = vocab_size,
                                                                  lstmunits = int(params[:,0][0]),
                                                                  denseunits = int(params[:,1][0]),
                                                                  embedding = int(params[:,2][0]))
                            
                    multi_model = model.ModelMGPU(model_opt, gpus=n_gpus, bridge_type=bridge_type)
                else: # single GPU
                    model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                          vocabsize = vocab_size,
                                                          lstmunits = int(params[:,0][0]),
                                                          denseunits = int(params[:,1][0]),
                                                          embedding = int(params[:,2][0]))
                    
                    multi_model = model_opt

                batch_size = int(params[:,3][0])
                # The learning rate is optimized through its exponent alpha in 10**(-alpha)
                custom_adam = Adam(lr=math.pow(10,-float(params[:,4][0])))
                multi_model.compile(loss='mse', optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])

                history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                             vocab = tokens, 
                                                                             max_length = max_length, 
                                                                             props_set = y_train_enum, 
                                                                             batch_size = batch_size), 
                                                    steps_per_epoch = math.ceil(len(x_train_enum_tokens)/batch_size)//bayopt_it_factor,
                                                    validation_data = DataSequence(x_valid_enum_tokens,
                                                                                   vocab = tokens, 
                                                                                   max_length = max_length, 
                                                                                   props_set = y_valid_enum, 
                                                                                   batch_size = min(len(x_valid_enum_tokens), batch_size)),
                                                    validation_steps = math.ceil(len(x_valid_enum_tokens)/min(len(x_valid_enum_tokens), batch_size))//bayopt_it_factor, 
                                                    epochs = bayopt_n_epochs, 
                                                    shuffle = True,
                                                    initial_epoch = 0, 
                                                    verbose = 0)

                best_epoch = np.argmin(history.history['val_loss'])
                mae_valid = history.history['val_mean_absolute_error'][best_epoch]
                mse_valid = history.history['val_mean_squared_error'][best_epoch]
                if math.isnan(mse_valid): # discard diverging architectures (rare event)
                    mae_valid = math.inf
                    mse_valid = math.inf
                print('Valid MAE: {0:0.4f}, MSE: {1:0.4f}'.format(mae_valid, mse_valid))

                return mse_valid
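
GPyOpt calls the objective function with a 2-D array of shape (1, n_variables), one column per optimized hyperparameter, which is why the snippet above reads each value as params[:,i][0]. Below is a minimal sketch of a matching bayopt_bounds search space in GPyOpt's domain format; the value grids are illustrative assumptions, not taken from the source, and only the entry order must match the params[:,0]...params[:,4] indexing.

# Hypothetical search space; entry order must match the params indexing above.
bayopt_bounds = [
    {'name': 'lstmunits',  'type': 'discrete',   'domain': (16, 32, 64, 128, 256, 512)},
    {'name': 'denseunits', 'type': 'discrete',   'domain': (16, 32, 64, 128, 256, 512)},
    {'name': 'embedding',  'type': 'discrete',   'domain': (16, 32, 64, 128, 256, 512)},
    {'name': 'batch_size', 'type': 'discrete',   'domain': (16, 32, 64, 128)},
    {'name': 'alpha',      'type': 'continuous', 'domain': (2.0, 4.0)},  # learning rate 10**(-alpha)
]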
Example #2
def Main(data, 
         data_name, 
         bayopt_bounds, 
         data_units = '',
         k_fold_number = 8, 
         augmentation = False, 
         outdir = "../data/", 
         bayopt_n_epochs = 10,
         bayopt_n_rounds = 25, 
         bayopt_it_factor = 1, 
         bayopt_on = True, 
         lstmunits_ref = 512, 
         denseunits_ref = 512, 
         embedding_ref = 512, 
         batch_size_ref = 64, 
         alpha_ref = 3, 
         n_gpus = 1, 
         bridge_type = 'None', 
         patience = 25, 
         n_epochs = 1000):
    
    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'
        
    save_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)
        
    print("***SMILES_X starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    # Train/validation/test splitting: random 80/10/10 % splits, with a different seed for each of the k_fold_number folds
    for ifold in range(k_fold_number):
        
        print("******")
        print("***Fold #{} initiated...***".format(ifold))
        print("******")
        
        print("***Sampling and splitting of the dataset.***\n")
        x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles, 
                           prop_input=np.array(data.iloc[:,1]), 
                           random_state=seed_list[ifold], 
                           scaling = True)
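        # With scaling=True, random_split returns the property values in scaled
        # form together with the fitted `scaler`, which is used further below to
        # inverse-transform the predictions.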
              
        # Data augmentation or not
        if augmentation:
            print("***Data augmentation is enabled.***\n")
            canonical = False
            rotation = True
        else:
            print("***No data augmentation has been requested.***\n")
            canonical = True
            rotation = False
            
        x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

        x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

        x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)
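        # The *_enum_card outputs record how many augmented copies were generated
        # per original SMILES, so that per-molecule means can be taken over the
        # predictions later on (see utils.mean_median_result below).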
        
        print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
        format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))
        
        print("***Tokenization of SMILES.***\n")
        # Tokenize SMILES per dataset
        x_train_enum_tokens = token.get_tokens(x_train_enum)
        x_valid_enum_tokens = token.get_tokens(x_valid_enum)
        x_test_enum_tokens = token.get_tokens(x_test_enum)
        
        print("Examples of tokenized SMILES from a training set:\n{}\n".\
        format(x_train_enum_tokens[:5]))
        
        # Vocabulary size computation
        all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens

        # Check if the vocabulary for current dataset exists already
        if os.path.exists(save_dir+data_name+'_Vocabulary.txt'):
            tokens = token.get_vocab(save_dir+data_name+'_Vocabulary.txt')
        else:
            tokens = token.extract_vocab(all_smiles_tokens)
            token.save_vocab(tokens, save_dir+data_name+'_Vocabulary.txt')
            # Reload the vocabulary from the saved file
            tokens = token.get_vocab(save_dir+data_name+'_Vocabulary.txt')

        vocab_size = len(tokens)
        
        train_unique_tokens = token.extract_vocab(x_train_enum_tokens)
        print("Number of unique tokens in the training set: {}\n".format(len(train_unique_tokens)))
        valid_unique_tokens = token.extract_vocab(x_valid_enum_tokens)
        print("Number of unique tokens in the validation set: {}".format(len(valid_unique_tokens)))
        print("Is the validation token set a subset of the training token set: {}".\
              format(valid_unique_tokens.issubset(train_unique_tokens)))
        print("Tokens present in the validation set but not in the training set: {}\n".\
              format(valid_unique_tokens.difference(train_unique_tokens)))
        test_unique_tokens = token.extract_vocab(x_test_enum_tokens)
        print("Number of unique tokens in the test set: {}".format(len(test_unique_tokens)))
        print("Is the test token set a subset of the training token set: {}".\
              format(test_unique_tokens.issubset(train_unique_tokens)))
        print("Tokens present in the test set but not in the training set: {}".\
              format(test_unique_tokens.difference(train_unique_tokens)))
        print("Is the test token set a subset of the validation token set: {}".\
              format(test_unique_tokens.issubset(valid_unique_tokens)))
        print("Tokens present in the test set but not in the validation set: {}\n".\
              format(test_unique_tokens.difference(valid_unique_tokens)))
        
        print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
        
        # Add 'pad', 'unk' tokens to the existing list
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
        
        # Maximum length of the tokenized SMILES to process
        max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
        print("Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n".format(max_length))
        
        print("***Bayesian Optimization of the SMILESX's architecture.***\n")        
        if bayopt_on:
            # Run the Bayesian optimization of the neural architecture
            def create_mod(params):
                print('Model: {}'.format(params))

                model_tag = data_name

                K.clear_session()

                if n_gpus > 1:
                    if bridge_type == 'NVLink':
                        model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                              vocabsize = vocab_size,
                                                              lstmunits = int(params[:,0][0]),
                                                              denseunits = int(params[:,1][0]),
                                                              embedding = int(params[:,2][0]))
                    else:
                        with tf.device('/cpu'): # necessary for multi-GPU scaling
                            model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                                  vocabsize = vocab_size,
                                                                  lstmunits = int(params[:,0][0]),
                                                                  denseunits = int(params[:,1][0]),
                                                                  embedding = int(params[:,2][0]))
                            
                    multi_model = model.ModelMGPU(model_opt, gpus=n_gpus, bridge_type=bridge_type)
                else: # single GPU
                    model_opt = model.LSTMAttModel.create(inputtokens = max_length+1,
                                                          vocabsize = vocab_size,
                                                          lstmunits = int(params[:,0][0]),
                                                          denseunits = int(params[:,1][0]),
                                                          embedding = int(params[:,2][0]))
                    
                    multi_model = model_opt

                batch_size = int(params[:,3][0])
                # The learning rate is optimized through its exponent alpha in 10**(-alpha)
                custom_adam = Adam(lr=math.pow(10,-float(params[:,4][0])))
                multi_model.compile(loss='mse', optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])

                history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                             vocab = tokens, 
                                                                             max_length = max_length, 
                                                                             props_set = y_train_enum, 
                                                                             batch_size = batch_size), 
                                                    steps_per_epoch = math.ceil(len(x_train_enum_tokens)/batch_size)//bayopt_it_factor,
                                                    validation_data = DataSequence(x_valid_enum_tokens,
                                                                                   vocab = tokens, 
                                                                                   max_length = max_length, 
                                                                                   props_set = y_valid_enum, 
                                                                                   batch_size = min(len(x_valid_enum_tokens), batch_size)),
                                                    validation_steps = math.ceil(len(x_valid_enum_tokens)/min(len(x_valid_enum_tokens), batch_size))//bayopt_it_factor, 
                                                    epochs = bayopt_n_epochs, 
                                                    shuffle = True,
                                                    initial_epoch = 0, 
                                                    verbose = 0)

                best_epoch = np.argmin(history.history['val_loss'])
                mae_valid = history.history['val_mean_absolute_error'][best_epoch]
                mse_valid = history.history['val_mean_squared_error'][best_epoch]
                if math.isnan(mse_valid): # discard diverging architectures (rare event)
                    mae_valid = math.inf
                    mse_valid = math.inf
                print('Valid MAE: {0:0.4f}, MSE: {1:0.4f}'.format(mae_valid, mse_valid))

                return mse_valid

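            # GPyOpt first evaluates create_mod on initial_design_numdata randomly
            # drawn architectures, then runs up to max_iter rounds of expected
            # improvement (EI) acquisition; exact_feval=False treats the returned
            # validation MSE as a noisy objective, and normalize_Y rescales it
            # before fitting the Gaussian process surrogate.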
            print("Random initialization:\n")
            Bayes_opt = GPyOpt.methods.BayesianOptimization(f=create_mod, 
                                                            domain=bayopt_bounds, 
                                                            acquisition_type = 'EI',
                                                            initial_design_numdata = bayopt_n_rounds,
                                                            exact_feval = False,
                                                            normalize_Y = True,
                                                            num_cores = multiprocessing.cpu_count()-1)
            print("Optimization:\n")
            Bayes_opt.run_optimization(max_iter=bayopt_n_rounds)
            best_arch = Bayes_opt.x_opt
        else:
            best_arch = [lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref, alpha_ref]
            
        print("\nThe architecture for this datatset is:\n\tLSTM units: {}\n\tDense units: {}\n\tEmbedding dimensions {}".\
             format(int(best_arch[0]), int(best_arch[1]), int(best_arch[2])))
        print("\tBatch size: {0:}\n\tLearning rate: 10^-({1:.1f})\n".format(int(best_arch[3]), float(best_arch[4])))
        
        print("***Training of the best model.***\n")
        # Train the model and predict
        K.clear_session()   
        # Define the multi-GPU model if necessary
        if n_gpus > 1:
            if bridge_type == 'NVLink':
                model_train = model.LSTMAttModel.create(inputtokens = max_length+1, 
                                                        vocabsize = vocab_size, 
                                                        lstmunits= int(best_arch[0]), 
                                                        denseunits = int(best_arch[1]), 
                                                        embedding = int(best_arch[2]))
            else:
                with tf.device('/cpu'): # necessary for multi-GPU scaling
                    model_train = model.LSTMAttModel.create(inputtokens = max_length+1, 
                                                            vocabsize = vocab_size, 
                                                            lstmunits= int(best_arch[0]), 
                                                            denseunits = int(best_arch[1]), 
                                                            embedding = int(best_arch[2]))
            print("Best model summary:\n")
            print(model_train.summary())
            print("\n")
            multi_model = model.ModelMGPU(model_train, gpus=n_gpus, bridge_type=bridge_type)
        else:
            model_train = model.LSTMAttModel.create(inputtokens = max_length+1, 
                                                    vocabsize = vocab_size, 
                                                    lstmunits= int(best_arch[0]), 
                                                    denseunits = int(best_arch[1]), 
                                                    embedding = int(best_arch[2]))

            print("Best model summary:\n")
            print(model_train.summary())
            print("\n")
            multi_model = model_train

        batch_size = int(best_arch[3])
        custom_adam = Adam(lr=math.pow(10,-float(best_arch[4])))
        # Compile the model
        multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])
        
        # Checkpoint, Early stopping and callbacks definition
        filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'
        
        checkpoint = ModelCheckpoint(filepath, 
                                     monitor='val_loss', 
                                     verbose=0, 
                                     save_best_only=True, 
                                     mode='min')

        earlystopping = EarlyStopping(monitor='val_loss', 
                                      min_delta=0, 
                                      patience=patience, 
                                      verbose=0, 
                                      mode='min')
                
        callbacks_list = [checkpoint, earlystopping]

        # Fit the model
        history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                     vocab = tokens, 
                                                                     max_length = max_length, 
                                                                     props_set = y_train_enum, 
                                                                     batch_size = batch_size), 
                                            validation_data = DataSequence(x_valid_enum_tokens,
                                                                           vocab = tokens, 
                                                                           max_length = max_length, 
                                                                           props_set = y_valid_enum, 
                                                                           batch_size = min(len(x_valid_enum_tokens), batch_size)),
                                            epochs = n_epochs, 
                                            shuffle = True,
                                            initial_epoch = 0, 
                                            callbacks = callbacks_list)

        # Summarize history for losses per epoch
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper right')
        plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
        plt.close()
        
        print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))

        print("***Predictions from the best model.***\n")
        model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5')
        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # predict and compare for the training, validation and test sets
        x_train_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_train_enum_tokens, 
                                                            max_length = max_length+1, 
                                                            vocab = tokens)
        x_valid_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_valid_enum_tokens, 
                                                            max_length = max_length+1, 
                                                            vocab = tokens)
        x_test_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_test_enum_tokens, 
                                                           max_length = max_length+1, 
                                                           vocab = tokens)

        y_pred_train = model_train.predict(x_train_enum_tokens_tointvec)
        y_pred_valid = model_train.predict(x_valid_enum_tokens_tointvec)
        y_pred_test = model_train.predict(x_test_enum_tokens_tointvec)

        # Compute the mean prediction per set of augmented SMILES
        y_pred_train_mean, _ = utils.mean_median_result(x_train_enum_card, y_pred_train)
        y_pred_valid_mean, _ = utils.mean_median_result(x_valid_enum_card, y_pred_valid)
        y_pred_test_mean, _ = utils.mean_median_result(x_test_enum_card, y_pred_test)

        # Inverse-transform the property scaling and compare predictions vs. observations
        y_pred_VS_true_train = scaler.inverse_transform(y_train) - \
                               scaler.inverse_transform(y_pred_train_mean.reshape(-1,1))
        mae_train = np.mean(np.absolute(y_pred_VS_true_train))
        mse_train = np.mean(np.square(y_pred_VS_true_train))
        corrcoef_train = r2_score(scaler.inverse_transform(y_train), \
                                 scaler.inverse_transform(y_pred_train_mean.reshape(-1,1)))
        print("For the training set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_train, np.sqrt(mse_train), corrcoef_train))

        y_pred_VS_true_valid = scaler.inverse_transform(y_valid) - \
                               scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1))
        mae_valid = np.mean(np.absolute(y_pred_VS_true_valid))
        mse_valid = np.mean(np.square(y_pred_VS_true_valid))
        corrcoef_valid = r2_score(scaler.inverse_transform(y_valid), \
                                  scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1)))
        print("For the validation set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_valid, np.sqrt(mse_valid), corrcoef_valid))

        y_pred_VS_true_test = scaler.inverse_transform(y_test) - \
                              scaler.inverse_transform(y_pred_test_mean.reshape(-1,1))
        mae_test = np.mean(np.absolute(y_pred_VS_true_test))
        mse_test = np.mean(np.square(y_pred_VS_true_test))
        corrcoef_test = r2_score(scaler.inverse_transform(y_test), \
                                 scaler.inverse_transform(y_pred_test_mean.reshape(-1,1)))
        print("For the test set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_test, np.sqrt(mse_test), corrcoef_test))

        # Plot the final result
        # Unscale the data
        y_train = scaler.inverse_transform(y_train)
        y_pred_train_mean = scaler.inverse_transform(y_pred_train_mean.reshape(-1,1))
        y_valid = scaler.inverse_transform(y_valid)
        y_pred_valid_mean = scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1))
        y_test = scaler.inverse_transform(y_test)
        y_pred_test_mean = scaler.inverse_transform(y_pred_test_mean.reshape(-1,1))

        # Predictions vs. observations scatter plot
        plt.figure(figsize=(12, 8))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Setting plot limits
        y_true_min = min(np.min(y_train), np.min(y_valid), np.min(y_test))
        y_true_max = max(np.max(y_train), np.max(y_valid), np.max(y_test))
        y_pred_min = min(np.min(y_pred_train_mean), np.min(y_pred_valid_mean), np.min(y_pred_test_mean))
        y_pred_max = max(np.max(y_pred_train_mean), np.max(y_pred_valid_mean), np.max(y_pred_test_mean))
        # Expand the canvas slightly around the data points (by 10%)
        axmin = y_true_min-0.1*(y_true_max-y_true_min)
        axmax = y_true_max+0.1*(y_true_max-y_true_min)
        aymin = y_pred_min-0.1*(y_pred_max-y_pred_min)
        aymax = y_pred_max+0.1*(y_pred_max-y_pred_min)

        plt.xlim(min(axmin, aymin), max(axmax, aymax))
        plt.ylim(min(axmin, aymin), max(axmax, aymax))
                        
        plt.errorbar(y_train, 
                    y_pred_train_mean,
                    fmt='o',
                    label="Train",
                    elinewidth = 0, 
                    ms=5,
                    mfc='#519fc4',
                    markeredgewidth = 0,
                    alpha=0.7)
        plt.errorbar(y_valid,
                    y_pred_valid_mean,
                    elinewidth = 0,
                    fmt='o',
                    label="Validation", 
                    ms=5, 
                    mfc='#db702e',
                    markeredgewidth = 0,
                    alpha=0.7)
        plt.errorbar(y_test,
                    y_pred_test_mean,
                    elinewidth = 0,
                    fmt='o',
                    label="Test", 
                    ms=5, 
                    mfc='#cc1b00',
                    markeredgewidth = 0,
                    alpha=0.7)


        # Plot X=Y line
        plt.plot([max(plt.xlim()[0], plt.ylim()[0]), 
                  min(plt.xlim()[1], plt.ylim()[1])],
                 [max(plt.xlim()[0], plt.ylim()[0]), 
                  min(plt.xlim()[1], plt.ylim()[1])],
                 ':', color = '#595f69')
        
        plt.xlabel('Observations ' + data_units, fontsize = 12)
        plt.ylabel('Predictions ' + data_units, fontsize = 12)
        plt.legend()

        # Save the plot for this fold
        plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
        plt.close()
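
For context, a hypothetical invocation of Main is sketched below, assuming the SMILESX package layout implied by the code above; the import path, CSV file, column layout, and units are illustrative assumptions, not taken from the source.

import pandas as pd
from SMILESX import main  # assumed module path

# The function expects SMILES in a 'smiles' column and the property in the
# second column (cf. data.smiles and data.iloc[:,1] above).
data = pd.read_csv('FreeSolv.csv')

main.Main(data = data,
          data_name = 'FreeSolv',
          bayopt_bounds = bayopt_bounds,  # e.g. the search-space sketch after Example #1
          data_units = 'kcal/mol',
          k_fold_number = 8,
          augmentation = True,
          bayopt_on = True,
          n_gpus = 1)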