def Embedding_Vis(data,
                  data_name,
                  data_units='',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=False,
                  outdir="../data/",
                  affinity_propn=True,
                  verbose=0):
    """Plot a 2D PCA projection of the token embedding of one trained fold.

    The data split of fold ``k_fold_index`` is reproduced only to determine
    which tokens occur in the training set. The embedding matrix is read from
    the model stored in ``outdir + 'Main/<data_name>/<Augm|Can>/'`` and the
    figure is written to ``outdir + 'Embedding_Vis/<data_name>/<Augm|Can>/'``
    before being shown with ``plt.show()``.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc``
        SMILES are read from ``data.smiles`` and the property from column 1
        (presumably a pandas DataFrame -- confirm against callers).
    data_name : str
        Dataset name used to build input and output paths.
    data_units : str
        Unused here; kept for signature compatibility.
    k_fold_number : int
        Number of seeds drawn; must match the value used for training so that
        the same split is reproduced.
    k_fold_index : int
        Fold whose model is visualized.
    augmentation : bool
        Selects the 'Augm' or 'Can' sub-directory and whether SMILES are
        enumerated.
    outdir : str
        Root data directory (expected to end with a path separator).
    affinity_propn : bool
        If true, color the points by AffinityPropagation cluster; otherwise
        draw a black and white scatter plot.
    verbose : int
        Unused here; kept for signature compatibility.
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'
    input_dir = outdir + 'Main/' + '{}/{}/'.format(data_name, p_dir_temp)
    save_dir = outdir + 'Embedding_Vis/' + '{}/{}/'.format(data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for embedding visualization starts...***\n\n")
    # Same seeding as the training routine so that the split is reproduced.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:, 1]),
                           random_state=seed_list[k_fold_index],
                           scaling=True)

    # Data augmentation or not. Truthiness is used (instead of `== True`) so
    # that this choice always agrees with the directory selected above.
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
    x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
    x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".
          format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".
          format(x_train_enum_tokens[:5]))

    all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(
        len(train_unique_tokens)))
    # 'pad' is treated as a training token; a set gives O(1) membership tests
    # in the plotting loops below.
    train_token_set = set(train_unique_tokens)
    train_token_set.add('pad')

    # Tokens as a list. The per-fold file is preferred when present; otherwise
    # fall back to the shared vocabulary file written by Main() in this file.
    vocab_path = input_dir + data_name + '_tokens_set_fold_' + str(k_fold_index) + '.txt'
    if not os.path.exists(vocab_path):
        vocab_path = input_dir + data_name + '_Vocabulary.txt'
    tokens = token.get_vocab(vocab_path)
    # The size must describe the vocabulary actually loaded, not one
    # re-extracted from the current split.
    vocab_size = len(tokens)
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process (reported only)
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print(
        "Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n"
        .format(max_length))

    model_train = load_model(input_dir + 'LSTMAtt_' + data_name +
                             '_model.best_fold_' + str(k_fold_index) + '.hdf5',
                             custom_objects={'AttentionM': model.AttentionM()})

    print("Chosen model summary:\n")
    print(model_train.summary())
    print("\n")

    print("***Embedding of the individual tokens from the chosen model.***\n")

    model_train.compile(loss="mse",
                        optimizer='adam',
                        metrics=[metrics.mae, metrics.mse])

    # layers[1] is taken to be the embedding layer, as in the original code.
    model_embed_weights = model_train.layers[1].get_weights()[0]
    pca = PCA(n_components=2, random_state=123)
    transformed_weights = pca.fit_transform(model_embed_weights)

    plt.figure(figsize=(9, 9))
    ax = plt.subplot(aspect='equal')

    if affinity_propn:
        # Compute Affinity Propagation
        af = AffinityPropagation().fit(model_embed_weights)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        # Plot it
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = np.where(labels == k)[0].tolist()
            for ilabpt in class_members:
                in_train = tokens[ilabpt] in train_token_set
                # Tokens absent from the training set are drawn as thick
                # crosses in the cluster color.
                ax.plot(transformed_weights[ilabpt, 0],
                        transformed_weights[ilabpt, 1],
                        col,
                        marker='o' if in_train else 'x',
                        markeredgecolor='black' if in_train else col,
                        markeredgewidth=1 if in_train else 5,
                        alpha=0.5,
                        markersize=10)
    else:
        # Black and white plot
        for ilabpt in range(vocab_size):
            in_train = tokens[ilabpt] in train_token_set
            ax.scatter(transformed_weights[ilabpt, 0],
                       transformed_weights[ilabpt, 1],
                       lw=1,
                       s=40 if in_train else 20,
                       facecolor='black',
                       marker='o',
                       alpha=0.5 if in_train else 0.2)

    x_coords = transformed_weights[:, 0].tolist()
    y_coords = transformed_weights[:, 1].tolist()

    annotations = []
    for ilabpt, (x_i, y_i) in enumerate(zip(x_coords, y_coords)):
        # 'black' is the heaviest matplotlib font weight.
        weight_tmp = 'black' if tokens[ilabpt] in train_token_set else 'normal'
        tokens_tmp = tokens[ilabpt]
        # Make otherwise hard-to-see tokens readable on the plot.
        if tokens_tmp == ' ':
            tokens_tmp = 'space'
        elif tokens_tmp == '.':
            tokens_tmp = 'dot'
        annotations.append(
            plt.text(x_i, y_i, tokens_tmp, fontsize=12, weight=weight_tmp))
    adjust_text(annotations,
                x=x_coords,
                y=y_coords,
                arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

    plt.xticks([])
    plt.yticks([])
    ax.axis('tight')
    plt.savefig(save_dir + 'Visualization_' + data_name + '_Embedding_fold_' +
                str(k_fold_index) + '.png',
                bbox_inches='tight')
    plt.show()
def Inference(data_name,
              smiles_list = ['CC','CCC','C=O'],
              data_units = '',
              k_fold_number = 8,
              augmentation = False,
              outdir = "../data/"):
    """Predict a property for a list of SMILES with the ensemble of fold models.

    Each SMILES is validated and canonicalized with RDKit; rejected entries
    are written to ``rejected_smiles.txt`` in the output directory. Every fold
    model stored in ``outdir + 'Main/<data_name>/<Augm|Can>/'`` predicts the
    accepted SMILES, and the per-fold predictions are combined by mean and
    standard deviation.

    Parameters
    ----------
    data_name : str
        Dataset name used to build input and output paths.
    smiles_list : list of str
        SMILES to predict. The default list is never mutated.
    data_units : str
        Unused here; kept for signature compatibility.
    k_fold_number : int
        Number of fold models to ensemble (must be >= 1 to predict).
    augmentation : bool
        Selects the 'Augm' or 'Can' sub-directory and whether SMILES are
        enumerated before prediction.
    outdir : str
        Root data directory (expected to end with a path separator).

    Returns
    -------
    pandas.DataFrame or None
        Columns ``SMILES``, ``ens_pred_mean`` and ``ens_pred_sd``; ``None``
        when no SMILES could be validated.

    Raises
    ------
    ValueError
        If ``k_fold_number`` is smaller than 1 (the original code failed
        with an unbound local variable in that case).
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'
    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Inference/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for inference starts...***\n\n")

    print("***Checking the SMILES list for inference***\n")
    smiles_checked = []
    smiles_rejected = []
    for ismiles in smiles_list:
        mol_tmp = Chem.MolFromSmiles(ismiles)
        if mol_tmp is not None:
            smiles_checked.append(Chem.MolToSmiles(mol_tmp))
        else:
            smiles_rejected.append(ismiles)

    if smiles_rejected:
        with open(save_dir+'rejected_smiles.txt','w') as f:
            f.writelines("%s\n" % ismiles for ismiles in smiles_rejected)

    if not smiles_checked:
        print("***Process of inference automatically aborted!***")
        print("The provided SMILES are all incorrect and could not be verified via RDKit.")
        return

    if k_fold_number < 1:
        raise ValueError("k_fold_number must be at least 1 to run the inference.")

    smiles_x = np.array(smiles_checked)
    # Placeholder targets: the augmentation routine expects a property array.
    smiles_y = np.full(len(smiles_checked), np.nan)

    # Data augmentation or not. Truthiness is used (instead of `== True`) so
    # that this choice always agrees with the directory selected above.
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    smiles_x_enum, smiles_x_enum_card, smiles_y_enum = \
        augm.Augmentation(smiles_x, smiles_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES: {}\n".format(smiles_x_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES
    smiles_x_enum_tokens = token.get_tokens(smiles_x_enum)

    # Models ensembling: one row of per-molecule mean predictions per fold.
    smiles_y_pred_mean_array = np.empty(shape=(k_fold_number, len(smiles_checked)), dtype='float')
    for ifold in range(k_fold_number):

        # Tokens as a list. The per-fold file is preferred when present;
        # otherwise fall back to the shared vocabulary written by Main().
        vocab_path = input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt'
        if not os.path.exists(vocab_path):
            vocab_path = input_dir+data_name+'_Vocabulary.txt'
        tokens = token.get_vocab(vocab_path)
        # Add 'pad', 'unk' tokens to the existing list
        vocab_size = len(tokens)
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5',
                                 custom_objects={'AttentionM': model.AttentionM()})

        # Input length expected by this fold's model; read per fold so that
        # the encoding always matches the model being used.
        max_length = model_train.layers[0].output_shape[-1]
        if ifold == 0:
            print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
            print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        smiles_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = smiles_x_enum_tokens,
                                                             max_length = max_length,
                                                             vocab = tokens)

        smiles_y_pred = model_train.predict(smiles_x_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        smiles_y_pred_mean, _ = utils.mean_median_result(smiles_x_enum_card, smiles_y_pred)
        smiles_y_pred_mean_array[ifold] = smiles_y_pred_mean.reshape(-1)

    # Ensemble statistics over the folds, computed once after the loop.
    smiles_y_pred_mean_ensemble = np.mean(smiles_y_pred_mean_array, axis = 0)
    smiles_y_pred_sd_ensemble = np.std(smiles_y_pred_mean_array, axis = 0)

    pred_from_ens = pd.DataFrame(data=[smiles_x,
                                       smiles_y_pred_mean_ensemble,
                                       smiles_y_pred_sd_ensemble]).T
    pred_from_ens.columns = ['SMILES', 'ens_pred_mean', 'ens_pred_sd']

    print("***Inference of SMILES property done.***")

    return pred_from_ens
def TokensFinder(data,
                 data_name,
                 data_units='',
                 k_fold_number=8,
                 k_fold_index=0,
                 augmentation=False,
                 token_tofind='',
                 verbose=1):
    """Report which training-set SMILES of a given fold contain a token.

    The data split of fold ``k_fold_index`` is reproduced, the training
    SMILES are (optionally) augmented and tokenized, and every tokenized
    SMILES containing ``token_tofind`` is counted.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc``
        SMILES are read from ``data.smiles`` and the property from column 1
        (presumably a pandas DataFrame -- confirm against callers).
    data_name : str
        Unused here; kept for signature compatibility.
    data_units : str
        Unused here; kept for signature compatibility.
    k_fold_number : int
        Number of seeds drawn; must match the value used for training so that
        the same split is reproduced.
    k_fold_index : int
        Fold whose training set is searched.
    augmentation : bool
        Whether SMILES are enumerated before the search.
    token_tofind : str
        Token looked up in each tokenized SMILES.
    verbose : int
        When equal to 1, every matching SMILES is printed.

    Returns
    -------
    None
        Results are reported on standard output only.
    """
    print("***SMILES_X token's finder starts...***\n\n")
    # Same seeding as the training routine so that the split is reproduced.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()
    selection_seed = seed_list[k_fold_index]

    print("******")
    # Report the fold index: the banner says "Fold #", not the random seed.
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:, 1]),
                           random_state=selection_seed,
                           scaling=True)

    # Data augmentation or not.
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
    # Validation and test sets are enumerated only to report their sizes.
    x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
    x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".
          format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Only the training set is searched, so only it needs tokenizing. The
    # tokenizer is reached through the `token` module, as everywhere else in
    # this file.
    x_train_enum_tokens = token.get_tokens(x_train_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".
          format(x_train_enum_tokens[:5]))

    # Token finder
    print("The finder is processing the search...")
    n_found = 0
    for ismiles in x_train_enum_tokens:
        if token_tofind in ismiles:
            n_found += 1
            if verbose == 1:
                print(''.join(ismiles))

    print("\n{} SMILES found with {} token in the training set.".format(
        n_found, token_tofind))
def _build_lstmatt_model(max_length, vocab_size, lstmunits, denseunits, embedding,
                         n_gpus, bridge_type):
    """Create the LSTM-attention model, on CPU for multi-GPU runs without NVLink."""
    def create():
        return model.LSTMAttModel.create(inputtokens = max_length+1,
                                         vocabsize = vocab_size,
                                         lstmunits = lstmunits,
                                         denseunits = denseunits,
                                         embedding = embedding)

    if n_gpus > 1 and bridge_type != 'NVLink':
        with tf.device('/cpu'): # necessary to multi-GPU scaling
            return create()
    return create()


def Main(data,
         data_name,
         bayopt_bounds,
         data_units = '',
         k_fold_number = 8,
         augmentation = False,
         outdir = "../data/",
         bayopt_n_epochs = 10,
         bayopt_n_rounds = 25,
         bayopt_it_factor = 1,
         bayopt_on = True,
         lstmunits_ref = 512,
         denseunits_ref = 512,
         embedding_ref = 512,
         batch_size_ref = 64,
         alpha_ref = 3,
         n_gpus = 1,
         bridge_type = 'None',
         patience = 25,
         n_epochs = 1000):
    """Train one LSTM-attention model per fold and report its performance.

    For every fold the dataset is split with a fold-specific seed, optionally
    augmented, and tokenized. The architecture is either searched by Bayesian
    optimization (``bayopt_on``) or taken from the ``*_ref`` arguments. The
    model is then trained with checkpointing and early stopping, and
    evaluated on the training, validation and test sets.

    Files written to ``outdir + 'Main/<data_name>/<Augm|Can>/'``:
    the vocabulary (``<data_name>_Vocabulary.txt``), the best model of each
    fold (``LSTMAtt_<data_name>_model.best_fold_<k>.hdf5``), the loss history
    plot and the predictions-versus-observations plot of each fold.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc``
        SMILES are read from ``data.smiles`` and the property from column 1
        (presumably a pandas DataFrame -- confirm against callers).
    data_name : str
        Dataset name used in paths and file names.
    bayopt_bounds : object
        Search domain handed to ``GPyOpt.methods.BayesianOptimization``. The
        code reads five parameters in this order: LSTM units, dense units,
        embedding size, batch size, and the exponent ``a`` of the learning
        rate ``10**-a``.
    data_units : str
        Suffix appended to the axis labels of the final plot.
    k_fold_number : int
        Number of folds (one seed, one split and one model per fold).
    augmentation : bool
        Selects the 'Augm' or 'Can' sub-directory and whether SMILES are
        enumerated.
    outdir : str
        Root data directory (expected to end with a path separator).
    bayopt_n_epochs : int
        Epochs used for each architecture evaluated by the optimizer.
    bayopt_n_rounds : int
        Number of initial random designs and of optimization iterations.
    bayopt_it_factor : int
        Divisor applied to the number of steps per epoch during optimization.
    bayopt_on : bool
        Whether the Bayesian optimization is run.
    lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref, alpha_ref
        Architecture used when ``bayopt_on`` is false.
    n_gpus : int
        More than one wraps the model with ``model.ModelMGPU``.
    bridge_type : str
        ``'NVLink'`` builds the multi-GPU base model on the default device;
        any other value builds it on the CPU.
    patience : int
        Early-stopping patience in epochs.
    n_epochs : int
        Maximum number of training epochs.

    Returns
    -------
    None
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'
    save_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()

    # One random train/validation/test split per fold, each with its own seed
    for ifold in range(k_fold_number):

        print("******")
        print("***Fold #{} initiated...***".format(ifold))
        print("******")

        print("***Sampling and splitting of the dataset.***\n")
        x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
            utils.random_split(smiles_input=data.smiles,
                               prop_input=np.array(data.iloc[:,1]),
                               random_state=seed_list[ifold],
                               scaling = True)

        # Data augmentation or not. Truthiness is used (instead of `== True`)
        # so that this choice always agrees with the directory selected above.
        if augmentation:
            print("***Data augmentation to {}***\n".format(augmentation))
            canonical = False
            rotation = True
        else:
            print("***No data augmentation has been required.***\n")
            canonical = True
            rotation = False

        x_train_enum, x_train_enum_card, y_train_enum = \
            augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
        x_valid_enum, x_valid_enum_card, y_valid_enum = \
            augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
        x_test_enum, x_test_enum_card, y_test_enum = \
            augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

        print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
              format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

        print("***Tokenization of SMILES.***\n")
        # Tokenize SMILES per dataset
        x_train_enum_tokens = token.get_tokens(x_train_enum)
        x_valid_enum_tokens = token.get_tokens(x_valid_enum)
        x_test_enum_tokens = token.get_tokens(x_test_enum)

        print("Examples of tokenized SMILES from a training set:\n{}\n".\
              format(x_train_enum_tokens[:5]))

        # Vocabulary size computation
        all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens

        # Create the vocabulary file once, then always read it back so that
        # every fold shares the same token order.
        vocab_path = save_dir+data_name+'_Vocabulary.txt'
        if not os.path.exists(vocab_path):
            token.save_vocab(token.extract_vocab(all_smiles_tokens), vocab_path)
        tokens = token.get_vocab(vocab_path)

        vocab_size = len(tokens)

        train_unique_tokens = token.extract_vocab(x_train_enum_tokens)
        print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))
        valid_unique_tokens = token.extract_vocab(x_valid_enum_tokens)
        print("Number of tokens only present in a validation set: {}".format(len(valid_unique_tokens)))
        print("Is the validation set a subset of the training set: {}".\
              format(valid_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(valid_unique_tokens.difference(train_unique_tokens)))
        test_unique_tokens = token.extract_vocab(x_test_enum_tokens)
        print("Number of tokens only present in a test set: {}".format(len(test_unique_tokens)))
        print("Is the test set a subset of the training set: {}".\
              format(test_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}".\
              format(test_unique_tokens.difference(train_unique_tokens)))
        print("Is the test set a subset of the validation set: {}".\
              format(test_unique_tokens.issubset(valid_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(test_unique_tokens.difference(valid_unique_tokens)))

        print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

        # Add 'pad', 'unk' tokens to the existing list
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        # Maximum of length of SMILES to process
        max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
        print("Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n".format(max_length))

        print("***Bayesian Optimization of the SMILESX's architecture.***\n")
        if bayopt_on:
            # Objective evaluated by the optimizer: train one candidate
            # architecture briefly and return its best validation MSE.
            def create_mod(params):
                print('Model: {}'.format(params))

                K.clear_session()

                # params is a 2D array with a single row; every value is read
                # as params[:, j][0] so that a scalar is converted.
                model_opt = _build_lstmatt_model(max_length, vocab_size,
                                                 lstmunits = int(params[:,0][0]),
                                                 denseunits = int(params[:,1][0]),
                                                 embedding = int(params[:,2][0]),
                                                 n_gpus = n_gpus,
                                                 bridge_type = bridge_type)
                if n_gpus > 1:
                    multi_model = model.ModelMGPU(model_opt, gpus=n_gpus, bridge_type=bridge_type)
                else: # single GPU
                    multi_model = model_opt

                batch_size = int(params[:,3][0])
                custom_adam = Adam(lr=math.pow(10,-float(params[:,4][0])))
                multi_model.compile(loss='mse', optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])

                valid_batch_size = min(len(x_valid_enum_tokens), batch_size)
                history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                             vocab = tokens,
                                                                             max_length = max_length,
                                                                             props_set = y_train_enum,
                                                                             batch_size = batch_size),
                                                    steps_per_epoch = math.ceil(len(x_train_enum_tokens)/batch_size)//bayopt_it_factor,
                                                    validation_data = DataSequence(x_valid_enum_tokens,
                                                                                   vocab = tokens,
                                                                                   max_length = max_length,
                                                                                   props_set = y_valid_enum,
                                                                                   batch_size = valid_batch_size),
                                                    validation_steps = math.ceil(len(x_valid_enum_tokens)/valid_batch_size)//bayopt_it_factor,
                                                    epochs = bayopt_n_epochs,
                                                    shuffle = True,
                                                    initial_epoch = 0,
                                                    verbose = 0)

                best_epoch = np.argmin(history.history['val_loss'])
                mae_valid = history.history['val_mean_absolute_error'][best_epoch]
                mse_valid = history.history['val_mean_squared_error'][best_epoch]
                if math.isnan(mse_valid): # discard diverging architectures (rare event)
                    mae_valid = math.inf
                    mse_valid = math.inf
                # The label says RMSE, so report the square root of the MSE.
                print('Valid MAE: {0:0.4f}, RMSE: {1:0.4f}'.format(mae_valid, math.sqrt(mse_valid)))

                return mse_valid

            print("Random initialization:\n")
            Bayes_opt = GPyOpt.methods.BayesianOptimization(f=create_mod,
                                                            domain=bayopt_bounds,
                                                            acquisition_type = 'EI',
                                                            initial_design_numdata = bayopt_n_rounds,
                                                            exact_feval = False,
                                                            normalize_Y = True,
                                                            # never request zero cores on a single-core host
                                                            num_cores = max(1, multiprocessing.cpu_count()-1))
            print("Optimization:\n")
            Bayes_opt.run_optimization(max_iter=bayopt_n_rounds)
            best_arch = Bayes_opt.x_opt
        else:
            best_arch = [lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref, alpha_ref]

        print("\nThe architecture for this datatset is:\n\tLSTM units: {}\n\tDense units: {}\n\tEmbedding dimensions {}".\
              format(int(best_arch[0]), int(best_arch[1]), int(best_arch[2])))
        print("\tBatch size: {0:}\n\tLearning rate: 10^-({1:.1f})\n".format(int(best_arch[3]), float(best_arch[4])))

        print("***Training of the best model.***\n")
        # Train the model and predict
        K.clear_session()
        model_train = _build_lstmatt_model(max_length, vocab_size,
                                           lstmunits = int(best_arch[0]),
                                           denseunits = int(best_arch[1]),
                                           embedding = int(best_arch[2]),
                                           n_gpus = n_gpus,
                                           bridge_type = bridge_type)
        print("Best model summary:\n")
        print(model_train.summary())
        print("\n")
        # Define the multi-gpus model if necessary
        if n_gpus > 1:
            multi_model = model.ModelMGPU(model_train, gpus=n_gpus, bridge_type=bridge_type)
        else:
            multi_model = model_train

        batch_size = int(best_arch[3])
        custom_adam = Adam(lr=math.pow(10,-float(best_arch[4])))
        # Compile the model
        multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])

        # Checkpoint, Early stopping and callbacks definition
        filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'

        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=0,
                                     save_best_only=True,
                                     mode='min')

        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0,
                                      patience=patience,
                                      verbose=0,
                                      mode='min')

        callbacks_list = [checkpoint, earlystopping]

        # Fit the model
        history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                     vocab = tokens,
                                                                     max_length = max_length,
                                                                     props_set = y_train_enum,
                                                                     batch_size = batch_size),
                                            validation_data = DataSequence(x_valid_enum_tokens,
                                                                           vocab = tokens,
                                                                           max_length = max_length,
                                                                           props_set = y_valid_enum,
                                                                           batch_size = min(len(x_valid_enum_tokens), batch_size)),
                                            epochs = n_epochs,
                                            shuffle = True,
                                            initial_epoch = 0,
                                            callbacks = callbacks_list)

        # Summarize history for losses per epoch
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper right')
        plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
        plt.close()

        print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))

        print("***Predictions from the best model.***\n")
        # Reload the checkpointed (best) weights into the base model
        model_train.load_weights(filepath)
        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # predict and compare for the training, validation and test sets
        x_train_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_train_enum_tokens,
                                                            max_length = max_length+1,
                                                            vocab = tokens)
        x_valid_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_valid_enum_tokens,
                                                            max_length = max_length+1,
                                                            vocab = tokens)
        x_test_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_test_enum_tokens,
                                                           max_length = max_length+1,
                                                           vocab = tokens)

        y_pred_train = model_train.predict(x_train_enum_tokens_tointvec)
        y_pred_valid = model_train.predict(x_valid_enum_tokens_tointvec)
        y_pred_test = model_train.predict(x_test_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        y_pred_train_mean, _ = utils.mean_median_result(x_train_enum_card, y_pred_train)
        y_pred_valid_mean, _ = utils.mean_median_result(x_valid_enum_card, y_pred_valid)
        y_pred_test_mean, _ = utils.mean_median_result(x_test_enum_card, y_pred_test)

        # Inverse transform the scaling of the property once; the unscaled
        # values feed both the metrics and the final plot.
        y_train = scaler.inverse_transform(y_train)
        y_pred_train_mean = scaler.inverse_transform(y_pred_train_mean.reshape(-1,1))
        y_valid = scaler.inverse_transform(y_valid)
        y_pred_valid_mean = scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1))
        y_test = scaler.inverse_transform(y_test)
        y_pred_test_mean = scaler.inverse_transform(y_pred_test_mean.reshape(-1,1))

        for set_name, y_true, y_pred in (('training', y_train, y_pred_train_mean),
                                         ('validation', y_valid, y_pred_valid_mean),
                                         ('test', y_test, y_pred_test_mean)):
            residuals = y_true - y_pred
            mae = np.mean(np.absolute(residuals))
            mse = np.mean(np.square(residuals))
            corrcoef = r2_score(y_true, y_pred)
            print("For the {0} set:\nMAE: {1:0.4f} RMSE: {2:0.4f} R^2: {3:0.4f}\n".\
                  format(set_name, mae, np.sqrt(mse), corrcoef))

        # Plot the final result: predictions VS observations
        plt.figure(figsize=(12, 8))

        # Setting plot limits
        y_true_min = min(np.min(y_train), np.min(y_valid), np.min(y_test))
        y_true_max = max(np.max(y_train), np.max(y_valid), np.max(y_test))
        y_pred_min = min(np.min(y_pred_train_mean), np.min(y_pred_valid_mean), np.min(y_pred_test_mean))
        y_pred_max = max(np.max(y_pred_train_mean), np.max(y_pred_valid_mean), np.max(y_pred_test_mean))
        # Expanding slightly the canvas around the data points (by 10%)
        axmin = y_true_min-0.1*(y_true_max-y_true_min)
        axmax = y_true_max+0.1*(y_true_max-y_true_min)
        aymin = y_pred_min-0.1*(y_pred_max-y_pred_min)
        aymax = y_pred_max+0.1*(y_pred_max-y_pred_min)

        # Same limits on both axes so that the X=Y line is the diagonal
        plt.xlim(min(axmin, aymin), max(axmax, aymax))
        plt.ylim(min(axmin, aymin), max(axmax, aymax))

        plt.errorbar(y_train,
                     y_pred_train_mean,
                     fmt='o',
                     label="Train",
                     elinewidth = 0,
                     ms=5,
                     mfc='#519fc4',
                     markeredgewidth = 0,
                     alpha=0.7)
        plt.errorbar(y_valid,
                     y_pred_valid_mean,
                     elinewidth = 0,
                     fmt='o',
                     label="Validation",
                     ms=5,
                     mfc='#db702e',
                     markeredgewidth = 0,
                     alpha=0.7)
        plt.errorbar(y_test,
                     y_pred_test_mean,
                     elinewidth = 0,
                     fmt='o',
                     label="Test",
                     ms=5,
                     mfc='#cc1b00',
                     markeredgewidth = 0,
                     alpha=0.7)

        # Plot X=Y line
        diag_low = max(plt.xlim()[0], plt.ylim()[0])
        diag_high = min(plt.xlim()[1], plt.ylim()[1])
        plt.plot([diag_low, diag_high],
                 [diag_low, diag_high],
                 ':',
                 color = '#595f69')

        plt.xlabel('Observations ' + data_units, fontsize = 12)
        plt.ylabel('Predictions ' + data_units, fontsize = 12)
        plt.legend()

        plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
        plt.close()
def Interpretation(data,
                   data_name,
                   data_units = '',
                   k_fold_number = 8,
                   k_fold_index=0,
                   augmentation = False,
                   outdir = "../data/",
                   smiles_toviz = 'CCC',
                   font_size = 15,
                   font_rotation = 'horizontal'):
    """Visualize what the model of one fold attends to for a single SMILES.

    Three figures are saved under
    ``outdir + 'Interpretation/<data_name>/<Augm|Can>/'``:

    * ``Interpretation_1D_*``: attention weight of every token,
    * ``Interpretation_2D_*``: the same weights mapped onto the molecule,
    * ``Interpretation_temporal_*``: relative change of the prediction as the
      SMILES is fed to the model one token at a time.

    The SMILES of the reproduced train/validation/test split are also written
    to text files in the same directory.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc``
        SMILES are read from ``data.smiles`` and the property from column 1
        (presumably a pandas DataFrame -- confirm against callers).
    data_name : str
        Dataset name used in paths and file names.
    data_units : str
        Unused here; kept for signature compatibility.
    k_fold_number : int
        Number of seeds drawn; must match the value used for training so that
        the same split is reproduced.
    k_fold_index : int
        Fold whose model is interpreted.
    augmentation : bool
        Selects the 'Augm' or 'Can' sub-directory and whether SMILES are
        enumerated.
    outdir : str
        Root data directory (expected to end with a path separator).
    smiles_toviz : str
        SMILES to interpret; it is canonicalized with RDKit first.
    font_size : int
        Font size of the token tick labels.
    font_rotation : str or float
        Rotation of the token tick labels.

    Returns
    -------
    None
        Also returned early, after a message, when ``smiles_toviz`` cannot be
        parsed by RDKit.
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'
    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Interpretation/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X Interpreter starts...***\n\n")
    # Same seeding as the training routine so that the split is reproduced.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    selection_seed = seed_list[k_fold_index]

    print("******")
    # Report the fold index: the banner says "Fold #", not the random seed.
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:,1]),
                           random_state=selection_seed,
                           scaling = True)

    np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_valid.txt', np.asarray(x_valid), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_test.txt', np.asarray(x_test), newline="\n", fmt='%s')

    mol_toviz = Chem.MolFromSmiles(smiles_toviz)
    if mol_toviz is None:
        print("***Process of visualization automatically aborted!***")
        print("The smiles_toviz is incorrect and cannot be canonicalized by RDKit.")
        return
    smiles_toviz_can = Chem.MolToSmiles(mol_toviz)

    smiles_toviz_x = np.array([smiles_toviz_can])
    # Use the known property when the molecule belongs to the dataset.
    if smiles_toviz_can in np.array(data.smiles):
        smiles_toviz_y = np.array([[data.iloc[np.where(data.smiles == smiles_toviz_x[0])[0][0],1]]])
    else:
        smiles_toviz_y = np.array([[np.nan]])

    # Data augmentation or not. Truthiness is used (instead of `== True`) so
    # that this choice always agrees with the directory selected above.
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
    x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
    x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)
    smiles_toviz_x_enum, smiles_toviz_x_enum_card, smiles_toviz_y_enum = \
        augm.Augmentation(smiles_toviz_x, smiles_toviz_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
          format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)
    smiles_toviz_x_enum_tokens = token.get_tokens(smiles_toviz_x_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
          format(x_train_enum_tokens[:5]))

    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))

    # Tokens as a list
    tokens = token.get_vocab(input_dir+data_name+'_Vocabulary.txt')
    # The size must describe the vocabulary actually loaded: it fixes the
    # embedding size of the model rebuilt below, whose weights are loaded
    # from the trained model.
    vocab_size = len(tokens)
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

    # Transformation of integers back to tokens (for the tick labels)
    int_to_token = token.get_inttotoken(tokens)

    # Locate the trained model. The seed-based name is kept for models saved
    # under that convention; otherwise use the fold-based name written by
    # Main() in this file.
    model_file = input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5'
    if not os.path.exists(model_file):
        model_file = input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5'

    # Best architecture to visualize from
    model_topredict = load_model(model_file,
                                 custom_objects={'AttentionM': model.AttentionM()})
    # Layer indices follow the original code: 1 = embedding, 2 = LSTM whose
    # output width is twice the number of units (presumably bidirectional --
    # confirm against model.LSTMAttModel), 3 = dense.
    best_arch = [model_topredict.layers[2].output_shape[-1]/2,
                 model_topredict.layers[3].output_shape[-1],
                 model_topredict.layers[1].output_shape[-1]]

    # Architecture to return attention weights
    model_att = model.LSTMAttModel.create(inputtokens = max_length+1,
                                          vocabsize = vocab_size,
                                          lstmunits= int(best_arch[0]),
                                          denseunits = int(best_arch[1]),
                                          embedding = int(best_arch[2]),
                                          return_proba = True)

    print("Best model summary:\n")
    print(model_att.summary())
    print("\n")

    print("***Interpretation from the best model.***\n")
    model_att.load_weights(model_file)
    model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens,
                                                               max_length = max_length+1,
                                                               vocab = tokens)

    # Output of the second-to-last layer, used as per-token attention weights
    intermediate_layer_model = Model(inputs=model_att.input,
                                     outputs=model_att.layers[-2].output)
    intermediate_output = intermediate_layer_model.predict(smiles_toviz_x_enum_tokens_tointvec)

    # Index of the first enumerated SMILES of the molecule to visualize
    smiles_toviz_x_card_cumsum_viz = np.cumsum(smiles_toviz_x_enum_card)
    smiles_toviz_x_card_cumsum_shift_viz = shift(smiles_toviz_x_card_cumsum_viz, 1, cval=0)

    mols_id = 0
    ienumcard = smiles_toviz_x_card_cumsum_shift_viz[mols_id]

    # --- 1D attention map ---
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    # Keep the positions of the SMILES tokens only: the slice drops the
    # leading padding and the first and last tokens of the tokenized SMILES.
    intermediate_output_tmp = intermediate_output[ienumcard,-smiles_len_tmp+1:-1].flatten().reshape(1,-1)

    plt.matshow(intermediate_output_tmp,
                cmap='Reds')
    plt.tick_params(axis='x', bottom = False)
    plt.xticks(range(smiles_len_tmp-2),
               [int_to_token[iint].replace('pad','') \
                for iint in smiles_toviz_x_enum_tokens_tointvec[ienumcard,-smiles_len_tmp+1:-1]],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks([])
    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')

    # --- 2D attention map on the molecular graph ---
    smiles_tmp = smiles_toviz_x_enum[ienumcard]
    mol_tmp = Chem.MolFromSmiles(smiles_tmp)
    mol_df_tmp = pd.DataFrame([smiles_toviz_x_enum_tokens[ienumcard][1:-1],
                               intermediate_output[ienumcard].\
                               flatten().\
                               tolist()[-smiles_len_tmp+1:-1]]).transpose()
    # Keep atom tokens only: drop bond, branch and ring-closure symbols
    bond = ['-','=','#','$','/','\\','.','(',')']
    mol_df_tmp = mol_df_tmp[~mol_df_tmp.iloc[:,0].isin(bond)]
    mol_df_tmp = mol_df_tmp[[not itoken.isdigit() for itoken in mol_df_tmp.iloc[:,0].values.tolist()]]

    minmaxscaler = MinMaxScaler(feature_range=(0,1))
    norm_weights = minmaxscaler.fit_transform(mol_df_tmp.iloc[:,1].values.reshape(-1,1)).flatten().tolist()
    fig = GetSimilarityMapFromWeights(mol=mol_tmp,
                                      size = (250,250),
                                      scale=-1,
                                      sigma=0.05,
                                      weights=norm_weights,
                                      colorMap='Reds',
                                      contourLines = 10,
                                      alpha = 0.25)
    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')

    # --- Prediction for the full SMILES ---
    model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    y_pred_test_tmp = model_topredict.predict(smiles_toviz_x_enum_tokens_tointvec[ienumcard].reshape(1,-1))[0,0]
    y_test_tmp = smiles_toviz_y_enum[ienumcard,0]
    y_pred_unscaled = scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]
    if not np.isnan(y_test_tmp):
        print("True value: {0:.2f} Predicted: {1:.2f}".format(y_test_tmp, y_pred_unscaled))
    else:
        print("Predicted: {0:.2f}".format(y_pred_unscaled))

    # --- Temporal relative distance ---
    # Feed growing prefixes of the SMILES (closed by a termination space) and
    # record the relative distance of each prediction to the prediction
    # obtained from the complete SMILES. Both are in the scaled space.
    diff_topred_list = []
    for csubsmiles in range(1,smiles_len_tmp):
        isubsmiles = smiles_toviz_x_enum_tokens[ienumcard][:csubsmiles]+[' ']
        isubsmiles_tointvec= token.int_vec_encode(tokenized_smiles_list = [isubsmiles],
                                                  max_length = max_length+1,
                                                  vocab = tokens)
        predict_prop_tmp = model_topredict.predict(isubsmiles_tointvec)[0,0]
        diff_topred_list.append((predict_prop_tmp-y_pred_test_tmp)/np.abs(y_pred_test_tmp))

    plt.figure(figsize=(15,7))
    markers, stemlines, baseline = plt.stem([ix for ix in range(smiles_len_tmp-1)],
                                            diff_topred_list,
                                            'k.-',
                                            use_line_collection=True)
    plt.setp(baseline, color='k', linewidth=2, linestyle='--')
    plt.setp(markers, linewidth=1, marker='o', markersize=10, markeredgecolor = 'black')
    plt.setp(stemlines, color = 'k', linewidth=0.5, linestyle='-')
    plt.xticks(range(smiles_len_tmp-1),
               smiles_toviz_x_enum_tokens[ienumcard][:-1],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks(fontsize = 20)
    plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')