def Eva(n_neighbors, min_dist, log_file):
    """Refit the fingerprint map with the candidate UMAP parameters and
    evaluate; mp2, X2, the split indices and label arrays come from module scope."""
    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})

    # refit the fingerprint MolMap, then rearrange the precomputed features
    mp_new = loadmap('../fingerprint.mp')
    mp_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)
    X_new = mp2.rearrangement(X2, mp_new)

    trainX = X_new[train_idx]
    validX = X_new[valid_idx]
    testX = X_new[test_idx]

    clf = MultiLabelEstimator(n_outputs=1,
                              fmap_shape1=trainX.shape[1:],
                              batch_size=128,
                              dense_layers=[128, 32],
                              gpuid="0",
                              patience=1000000,  # find best epoch in total 200 epochs
                              monitor='val_auc',
                              epochs=200)
    clf.fit(trainX, trainY, validX, validY)

    best_epoch = clf._performance.best_epoch
    train_aucs = clf._performance.evaluate(trainX, trainY)
    valid_aucs = clf._performance.evaluate(validX, validY)
    test_aucs = clf._performance.evaluate(testX, testY)

    train_best_auc = np.nanmean(train_aucs)
    valid_best_auc = np.nanmean(valid_aucs)
    test_auc = np.nanmean(test_aucs)

    # validation loss at the best epoch, read back from the training history
    dfx = pd.DataFrame(clf._performance.history)
    valid_best_loss = dfx[dfx.epoch == best_epoch].val_loss.iloc[0]

    with open(log_file, 'a') as f:
        f.write(','.join([str(min_dist), str(n_neighbors),
                          str(valid_best_loss), str(valid_best_auc),
                          str(train_best_auc), str(best_epoch),
                          str(test_auc)]) + '\n')
    return [valid_best_auc, train_best_auc, best_epoch]
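# A minimal driver sketch for Eva() above: a plain grid over the two UMAP
# hyper-parameters. The candidate values and the log name are illustrative
# assumptions, not from the original script.
if __name__ == '__main__':
    log_file = 'fingerprint_umap_hpo.log'  # hypothetical file name
    for n_neighbors in [15, 30, 50, 100]:
        for min_dist in [0.1, 0.25, 0.5, 0.75]:
            Eva(n_neighbors, min_dist, log_file)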
tox21 = dataset.load_Tox21()  # used below but never loaded; loader name assumed from the pattern
toxcast = dataset.load_ToxCast()
sider = dataset.load_SIDER()
clintox = dataset.load_ClinTox()
muv = dataset.load_MUV()
datasets = [muv, tox21, toxcast, sider, clintox]

MASK = -1
tmp_feature_dir = '/raid/shenwanxiang/10_FP_effect/tempignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

# load one pre-fitted MolMap per fingerprint type
# (fp_types: the list of fingerprint type names defined earlier in the script)
mps = []
fp_save_folder = '/raid/shenwanxiang/FP_maps'
for fp_type in fp_types:
    mp = loadmap(os.path.join(fp_save_folder, '%s.mp' % fp_type))
    mps.append(mp)

classification_res = []

## classification
for data in datasets:
    task_name = data.task_name
    task_type = data.task_type
    _, induces = load_data(task_name)

    smiles = data.x
    Y = pd.DataFrame(data.y).fillna(MASK).values

    for mp, fp_type in zip(mps, fp_types):
        print(fp_type)
import os
import numpy as np
import tensorflow as tf
import molmap

os.environ["CUDA_VISIBLE_DEVICES"] = "6"
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

#tmp_feature_dir = './tmpignore'
tmp_feature_dir = '/raid/shenwanxiang/tempignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)


# In[2]:

mp1 = molmap.loadmap('../descriptor.mp')
mp2 = molmap.loadmap('../fingerprint.mp')


# In[3]:

task_name = 'PCBA'
from chembench import load_data
df, induces = load_data(task_name)
print(len(induces[0][0]), len(induces[0][1]), len(induces[0][2]), df.shape)

nan_idx = df[df.smiles.isna()].index.to_list()

MASK = -1
smiles_col = df.columns[0]
values_col = df.columns[1:]
Y = df[values_col].astype('float').fillna(MASK).values
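# A short sketch of consuming the cached split indices (assumption: each
# element of `induces` is a (train, valid, test) index triple, as the
# print above suggests).
train_idx, valid_idx, test_idx = induces[0]
trainY = Y[train_idx]
validY = Y[valid_idx]
testY = Y[test_idx]
print(trainY.shape, validY.shape, testY.shape)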
# load dataset
data = dataset.load_ESOL()
df = data.data
Y = data.y

# random 80/20 train/valid split
valid_idx = df.sample(frac=0.2).index.to_list()
train_idx = list(set(df.index) - set(valid_idx))

batch_size = 200
res = []
for epochs in [1, 10, 50, 100, 150, 300, 500]:
    start = time.time()
    mp = loadmap('../descriptor.mp')
    X = mp.batch_transform(data.x, n_jobs=10)
    trainX = X[train_idx]
    validX = X[valid_idx]
    trainY = Y[train_idx]
    validY = Y[valid_idx]

    performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance(
        (trainX, trainY), (validX, validY),
        patience=10000000000,
        criteria='val_loss')
    model = molmodel.net.SinglePathNet(X.shape[1:],
                                       n_outputs=1,
                                       dense_layers=[128, 32],
                                       dense_avf='tanh',   # closing arguments assumed,
                                       last_avf='linear')  # mirroring the UMAP script's Eva()
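    # Hedged continuation of the timing loop: compile, train for `epochs`,
    # and record wall-clock time (the fit/compile pattern mirrors the Eva()
    # bodies elsewhere in this repo; the res-row layout is an assumption).
    opt = tf.keras.optimizers.Adam(lr=1e-4)
    model.compile(optimizer=opt, loss='mse')
    model.fit(trainX, trainY, batch_size=batch_size, epochs=epochs,
              verbose=0, shuffle=True,
              validation_data=(validX, validY), callbacks=[performance])
    res.append({'epochs': epochs,
                'seconds': time.time() - start,
                'best_epoch': performance.best_epoch})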
train_df = pd.read_csv('./train.csv')
valid_df = pd.read_csv('./val.csv')
test_df = pd.read_csv('./test.csv')

train_idx = df[df.smiles.isin(train_df.smiles)].index
valid_idx = df[df.smiles.isin(valid_df.smiles)].index
test_idx = df[df.smiles.isin(test_df.smiles)].index

trainY = Y[train_idx]
validY = Y[valid_idx]
testY = Y[test_idx]
print(len(train_idx), len(valid_idx), len(test_idx))

mp2 = loadmap('../fingerprint.mp')

tmp_feature_dir = '/raid/shenwanxiang/08_Robustness/tempignore'  # feature path
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

# cache the (slow) batch fingerprint transform on disk
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X2_name):
    X2 = mp2.batch_transform(df.smiles, n_jobs=8)
    dump(X2, X2_name)
else:
    X2 = load(X2_name)


def get_pos_weights(trainY):
    """pos_weights: neg_n / pos_n """
    # body restored: per-task neg/pos label counts; MASK (-1) entries count
    # toward neither class
    dfY = pd.DataFrame(trainY)
    pos_n = (dfY == 1).sum(axis=0)
    neg_n = (dfY == 0).sum(axis=0)
    pos_weights = (neg_n / pos_n).values
    neg_weights = (pos_n / neg_n).values
    return pos_weights, neg_weights
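# A quick toy check of get_pos_weights on a 3-task label matrix containing
# MASK = -1 entries (illustrative data, not from the original script).
import numpy as np
toyY = np.array([[1,  0, -1],
                 [0,  0,  1],
                 [1,  1, -1],
                 [0, -1,  1]])
pos_w, neg_w = get_pos_weights(toyY)
print(pos_w)  # per-task neg_n / pos_n -> [1.0, 2.0, 0.0]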
def Eva(n_neighbors, min_dist):
    """Refit the descriptor map with the candidate UMAP parameters and
    evaluate; mp1, X1, the split indices, labels and log_file come from module scope."""
    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})

    mp_new = loadmap('../../descriptor.mp')
    mp_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)
    X_new = mp1.rearrangement(X1, mp_new)

    trainX = X_new[train_idx]
    validX = X_new[valid_idx]

    opt = tf.keras.optimizers.Adam(lr=1e-4, beta_1=0.9, beta_2=0.999,
                                   epsilon=1e-08, decay=0.0)
    model = molmodel.net.SinglePathNet(trainX.shape[1:],
                                       n_outputs=1,
                                       dense_layers=[128, 32],
                                       dense_avf='tanh',
                                       last_avf='linear')
    model.compile(optimizer=opt, loss='mse')

    performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance(
        (trainX, trainY), (validX, validY),
        patience=1000000,  # find best epoch in total 500 epochs
        criteria='val_loss')
    model.fit(trainX, trainY,
              batch_size=128, epochs=500, verbose=0, shuffle=True,
              validation_data=(validX, validY),
              callbacks=[performance])
    #performance.model.set_weights(performance.best_weights)  # set best model as the final model

    valid_rmse, valid_r2 = performance.evaluate(validX, validY)
    train_rmse, train_r2 = performance.evaluate(trainX, trainY)
    valid_best_rmse = np.nanmean(valid_rmse)
    train_best_rmse = np.nanmean(train_rmse)
    valid_best_loss = performance.best
    best_epoch = performance.best_epoch

    with open(log_file, 'a') as f:
        f.write(','.join([str(min_dist), str(n_neighbors),
                          str(valid_best_loss), str(valid_best_rmse),
                          str(train_best_rmse), str(best_epoch)]) + '\n')
    return [valid_best_loss, valid_best_rmse, train_best_rmse, best_epoch]
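# A short sketch of picking the best UMAP pair from the rows Eva() appends
# (assumes the log_file created in this script's setup; the column names
# are assumptions matching Eva's write order).
import pandas as pd
cols = ['min_dist', 'n_neighbors', 'valid_loss', 'valid_rmse', 'train_rmse', 'best_epoch']
dfl = pd.read_csv(log_file, names=cols)
print(dfl.sort_values('valid_loss').head(3))  # lowest validation loss first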
def get_attentiveFP_idx(df):
    """Map the AttentiveFP train/valid/test split files onto df's index.
    (Head restored; the CSV file names are assumed to follow the companion
    robustness script.)"""
    train = pd.read_csv('./train.csv')
    valid = pd.read_csv('./val.csv')
    test = pd.read_csv('./test.csv')
    train_idx = df[df.smiles.isin(train.smiles)].index
    valid_idx = df[df.smiles.isin(valid.smiles)].index
    test_idx = df[df.smiles.isin(test.smiles)].index
    print('training set: %s, valid set: %s, test set %s' %
          (len(train_idx), len(valid_idx), len(test_idx)))
    return train_idx, valid_idx, test_idx


# load dataset
data = dataset.load_ESOL()
df = data.data
Y = data.y
task_name = 'ESOL'

tmp_feature_dir = './tmpignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

# cache the descriptor features on disk
mp1 = loadmap('../../descriptor.mp')
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(df.smiles, n_jobs=8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)

train_idx, valid_idx, test_idx = get_attentiveFP_idx(df)
trainY = Y[train_idx]
validY = Y[valid_idx]

import time
start_time = str(time.ctime()).replace(':', '-').replace(' ', '_')
log_file = data.task_name + '_' + start_time + '.log'
data = dataset.load_FreeSolv()
task_name = data.task_name
smiles = data.x
df = data.data


# In[5]:

from chembench import load_data
_, induces = load_data(task_name)


# In[6]:

mp1 = loadmap('../descriptor.mp')
mp2 = loadmap('../fingerprint.mp')


# In[7]:

tmp_feature_dir = '/raid/shenwanxiang/09_batchsize_effect/tempignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(smiles, n_jobs=8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)

# parallel caching block for the fingerprint features (restored, mirroring
# the X1 pattern above; X2_name is defined but its block was cut off)
if not os.path.exists(X2_name):
    X2 = mp2.batch_transform(smiles, n_jobs=8)
    dump(X2, X2_name)
else:
    X2 = load(X2_name)
def Eva(n_neighbors, min_dist):
    """Refit both maps with the candidate UMAP parameters and evaluate the
    double-path net; the maps, features, splits and training config come
    from module scope."""
    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})

    mp1_new = loadmap('../descriptor.mp')
    mp1_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)
    mp2_new = loadmap('../fingerprint.mp')
    mp2_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)

    X1_new = mp1.rearrangement(X1, mp1_new)
    X2_new = mp2.rearrangement(X2, mp2_new)

    trainX = (X1_new[train_idx], X2_new[train_idx])
    validX = (X1_new[valid_idx], X2_new[valid_idx])

    # per-task positive-class weights for the masked cross-entropy
    pos_weights, neg_weights = get_pos_weights(trainY)
    loss = lambda y_true, y_pred: molmodel.loss.weighted_cross_entropy(
        y_true, y_pred, pos_weights, MASK=-1)

    model = molmodel.net.DoublePathNet(molmap1_size, molmap2_size,
                                       n_outputs=Y.shape[-1],
                                       dense_layers=dense_layers,
                                       dense_avf=dense_avf,
                                       last_avf=last_avf)
    opt = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999,
                                   epsilon=1e-08, decay=0.0)
    #import tensorflow_addons as tfa
    #opt = tfa.optimizers.AdamW(weight_decay=0.1, learning_rate=0.001,
    #                           beta1=0.9, beta2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss=loss)

    performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance(
        (trainX, trainY), (validX, validY),
        patience=patience,
        criteria=monitor,
        metric='ROC')
    model.fit(trainX, trainY,
              batch_size=batch_size, epochs=epochs, verbose=0, shuffle=True,
              validation_data=(validX, validY),
              callbacks=[performance])

    best_epoch = performance.best_epoch
    train_aucs = performance.evaluate(trainX, trainY)
    valid_aucs = performance.evaluate(validX, validY)
    train_best_auc = np.nanmean(train_aucs)
    valid_best_auc = np.nanmean(valid_aucs)

    # validation loss at the best epoch, read back from the training history
    dfx = pd.DataFrame(performance.history)
    valid_best_loss = dfx[dfx.epoch == best_epoch].val_loss.iloc[0]

    with open(log_file, 'a') as f:
        f.write(','.join([str(min_dist), str(n_neighbors),
                          str(valid_best_loss), str(valid_best_auc),
                          str(train_best_auc), str(best_epoch)]) + '\n')
    return [valid_best_auc, train_best_auc, best_epoch]
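# A plain-TensorFlow sketch of what the weighted-cross-entropy lambda above
# computes (an assumption about molmodel.loss internals, inferred from the
# call signature; it assumes the net emits logits, i.e. last_avf='linear').
import tensorflow as tf

def masked_weighted_bce(y_true, y_pred, pos_weights, MASK=-1):
    # drop MASK entries from the loss; weight positives per task by neg_n / pos_n
    mask = tf.cast(tf.not_equal(y_true, MASK), tf.float32)
    labels = tf.clip_by_value(tf.cast(y_true, tf.float32), 0.0, 1.0)
    per_entry = tf.nn.weighted_cross_entropy_with_logits(
        labels=labels, logits=y_pred,
        pos_weight=tf.cast(pos_weights, tf.float32))
    return tf.reduce_sum(per_entry * mask) / tf.reduce_sum(mask)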
def split(df, random_state=1, split_size=[0.8, 0.1, 0.1]):
    """Random train/valid/test split. Head restored from the tail below;
    the split sizes and sklearn's shuffle are assumptions."""
    from sklearn.utils import shuffle
    base_indices = np.arange(len(df))
    base_indices = shuffle(base_indices, random_state=random_state)
    nb_test = int(len(base_indices) * split_size[2])
    nb_val = int(len(base_indices) * split_size[1])
    test_idx = base_indices[0:nb_test]
    valid_idx = base_indices[nb_test:(nb_test + nb_val)]
    train_idx = base_indices[(nb_test + nb_val):len(base_indices)]
    print(len(train_idx), len(valid_idx), len(test_idx))
    return train_idx, valid_idx, test_idx


if __name__ == '__main__':
    epochs = 500
    patience = 30
    batch_size = 128
    lr = 0.0001
    data_split_seed = 1

    mp1 = molmap.loadmap('../descriptor_grid_split.mp')
    mp2 = molmap.loadmap('../fingerprint_grid_split.mp')

    for cell_line in dataset.cell_lines:
        df = dataset.load_data(cell_line)
        df = df[~df.pIC50.isna()].reset_index(drop=True)
        train_idx, valid_idx, test_idx = split(df, random_state=data_split_seed)
        Y = df['pIC50'].astype('float').values.reshape(-1, 1)

        # cache features per cell line (else branch and fingerprint block
        # restored, mirroring the caching pattern used elsewhere)
        X1_name = 'X1_%s.data' % cell_line
        X2_name = 'X2_%s.data' % cell_line
        if not os.path.exists(X1_name):
            X1 = mp1.batch_transform(df.smiles, n_jobs=8)
            dump(X1, X1_name)
        else:
            X1 = load(X1_name)
        if not os.path.exists(X2_name):
            X2 = mp2.batch_transform(df.smiles, n_jobs=8)
            dump(X2, X2_name)
        else:
            X2 = load(X2_name)