def Eva(n_neighbors, min_dist, log_file):

    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})
    mp_new = loadmap('../fingerprint.mp')
    mp_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)
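    # rearrange the precomputed features X2 (built with the original map mp2)
    # onto the newly fitted 2D layout, instead of re-extracting fingerprints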
    X_new = mp2.rearrangement(X2, mp_new)

    trainX = X_new[train_idx]
    validX = X_new[valid_idx]
    testX = X_new[test_idx]

    clf = MultiLabelEstimator(
        n_outputs=1,
        fmap_shape1=trainX.shape[1:],
        batch_size=128,
        dense_layers=[128, 32],
        gpuid="0",
        patience=1000000,  # effectively disables early stopping: the best epoch is picked over all 200 epochs
        monitor='val_auc',
        epochs=200)

    clf.fit(trainX, trainY, validX, validY)

    best_epoch = clf._performance.best_epoch
    train_aucs = clf._performance.evaluate(trainX, trainY)
    valid_aucs = clf._performance.evaluate(validX, validY)
    test_aucs = clf._performance.evaluate(testX, testY)

    train_best_auc = np.nanmean(train_aucs)
    valid_best_auc = np.nanmean(valid_aucs)
    test_auc = np.nanmean(test_aucs)

    dfx = pd.DataFrame(clf._performance.history)
    valid_best_loss = dfx[dfx.epoch ==
                          clf._performance.best_epoch].val_loss.iloc[0]

    with open(log_file, 'a') as f:
        f.write(','.join([
            str(min_dist),
            str(n_neighbors),
            str(valid_best_loss),
            str(valid_best_auc),
            str(train_best_auc),
            str(best_epoch),
            str(test_auc)
        ]) + '\n')

    return [valid_best_auc, train_best_auc, best_epoch]
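
# Hypothetical driver (not part of the original source): Eva is meant to be
# swept over the two UMAP layout parameters, appending one CSV row per setting
# to log_file. A minimal grid search, assuming the globals Eva uses (mp2, X2,
# the index and label arrays) are already defined; the log-file name and the
# parameter grids below are illustrative only:
log_file = 'umap_fingerprint_search.log'
with open(log_file, 'w') as f:
    f.write('min_dist,n_neighbors,valid_loss,valid_auc,train_auc,best_epoch,test_auc\n')
for n_neighbors in [15, 30, 50, 100]:
    for min_dist in [0.1, 0.25, 0.5]:
        Eva(n_neighbors, min_dist, log_file)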
Example #2
tox21 = dataset.load_Tox21()  # tox21 is used below but never loaded; loader name assumed from the pattern of the other calls
toxcast = dataset.load_ToxCast()
sider = dataset.load_SIDER()
clintox = dataset.load_ClinTox()
muv = dataset.load_MUV()

datasets = [muv, tox21, toxcast, sider, clintox]
MASK = -1

tmp_feature_dir = '/raid/shenwanxiang/10_FP_effect/tempignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

mps = []
fp_save_folder = '/raid/shenwanxiang/FP_maps'
for fp_type in fp_types:  # fp_types: list of fingerprint type names, defined elsewhere in the script
    mp = loadmap(os.path.join(fp_save_folder, '%s.mp' % fp_type))
    mps.append(mp)

classification_res = []
## classification
for data in datasets:

    task_name = data.task_name
    task_type = data.task_type
    _, induces = load_data(task_name)
    smiles = data.x
    Y = pd.DataFrame(data.y).fillna(MASK).values

    for mp, fp_type in zip(mps, fp_types):

        print(fp_type)

Example #3
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"  # set before importing TensorFlow so the device mask takes effect

import numpy as np
import tensorflow as tf

np.random.seed(123)
tf.compat.v1.set_random_seed(123)

#tmp_feature_dir = './tmpignore'
tmp_feature_dir = '/raid/shenwanxiang/tempignore'

if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

mp1 = molmap.loadmap('../descriptor.mp')
mp2 = molmap.loadmap('../fingerprint.mp')

task_name = 'PCBA'
from chembench import load_data
df, induces = load_data(task_name)
print(len(induces[0][0]), len(induces[0][1]), len(induces[0][2]), df.shape)

nan_idx = df[df.smiles.isna()].index.to_list()

MASK = -1
smiles_col = df.columns[0]
values_col = df.columns[1:]
Y = df[values_col].astype('float').fillna(MASK).values
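
# The (train, valid, test) index lists appear to live in induces; a sketch of
# unpacking one fold (an assumption based on the len() calls printed above):
# train_idx, valid_idx, test_idx = induces[0]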
Example #4
#load dataset
data = dataset.load_ESOL()
df = data.data
Y = data.y

valid_idx = df.sample(frac=0.2).index.to_list()
train_idx = list(set(df.index) - set(valid_idx))

batch_size = 200

res = []
for epochs in [1, 10, 50, 100, 150, 300, 500]:

    start = time.time()

    mp = loadmap('../descriptor.mp')
    X = mp.batch_transform(data.x, n_jobs=10)

    trainX = X[train_idx]
    validX = X[valid_idx]
    trainY = Y[train_idx]
    validY = Y[valid_idx]

    performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance(
        (trainX, trainY), (validX, validY),
        patience=10000000000,
        criteria='val_loss')

    model = molmodel.net.SinglePathNet(X.shape[1:],
                                       n_outputs=1,
                                       dense_layers=[128, 32],
                                       dense_avf='tanh',
                                       last_avf='linear')  # avf arguments completed from the identical call later on this page

Example #5
train_df = pd.read_csv('./train.csv')
valid_df = pd.read_csv('./val.csv')
test_df = pd.read_csv('./test.csv')

train_idx = df[df.smiles.isin(train_df.smiles)].index
valid_idx = df[df.smiles.isin(valid_df.smiles)].index
test_idx = df[df.smiles.isin(test_df.smiles)].index

trainY = Y[train_idx]
validY = Y[valid_idx]
testY = Y[test_idx]

print(len(train_idx), len(valid_idx), len(test_idx))

mp2 = loadmap('../fingerprint.mp')

tmp_feature_dir = '/raid/shenwanxiang/08_Robustness/tempignore'  #feature path
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

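# cache the transformed feature tensor on disk so repeated runs can skip the
# expensive batch_transform step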
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X2_name):
    X2 = mp2.batch_transform(df.smiles, n_jobs=8)
    dump(X2, X2_name)
else:
    X2 = load(X2_name)


def get_pos_weights(trainY):
    """pos_weights: neg_n / pos_n """
    # body reconstructed from the docstring (a sketch, not the original):
    # per-task ratio of negative to positive labels; masked (-1) entries
    # fall outside both counts
    dfY = pd.DataFrame(trainY)
    pos_n = (dfY == 1).sum(axis=0)
    neg_n = (dfY == 0).sum(axis=0)
    pos_weights = (neg_n / pos_n).values
    neg_weights = (pos_n / neg_n).values
    return pos_weights, neg_weights

Example #6

def Eva(n_neighbors, min_dist):

    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})
    mp_new = loadmap('../../descriptor.mp')
    mp_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)
    X_new = mp1.rearrangement(X1, mp_new)

    trainX = X_new[train_idx]
    validX = X_new[valid_idx]

    opt = tf.keras.optimizers.Adam(lr=1e-4,
                                   beta_1=0.9,
                                   beta_2=0.999,
                                   epsilon=1e-08,
                                   decay=0.0)
    model = molmodel.net.SinglePathNet(trainX.shape[1:],
                                       n_outputs=1,
                                       dense_layers=[128, 32],
                                       dense_avf='tanh',
                                       last_avf='linear')

    model.compile(optimizer=opt, loss='mse')
    performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance(
        (trainX, trainY),
        (validX, validY),
        patience=1000000,  # effectively disables early stopping: the best epoch is picked over all 500 epochs
        criteria='val_loss')
    model.fit(trainX,
              trainY,
              batch_size=128,
              epochs=500,
              verbose=0,
              shuffle=True,
              validation_data=(validX, validY),
              callbacks=[performance])

    #performance.model.set_weights(performance.best_weights) #set best model as the final model

    valid_rmse, valid_r2 = performance.evaluate(validX, validY)
    train_rmse, train_r2 = performance.evaluate(trainX, trainY)

    valid_best_rmse = np.nanmean(valid_rmse)
    train_best_rmse = np.nanmean(train_rmse)
    valid_best_loss = performance.best

    best_epoch = performance.best_epoch

    with open(log_file, 'a') as f:
        f.write(','.join([
            str(min_dist),
            str(n_neighbors),
            str(valid_best_loss),
            str(valid_best_rmse),
            str(train_best_rmse),
            str(best_epoch)
        ]) + '\n')

    return [valid_best_loss, valid_best_rmse, train_best_rmse, best_epoch]

Example #7

def get_attentiveFP_idx(df):
    # NOTE: the head of this helper is missing from the source; this
    # reconstruction assumes train/valid/test DataFrames with a smiles
    # column (the published AttentiveFP splits) are loaded above
    train_idx = df[df.smiles.isin(train.smiles)].index
    valid_idx = df[df.smiles.isin(valid.smiles)].index
    test_idx = df[df.smiles.isin(test.smiles)].index
    print('training set: %s, valid set: %s, test set %s' %
          (len(train_idx), len(valid_idx), len(test_idx)))
    return train_idx, valid_idx, test_idx


#load dataset
data = dataset.load_ESOL()
df = data.data
Y = data.y

task_name = 'ESOL'
tmp_feature_dir = './tmpignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)
mp1 = loadmap('../../descriptor.mp')

X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(df.smiles, n_jobs=8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)

train_idx, valid_idx, test_idx = get_attentiveFP_idx(df)
trainY = Y[train_idx]
validY = Y[valid_idx]

import time
start_time = str(time.ctime()).replace(':', '-').replace(' ', '_')
log_file = data.task_name + '_' + start_time + '.log'
Example #8

data = dataset.load_FreeSolv()

task_name = data.task_name
smiles = data.x
df = data.data

from chembench import load_data
_, induces = load_data(task_name)

mp1 = loadmap('../descriptor.mp')
mp2 = loadmap('../fingerprint.mp')

tmp_feature_dir = '/raid/shenwanxiang/09_batchsize_effect/tempignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(smiles, n_jobs=8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)
Example #9
def Eva(n_neighbors, min_dist):

    print({'min_dist': min_dist, 'n_neighbors': n_neighbors})

    mp1_new = loadmap('../descriptor.mp')
    mp1_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)

    mp2_new = loadmap('../fingerprint.mp')
    mp2_new.fit(method='umap', min_dist=min_dist, n_neighbors=n_neighbors)

    X1_new = mp1.rearrangement(X1, mp1_new)
    X2_new = mp2.rearrangement(X2, mp2_new)

    trainX = (X1_new[train_idx], X2_new[train_idx])
    validX = (X1_new[valid_idx], X2_new[valid_idx])

    pos_weights, neg_weights = get_pos_weights(trainY)
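    # class-weighted cross-entropy; judging by the MASK=-1 argument, entries
    # equal to the mask value are excluded from the loss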
    loss = lambda y_true, y_pred: molmodel.loss.weighted_cross_entropy(
        y_true, y_pred, pos_weights, MASK=-1)

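    # molmap1_size / molmap2_size: the two fmap input shapes (presumably
    # X1_new.shape[1:] and X2_new.shape[1:]), defined elsewhere in the script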
    model = molmodel.net.DoublePathNet(molmap1_size,
                                       molmap2_size,
                                       n_outputs=Y.shape[-1],
                                       dense_layers=dense_layers,
                                       dense_avf=dense_avf,
                                       last_avf=last_avf)

    opt = tf.keras.optimizers.Adam(lr=lr,
                                   beta_1=0.9,
                                   beta_2=0.999,
                                   epsilon=1e-08,
                                   decay=0.0)
    #import tensorflow_addons as tfa
    #opt = tfa.optimizers.AdamW(weight_decay = 0.1,learning_rate=0.001,beta1=0.9,beta2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss=loss)

    performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance(
        (trainX, trainY),
        (validX, validY),
        patience=patience,
        criteria=monitor,
        metric='ROC',
    )
    model.fit(trainX,
              trainY,
              batch_size=batch_size,
              epochs=epochs,
              verbose=0,
              shuffle=True,
              validation_data=(validX, validY),
              callbacks=[performance])

    best_epoch = performance.best_epoch
    train_aucs = performance.evaluate(trainX, trainY)
    valid_aucs = performance.evaluate(validX, validY)

    train_best_auc = np.nanmean(train_aucs)
    valid_best_auc = np.nanmean(valid_aucs)

    dfx = pd.DataFrame(performance.history)
    valid_best_loss = dfx[dfx.epoch == performance.best_epoch].val_loss.iloc[0]

    with open(log_file, 'a') as f:
        f.write(','.join([
            str(min_dist),
            str(n_neighbors),
            str(valid_best_loss),
            str(valid_best_auc),
            str(train_best_auc),
            str(best_epoch)
        ]) + '\n')

    return [valid_best_auc, train_best_auc, best_epoch]
Example #10
def split(df, random_state=1, frac_valid=0.1, frac_test=0.1):
    # NOTE: the head of this function is missing from the source; this is a
    # reconstructed sketch (argument names and fractions are assumptions)
    # of a shuffled random split consistent with the surviving tail below
    base_indices = np.random.RandomState(random_state).permutation(len(df))
    nb_test = int(len(df) * frac_test)
    nb_val = int(len(df) * frac_valid)
    test_idx = base_indices[0:nb_test]
    valid_idx = base_indices[nb_test:(nb_test + nb_val)]
    train_idx = base_indices[(nb_test + nb_val):len(base_indices)]

    print(len(train_idx), len(valid_idx), len(test_idx))

    return train_idx, valid_idx, test_idx


if __name__ == '__main__':

    epochs = 500
    patience = 30
    batch_size = 128
    lr = 0.0001
    data_split_seed = 1

    mp1 = molmap.loadmap('../descriptor_grid_split.mp')
    mp2 = molmap.loadmap('../fingerprint_grid_split.mp')

    for cell_line in dataset.cell_lines:

        df = dataset.load_data(cell_line)
        df = df[~df.pIC50.isna()].reset_index(drop=True)
        train_idx, valid_idx, test_idx = split(df,
                                               random_state=data_split_seed)
        Y = df['pIC50'].astype('float').values.reshape(-1, 1)

        X1_name = 'X1_%s.data' % cell_line
        X2_name = 'X2_%s.data' % cell_line
        if not os.path.exists(X1_name):
            X1 = mp1.batch_transform(df.smiles, n_jobs=8)
            dump(X1, X1_name)
        else:  # completed from the identical caching pattern used earlier on this page
            X1 = load(X1_name)