Example #1
def initial_data_split(x, smiles_list, y, seed, split, test_size=0.2, init_size=0.25):
    """
    Splits data into an initial training subset, a holdout pool and a test set.
    :param x: input features
    :param smiles_list: list of SMILES strings (used for scaffold splitting)
    :param y: output targets
    :param seed: random seed for the splits
    :param split: splitting strategy, either 'random' or 'scaffold'
    :param test_size: size of test set expressed as fraction of total data - 20% (Yao's paper)
    :param init_size: size of initial training subset as fraction of training set - 25% (Yao's paper)
    :return: X_init, y_init, X_holdout, y_holdout, X_test, y_test
    """
    if split == 'random':
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
        X_holdout, X_init, y_holdout, y_init = train_test_split(X_train, y_train, test_size=init_size, random_state=seed)
    elif split == 'scaffold':
        train_ind, test_ind = scaffold_split(smiles_list, seed=seed)
        smiles_train = np.array(smiles_list)[train_ind]
        X_train, X_test = x[train_ind], x[test_ind]
        y_train, y_test = y[train_ind], y[test_ind]

        # split the training pool again by scaffold into holdout and initial subsets
        holdout_ind, init_ind = scaffold_split(smiles_train.tolist(), sizes=(1 - init_size, init_size), seed=seed)
        X_holdout, X_init = X_train[holdout_ind], X_train[init_ind]
        y_holdout, y_init = y_train[holdout_ind], y_train[init_ind]
    else:
        raise ValueError("split must be 'random' or 'scaffold'")
    y_init = y_init.reshape(-1, 1)
    y_holdout = y_holdout.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)
    return X_init, y_init, X_holdout, y_holdout, X_test, y_test
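
For reference, a minimal usage sketch with dummy data (not part of the original example); it assumes numpy, scikit-learn's train_test_split and the function above are importable, and uses split='random' so the project's scaffold_split helper is not needed:

# Minimal usage sketch (hypothetical data; split='random' only).
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 16))    # 100 molecules, 16 features each
y = rng.normal(size=100)          # scalar targets
smiles = ['C'] * 100              # placeholder SMILES, unused when split='random'

X_init, y_init, X_holdout, y_holdout, X_test, y_test = initial_data_split(
    x, smiles, y, seed=0, split='random')
print(X_init.shape, X_holdout.shape, X_test.shape)  # (20, 16) (60, 16) (20, 16)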
Example #2
def main(args):

    warnings.filterwarnings('ignore')

    print('\nTraining SOAP-GP on ' + args.task + ' dataset')
    print('\nGenerating features...')

    smiles_list, y  = parse_dataset(args.task, subtask=args.subtask)
    X = np.arange(len(smiles_list)).reshape(-1,1) # array of data indices

    if args.task!='IC50':
        rem_mat = np.load(args.kernel_path+args.task+'_soap.npy')
    else:
        print('Subtask: {}'.format(args.subtask))
        rem_mat = np.load(args.kernel_path+args.subtask+'_soap.npy')

    r2_list = []
    rmse_list = []

    print('\nBeginning training loop...')
  
    j = 0
    for i in range(args.n_runs):
        if args.split == 'random':
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
            split_list = kf.split(X)
        elif args.split == 'scaffold':
            train_ind, test_ind = scaffold_split(smiles_list, seed=i)
            split_list = [(train_ind, test_ind)]
        for train_ind, test_ind in split_list:
            X_train, X_test = X[train_ind], X[test_ind]
            y_train, y_test = y[train_ind], y[test_ind]

            X_train = tf.convert_to_tensor(X_train, dtype=tf.float64)
            X_test = tf.convert_to_tensor(X_test, dtype=tf.float64)

            m = train_soapgp(X_train, y_train, rem_mat)
  
            #mean and variance GP prediction
            y_pred, y_var = m.predict_f(X_test)

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
        
            r2_list.append(score)
            rmse_list.append(rmse)
        
            #np.savetxt('results/soapgp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
            #np.savetxt('results/soapgp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
            #np.savetxt('results/soapgp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
            j+=1

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
Example #3
def main(args):
    warnings.filterwarnings('ignore')

    print('\nTraining ECFP-RF on ' + args.task + ' dataset')
    print('\nGenerating features...')

    if args.task=='IC50':
        print('Subtask: {}'.format(args.subtask))

    smiles_list, y = parse_dataset(args.task, subtask=args.subtask)
    X = np.arange(len(smiles_list)).reshape(-1,1) # array of data indices

    r2_list = []
    rmse_list = []

    print('\nBeginning training loop...')

    j = 0
    for i in range(args.n_runs):
        if args.split == 'random':
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
            split_list = kf.split(X)
        elif args.split == 'scaffold':
            train_ind, test_ind = scaffold_split(smiles_list, seed=i)
            split_list = [(train_ind, test_ind)]
        for train_ind, test_ind in split_list:
            y_train, y_test = y[train_ind], y[test_ind]

            smiles_df = pd.DataFrame(smiles_list, columns=['smiles'])
            train_smiles = smiles_df.iloc[train_ind]['smiles'].to_list()
            test_smiles = smiles_df.iloc[test_ind]['smiles'].to_list()

            X_train = np.asarray([generate_fingerprints(s) for s in train_smiles])
            X_test = np.asarray([generate_fingerprints(s) for s in test_smiles])

            m = fit_forest(X_train, y_train)

            y_pred = m.predict(X_test)

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))

            r2_list.append(score)
            rmse_list.append(rmse)

            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
            # np.savetxt('results/ecfp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
            j += 1

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
Example #4
def main(task, split, n_runs, n_fold, n_bits):
    global rem_mat, rem_diag, max_rem

    warnings.filterwarnings('ignore')

    print('\nTraining E3FP-GP on '+task+' dataset')
    print('\nGenerating features...')

    if task in PATHS:
        smiles_list, y  = parse_dataset(task, PATHS[task]) #NEED TO FIX MALARIA
        X = np.arange(len(smiles_list)).reshape(-1,1)
    else:
        raise Exception('Must provide dataset')
    
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')
  
    if n_bits == -1:
        bit_list = [512, 1024, 2048, 4096, 8192]
    else:
        bit_list = [n_bits]
    for bits in bit_list:
        r2_list = []
        rmse_list = []
        logP_list = []
        j = 0
        for i in range(n_runs):
            if split == 'random':
                kf = KFold(n_splits=n_fold, random_state=i, shuffle=True)
                split_list = kf.split(X)
            elif split == 'scaffold':
                train_ind, test_ind = scaffold_split(smiles_list, seed=i)
                split_list = [(train_ind, test_ind)]
            for train_ind, test_ind in split_list:
                X_train, X_test = X[train_ind], X[test_ind]
                y_train, y_test = y[train_ind], y[test_ind]

                y_train, y_test, y_scaler = transform_data(y_train, y_test)

                X_train = tf.convert_to_tensor(X_train, dtype=tf.float64)
                X_test = tf.convert_to_tensor(X_test, dtype=tf.float64)

                #rem_mat = np.load('kernels/'+task+'_ecfp_'+str(bits)+'.npy')
                rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/'+task+'_e3fp.npy')

                rem_diag = tf.constant(np.diag(rem_mat), dtype=tf.float64)
                rem_mat = tf.constant(rem_mat, dtype=tf.float64)

                from gpflow.utilities import positive

                class Matern32_rem(gpflow.kernels.Kernel):
                    """Matern-3/2 kernel evaluated on a precomputed matrix, indexed by the
                    integer data indices that are passed in as the GP 'features'."""

                    def __init__(self):
                        super().__init__(active_dims=[0])
                        self.var = gpflow.Parameter(1.0, transform=positive())
                        self.mag = gpflow.Parameter(1.0, transform=positive())

                    def K(self, X, X2=None, presliced=None):
                        global rem_mat
                        if X2 is None:
                            X2 = X
                        A = tf.cast(X, tf.int32)
                        A = tf.reshape(A, [-1])
                        A2 = tf.reshape(X2, [-1])
                        A2 = tf.cast(A2, tf.int32)
                        # look up the precomputed entries for the requested row/column indices
                        K_mat = tf.gather(rem_mat, A, axis=0)
                        K_mat = tf.gather(K_mat, A2, axis=1)
                        z = tf.math.sqrt(3*K_mat)*self.var
                        K_final = self.mag*(1+z)*tf.math.exp(-z)
                        return K_final

                    def K_diag(self, X, presliced=None):
                        global rem_diag
                        A = tf.cast(X, tf.int32)
                        K_diag = tf.gather_nd(rem_diag, A)
                        z = tf.math.sqrt(3*K_diag)*self.var
                        return self.mag*(1+z)*tf.math.exp(-z)

                k = Matern32_rem() + gpflow.kernels.White(0.1)
                m = gpflow.models.GPR(data=(X_train, y_train), kernel=k)

                opt = gpflow.optimizers.Scipy()

                opt_logs = opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=10000))

                #print_summary(m)

                # mean and variance of the GP prediction, mapped back to the original target scale
                y_pred, y_var = m.predict_f(X_test)
                y_pred = y_scaler.inverse_transform(y_pred)
                y_test = y_scaler.inverse_transform(y_test)
                y_var = y_scaler.var_ * y_var
                score = r2_score(y_test, y_pred)

                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                logP = -m.log_likelihood()

                #print("\nR^2: {:.3f}".format(score))
                #print("RMSE: {:.3f}".format(rmse))
                #print("-ve logP: {:.3f}".format(logP))
                r2_list.append(score)
                rmse_list.append(rmse)
                logP_list.append(logP)

                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
                j += 1

        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        logP_list = np.array(logP_list)
        
        print("\nbits: {}".format(bits))
        print("mean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
        print("mean -ve logP: {:.4f} +- {:.4f}\n".format(np.mean(logP_list), np.std(logP_list)/np.sqrt(len(logP_list))))