Example #1
# Imports assumed by this snippet (not shown in the original excerpt):
import warnings

import numpy as np
import tensorflow as tf
import gpflow
from sklearn.metrics import mean_squared_error, r2_score

# parse_dataset, PATHS, initial_data_split, transform_data and suggest_sample
# are project-local helpers defined elsewhere in the repository.


def main(task, samp, split, seed, n_iter):
    global rem_mat, rem_diag
    warnings.filterwarnings('ignore')

    print(task + ' active learning with E3FP-GP, ' + samp + ' sampling, ' +
          split + ' splitting, seed = ' + str(seed))

    print('\nGenerating features...')
    if task in PATHS:
        smiles_list, y = parse_dataset(task, PATHS[task])  # TODO: fix Malaria parsing
        X = np.arange(len(smiles_list)).reshape(-1, 1)
    else:
        raise ValueError('Unknown dataset: ' + task)

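    # Acquisition batch size: 2.5% of the (assumed 80%) training pool.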
    n_samp = round(len(X) * 0.8 * 0.025)

    X_init, y_init, X_holdout, y_holdout, X_test, y_test = initial_data_split(
        X, smiles_list, y, seed, split)

    rmse_list = []

    rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/' + task +
                      '_e3fp.npy')

    max_rem = rem_mat.max()  # unused in this snippet
    rem_diag = tf.constant(np.diag(rem_mat), dtype=tf.float64)
    rem_mat = tf.constant(rem_mat, dtype=tf.float64)
    from gpflow.utilities import positive

    class Matern32_rem(gpflow.kernels.Kernel):
        def __init__(self):
            super().__init__(active_dims=[0])
            self.var = gpflow.Parameter(1.0, transform=positive())
            self.mag = gpflow.Parameter(1.0, transform=positive())

        def K(self, X, X2=None, presliced=None):
            global rem_mat
            if X2 is None:
                X2 = X
            A = tf.cast(X, tf.int32)
            A = tf.reshape(A, [-1])
            A2 = tf.reshape(X2, [-1])
            A2 = tf.cast(A2, tf.int32)
            K_mat = tf.gather(rem_mat, A, axis=0)
            K_mat = tf.gather(K_mat, A2, axis=1)
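            # Matern-3/2 form, treating the precomputed entries as squared
            # distances; self.var acts as an inverse lengthscale and self.mag
            # as the signal variance.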
            z = tf.math.sqrt(3 * K_mat) * self.var
            K_final = self.mag * (1 + z) * tf.math.exp(-z)
            return K_final

        def K_diag(self, X, presliced=None):
            global rem_diag
            A = tf.cast(X, tf.int32)
            K_diag = tf.gather_nd(rem_diag, A)
            z = tf.math.sqrt(3 * K_diag) * self.var
            return self.mag * (1 + z) * tf.math.exp(-z)

    m = None

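    # objective_closure late-binds `m`, so each call evaluates whichever model
    # was most recently assigned in the loop below.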
    def objective_closure():
        return -m.log_marginal_likelihood()

    opt = gpflow.optimizers.Scipy()

    # Active learning loop

    for i in range(n_iter + 1):
        y_init_scaled, y_test_scaled, y_scaler = transform_data(y_init, y_test)

        X_init_tf = tf.convert_to_tensor(X_init, dtype=tf.float64)
        X_test_tf = tf.convert_to_tensor(X_test, dtype=tf.float64)

        k = Matern32_rem() + gpflow.kernels.White(0.1)
        m = gpflow.models.GPR(data=(X_init_tf, y_init_scaled),
                              kernel=k,
                              mean_function=None,
                              noise_variance=1)

        opt_logs = opt.minimize(objective_closure,
                                m.trainable_variables,
                                options=dict(maxiter=10000))

        y_pred, _ = m.predict_f(X_test_tf)
        y_pred = y_scaler.inverse_transform(y_pred)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_list.append(rmse)
        r2 = r2_score(y_test, y_pred)
        print('\nIteration ' + str(i) + ' RMSE = ' + str(rmse))
        print('Iteration ' + str(i) + ' R2 = ' + str(r2))
        print('model training size = ' + str(len(X_init)))

        X_holdout_tf = tf.convert_to_tensor(X_holdout, dtype=tf.float64)
        y_init_scaled, y_holdout_scaled, y_scaler = transform_data(
            y_init, y_holdout)

        # Find sample indices and move them from the holdout pool into the training set
        sample_indices = suggest_sample(X_holdout_tf, m, samp, n_samp)
        X_init = np.vstack((X_init, X_holdout[sample_indices]))
        y_init = np.vstack((y_init, y_holdout[sample_indices]))
        X_holdout = np.delete(X_holdout, sample_indices, axis=0)
        y_holdout = np.delete(y_holdout, sample_indices, axis=0)

    # Saves rmse vs num acquisitions into a 'results' folder

    np.save(
        'results/e3fp_' + task + '_samp_' + samp + '_split_' + split +
        '_seed_' + str(seed) + '.npy', rmse_list)
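
# `transform_data` is a project-local helper not shown in these excerpts. From
# the way `y_scaler` is used above (inverse_transform, var_), it presumably
# standardises the regression targets; a minimal sketch under that assumption:
from sklearn.preprocessing import StandardScaler


def transform_data(y_train, y_test):
    # Fit the scaler on the training targets only, then apply it to both sets.
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1))
    return y_train_scaled, y_test_scaled, y_scaler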
Example #2
# Imports assumed by this snippet (not shown in the original excerpt):
import warnings

import numpy as np
import tensorflow as tf
import gpflow
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# parse_dataset, PATHS, scaffold_split and transform_data are project-local
# helpers defined elsewhere in the repository.


def main(task, split, n_runs, n_fold, n_bits):
    global rem_mat, rem_diag, max_rem

    warnings.filterwarnings('ignore')

    print('\nTraining E3FP-GP on '+task+' dataset')
    print('\nGenerating features...')

    if task in PATHS:
        smiles_list, y = parse_dataset(task, PATHS[task])  # TODO: fix Malaria parsing
        X = np.arange(len(smiles_list)).reshape(-1, 1)
    else:
        raise ValueError('Unknown dataset: ' + task)
    
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')
  
    if n_bits == -1:
        bit_list = [512, 1024, 2048, 4096, 8192]
    else:
        bit_list = [n_bits]
    for bits in bit_list:
        r2_list = []
        rmse_list = []
        logP_list = []
        j = 0
        for i in range(n_runs):
                if split == 'random':
                    kf = KFold(n_splits=n_fold, random_state=i, shuffle=True)
                    split_list = kf.split(X)
                elif split == 'scaffold':
                    train_ind, test_ind = scaffold_split(smiles_list, seed=i)
                    split_list = [(train_ind, test_ind)]
                for train_ind, test_ind in split_list:
                    X_train, X_test = X[train_ind], X[test_ind]
                    y_train, y_test = y[train_ind], y[test_ind]
                     
                    y_train, y_test, y_scaler = transform_data(y_train, y_test)

                    X_train = tf.convert_to_tensor(X_train, dtype=tf.float64)
                    X_test = tf.convert_to_tensor(X_test, dtype=tf.float64)
        
                    # ECFP variant: rem_mat = np.load('kernels/'+task+'_ecfp_'+str(bits)+'.npy')
                    # NOTE: the E3FP kernel file below does not depend on `bits`,
                    # so the bit_list loop only matters for the ECFP variant.
                    rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/' + task + '_e3fp.npy')

                    rem_diag = tf.constant(np.diag(rem_mat), dtype=tf.float64)
                    rem_mat = tf.constant(rem_mat, dtype=tf.float64)
               
                    from gpflow.utilities import positive

                    class Matern32_rem(gpflow.kernels.Kernel):
                        def __init__(self):
                            super().__init__(active_dims=[0])
                            self.var = gpflow.Parameter(1.0, transform=positive())
                            self.mag = gpflow.Parameter(1.0, transform=positive())

                        def K(self, X, X2=None, presliced=None):
                            global rem_mat
                            if X2 is None:
                                X2 = X
                            A = tf.reshape(tf.cast(X, tf.int32), [-1])
                            A2 = tf.reshape(tf.cast(X2, tf.int32), [-1])
                            K_mat = tf.gather(rem_mat, A, axis=0)
                            K_mat = tf.gather(K_mat, A2, axis=1)
                            z = tf.math.sqrt(3 * K_mat) * self.var
                            return self.mag * (1 + z) * tf.math.exp(-z)

                        def K_diag(self, X, presliced=None):
                            global rem_diag
                            A = tf.cast(X, tf.int32)
                            K_diag = tf.gather_nd(rem_diag, A)
                            z = tf.math.sqrt(3 * K_diag) * self.var
                            return self.mag * (1 + z) * tf.math.exp(-z)
        
                    k = Matern32_rem() + gpflow.kernels.White(0.1)
                    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k)

                    opt = gpflow.optimizers.Scipy()

                    opt_logs = opt.minimize(objective_closure,
                                            m.trainable_variables,
                                            options=dict(maxiter=10000))
               
                    #print_summary(m)
               
                    y_pred, y_var = m.predict_f(X_test)
                    y_pred = y_scaler.inverse_transform(y_pred)
                    y_test = y_scaler.inverse_transform(y_test)
                    y_var = y_scaler.var_ * y_var
                    score = r2_score(y_test, y_pred)
                
                    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                    logP = -m.log_likelihood()       
                
                    #print("\nR^2: {:.3f}".format(score))
                    #print("RMSE: {:.3f}".format(rmse))
                    #print("-ve logP: {:.3f}".format(logP))
                    r2_list.append(score)
                    rmse_list.append(rmse)
                    logP_list.append(logP)
                
                    np.savetxt('results/e3fp_' + task + '_split_' + split + '_run_' + str(j) + '_ypred.txt', y_pred)
                    np.savetxt('results/e3fp_' + task + '_split_' + split + '_run_' + str(j) + '_ytest.txt', y_test)
                    np.savetxt('results/e3fp_' + task + '_split_' + split + '_run_' + str(j) + '_ystd.txt', np.sqrt(y_var))
                    j += 1

        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        logP_list = np.array(logP_list)
        
        print("\nbits: {}".format(bits))
        print("mean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
        print("mean -ve logP: {:.4f} +- {:.4f}\n".format(np.mean(logP_list), np.std(logP_list)/np.sqrt(len(logP_list))))
Example #3
# Imports assumed by this snippet (not shown in the original excerpt):
import numpy as np
import torch
from torch.nn import MSELoss, BCELoss
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from rdkit import Chem
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import (auc, mean_absolute_error, mean_squared_error,
                             precision_recall_curve, r2_score, roc_auc_score)
from dgllife.utils import (CanonicalAtomFeaturizer, CanonicalBondFeaturizer,
                           mol_to_bigraph)
from dgllife.model import MPNNPredictor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# parse_dataset, PATHS and collate are project-local helpers defined elsewhere.


def main(args):
    """
    :param args: namespace with the attributes used below:
        task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
        reg: bool, True for regression, False for classification
        n_trials: int specifying the number of cross-validation repeats
        n_folds: int specifying the number of folds per repeat
        n_epochs: int specifying the number of training epochs
    """

    # data_loader = TaskDataLoader(args.task, args.path)
    # smiles_list, y = data_loader.load_property_data()

    smiles_list, y = parse_dataset(args.task, PATHS[args.task], args.reg)
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = [
        mol_to_bigraph(m,
                       node_featurizer=atom_featurizer,
                       edge_featurizer=bond_featurizer) for m in X
    ]

    r2_list = []
    rmse_list = []
    mae_list = []
    skipped_trials = 0

    for i in range(args.n_trials):

        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_set_size, random_state=i + 5)

        # StratifiedKFold needs discrete labels, so fall back to plain KFold
        # for regression targets.
        if args.reg:
            kf = KFold(n_splits=args.n_folds, random_state=i, shuffle=True)
        else:
            kf = StratifiedKFold(n_splits=args.n_folds,
                                 random_state=i,
                                 shuffle=True)
        split_list = kf.split(X, y)
        j = 0
        for train_ind, test_ind in split_list:
            if args.reg:
                writer = SummaryWriter('runs/' + args.task + '/mpnn/reg/run_' +
                                       str(i) + '_fold_' + str(j))
            else:
                writer = SummaryWriter('runs/' + args.task +
                                       '/mpnn/class/run_' + str(i) + '_fold_' +
                                       str(j))
            X_train, X_test = np.array(X)[train_ind], np.array(X)[test_ind]
            y_train, y_test = np.array(y)[train_ind], np.array(y)[test_ind]

            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            #  We standardise the outputs but leave the inputs unchanged
            if args.reg:
                y_scaler = StandardScaler()
                y_train_scaled = torch.Tensor(y_scaler.fit_transform(y_train))
                y_test_scaled = torch.Tensor(y_scaler.transform(y_test))
            else:
                y_train_scaled = torch.Tensor(y_train)
                y_test_scaled = torch.Tensor(y_test)

            train_data = list(zip(X_train, y_train_scaled))
            test_data = list(zip(X_test, y_test_scaled))

            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)
            test_loader = DataLoader(test_data,
                                     batch_size=32,
                                     shuffle=False,
                                     collate_fn=collate,
                                     drop_last=False)

            mpnn_net = MPNNPredictor(node_in_feats=n_feats,
                                     edge_in_feats=e_feats)
            mpnn_net.to(device)

            if args.reg:
                loss_fn = MSELoss()
            else:
                loss_fn = BCELoss()
            optimizer = torch.optim.Adam(mpnn_net.parameters(), lr=1e-4)

            mpnn_net.train()

            epoch_losses = []
            epoch_rmses = []
            for epoch in tqdm(range(1, args.n_epochs + 1)):
                epoch_loss = 0
                preds = []
                labs = []
                for bg, labels in tqdm(train_loader):
                    labels = labels.to(device)
                    atom_feats = bg.ndata.pop('h').to(device)
                    bond_feats = bg.edata.pop('e').to(device)
                    y_pred = mpnn_net(bg, atom_feats, bond_feats)
                    labels = labels.unsqueeze(dim=1)
                    loss = loss_fn(y_pred, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    if args.reg:
                        # Inverse transform to get RMSE
                        labels = y_scaler.inverse_transform(
                            labels.cpu().reshape(-1, 1))
                        y_pred = y_scaler.inverse_transform(
                            y_pred.detach().cpu().numpy().reshape(-1, 1))
                    else:
                        labels = labels.cpu().numpy()
                        y_pred = y_pred.detach().cpu().numpy()

                    # store labels and preds
                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=None)
                preds = np.concatenate(preds, axis=None)
                pearson, p = pearsonr(preds, labs)
                if args.reg:
                    mae = mean_absolute_error(labs, preds)
                    rmse = np.sqrt(mean_squared_error(labs, preds))
                    r2 = r2_score(labs, preds)  # y_true first, y_pred second
                else:
                    r2 = roc_auc_score(labs, preds)
                    precision, recall, thresholds = precision_recall_curve(
                        labs, preds)
                    rmse = auc(recall, precision)
                    mae = 0

                if args.reg:
                    writer.add_scalar('Loss/train', epoch_loss, epoch)
                    writer.add_scalar('RMSE/train', rmse, epoch)
                    writer.add_scalar('R2/train', r2, epoch)
                else:
                    writer.add_scalar('Loss/train', epoch_loss, epoch)
                    writer.add_scalar('ROC-AUC/train', r2, epoch)
                    writer.add_scalar('PRC-AUC/train', rmse, epoch)

                if epoch % 20 == 0:
                    if args.reg:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"RMSE: {rmse:.3f}, "
                              f"MAE: {mae:.3f}, "
                              f"rho: {pearson:.3f}, "
                              f"R2: {r2:.3f}")

                    else:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"ROC-AUC: {r2:.3f}, "
                              f"PRC-AUC: {rmse:.3f}, "
                              f"rho: {pearson:.3f}")
                epoch_losses.append(epoch_loss)
                epoch_rmses.append(rmse)

            # Discount the trial if the train R2 finishes very negative
            # (optimiser failure).

            if r2 < -1:
                skipped_trials += 1
                print('Skipped trials is {}'.format(skipped_trials))
                continue

            # Evaluate
            mpnn_net.eval()
            preds = []
            labs = []
            for bg, labels in test_loader:
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                labels = labels.unsqueeze(dim=1)

                if args.reg:
                    # Inverse transform to get RMSE
                    labels = y_scaler.inverse_transform(labels.cpu().reshape(
                        -1, 1))
                    y_pred = y_scaler.inverse_transform(
                        y_pred.detach().cpu().numpy().reshape(-1, 1))
                else:
                    labels = labels.cpu().numpy()
                    y_pred = y_pred.detach().cpu().numpy()
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=None)
            preds = np.concatenate(preds, axis=None)
            pearson, p = pearsonr(preds, labs)
            if args.reg:
                mae = mean_absolute_error(labs, preds)
                rmse = np.sqrt(mean_squared_error(labs, preds))
                r2 = r2_score(labs, preds)  # y_true first, y_pred second
                writer.add_scalar('RMSE/test', rmse)
                writer.add_scalar('R2/test', r2)
                print(
                    f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}'
                )
            else:
                r2 = roc_auc_score(labs, preds)
                precision, recall, thresholds = precision_recall_curve(
                    labs, preds)
                rmse = auc(recall, precision)
                mae = 0
                writer.add_scalar('ROC-AUC/test', r2)
                writer.add_scalar('PRC-AUC/test', rmse)
                print(
                    f'Test ROC-AUC: {r2:.3f}, PRC-AUC: {rmse:.3f}, rho: {pearson:.3f}'
                )

            r2_list.append(r2)
            rmse_list.append(rmse)
            mae_list.append(mae)
            j += 1

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)
    if args.reg:
        print("\nmean R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list),
            np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list),
            np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean MAE: {:.4f} +- {:.4f}\n".format(
            np.mean(mae_list),
            np.std(mae_list) / np.sqrt(len(mae_list))))
    else:
        print("mean ROC-AUC^2: {:.3f} +- {:.3f}".format(
            np.mean(r2_list),
            np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(rmse_list),
            np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("\nSkipped trials is {}".format(skipped_trials))
Example #4
# Imports assumed by this snippet (not shown in the original excerpt):
import sys

from mpi4py import MPI

# parse_dataset, return_borders and gen_e3fp_features are project-local
# helpers; the *_PATH constants other than MALARIA_PATH are defined elsewhere
# in the repository.

MALARIA_PATH = 'data/Malaria/Malaria.csv'

PATHS = {
    'FreeSolv': FREESOLV_PATH,
    'esol': ESOL_PATH,
    'lipo': LIPO_PATH,
    'dls': DLS_PATH,
    'CatS': CATS_PATH,
    'bradley': BRADLEY_PATH,
    'Malaria': MALARIA_PATH
}

task = sys.argv[1]
#TASK_NAME = 'FreeSolv'  # Change dataset. Options: ['ESOL', 'FreeSolv', 'dls', 'CEP', 'CatS', 'bradley', 'Malaria']

smiles_list, y = parse_dataset(task, PATHS[task])  # TODO: fix Malaria parsing

dat_size = len(smiles_list)

mpi_comm = MPI.COMM_WORLD
mpi_rank = mpi_comm.Get_rank()
mpi_size = mpi_comm.Get_size()

my_border_low, my_border_high = return_borders(mpi_rank, dat_size, mpi_size)

my_list = smiles_list[my_border_low:my_border_high]

bit_list = [512, 1024, 2048, 4096, 8192]
for bits in bit_list:
    my_db = gen_e3fp_features(my_list, mpi_rank, mpi_size, bits)
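
# `return_borders` is a project-local helper; a minimal sketch of the even data
# partitioning across MPI ranks it presumably performs:
import numpy as np


def return_borders(rank, dat_len, size):
    # Split range(dat_len) into `size` near-equal contiguous chunks and return
    # the [low, high) borders for the given rank.
    borders = np.linspace(0, dat_len, size + 1).astype(int)
    return borders[rank], borders[rank + 1]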