Example #1
def load_data(num):
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()
    trainmols, train_y = read_from_rdkit(num,0)
    testmols, test_y = read_from_rdkit(num,2)
    train_g = [smiles_to_bigraph(m, add_self_loop=False, node_featurizer=atom_featurizer,edge_featurizer=bond_featurizer) for m in trainmols]
    
    train_y = np.array(train_y, dtype=np.int64)
    train_y = OneHotEncoder(sparse=False).fit_transform(train_y.reshape(-1, 1))
    train_y = np.array(train_y, dtype=np.float32)
    print("Training set ",len(train_g))
    
    test_g = [smiles_to_bigraph(m, add_self_loop=False, node_featurizer=atom_featurizer,edge_featurizer=bond_featurizer) for m in testmols]
    test_y = np.array(test_y, dtype=np.int64)
    test_y = OneHotEncoder(sparse=False).fit_transform(test_y.reshape(-1, 1))
    test_y = np.array(test_y, dtype=np.float32)
    print("Test set",len(test_g))
    print("Data loaded.")

    return train_g, train_y, test_g, test_y
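The returned graph lists feed straight into a torch DataLoader with a small collate function; a minimal sketch (assuming dgl, numpy and torch are imported, and mirroring the collate helpers in the later examples):

import dgl
import numpy as np
import torch
from torch.utils.data import DataLoader

def collate(sample):
    # Batch the DGL graphs and stack the one-hot labels into one tensor.
    graphs, labels = map(list, zip(*sample))
    return dgl.batch(graphs), torch.tensor(np.stack(labels))

train_g, train_y, test_g, test_y = load_data(0)  # 0 is a hypothetical dataset index
train_loader = DataLoader(list(zip(train_g, train_y)), batch_size=32,
                          shuffle=True, collate_fn=collate)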
Example #2
def main():
    """Featurise the SMILES in data/sars_lip.csv in parallel over MPI ranks
    and gather the resulting graphs on rank 0."""
    mpi_comm = MPI.COMM_WORLD
    mpi_rank = mpi_comm.Get_rank()
    mpi_size = mpi_comm.Get_size()

    df = pd.read_csv('data/sars_lip.csv')
    smiles_list = df['smiles']

    my_border_low, my_border_high = return_borders(mpi_rank, len(smiles_list), mpi_size)

    my_smiles = smiles_list[my_border_low:my_border_high]
    my_mols = np.array([Chem.MolFromSmiles(m) for m in my_smiles])

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')

    my_graphs = np.array([mol_to_bigraph(m, node_featurizer=atom_featurizer,
                                         edge_featurizer=bond_featurizer) for m in my_mols])

    graphs = mpi_comm.gather(my_graphs, root=0)
    if mpi_rank == 0:
        X = np.concatenate(graphs)
        np.save('/rds-d2/user/wjm41/hpc-work/sars_graphs.npy', X)
        print('SAVED!')
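return_borders is an external helper that is not shown; a plausible sketch, assuming it evenly partitions the index range across MPI ranks (the real helper may differ):

def return_borders(rank, length, size):
    # Hypothetical even split of `length` items over `size` ranks;
    # the last rank absorbs the remainder.
    chunk = length // size
    low = rank * chunk
    high = length if rank == size - 1 else (rank + 1) * chunk
    return low, high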
Example #3
    def __CanonicalFeaturize(self, train_mols) -> list:
        atom_featurizer = CanonicalAtomFeaturizer("n_feat")
        bond_featurizer = CanonicalBondFeaturizer("e_feat")

        train_graph = [
            mol_to_bigraph(mol,
                           node_featurizer=atom_featurizer,
                           edge_featurizer=bond_featurizer)
            for mol in train_mols
        ]
        return train_graph
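The field names passed to the featurizers become the ndata/edata keys on each graph; a quick check, assuming train_graph is the non-empty list returned above:

g = train_graph[0]
print(g.ndata['n_feat'].shape)  # (num_atoms, atom_feat_dim)
print(g.edata['e_feat'].shape)  # (2 * num_bonds, bond_feat_dim)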
Example #4
def mol_to_graph(mols: list, canonical: bool = False) -> list:
    if canonical:
        graph = [
            mol_to_bigraph(
                mol,
                node_featurizer=CanonicalAtomFeaturizer(),
                edge_featurizer=CanonicalBondFeaturizer(),
            )
            for mol in mols
        ]
    else:
        graph = [mol_to_bigraph(m, node_featurizer=MyNodeFeaturizer()) for m in mols]

    return graph
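MyNodeFeaturizer is user-defined and not shown; a minimal sketch of the contract dgllife expects, namely a callable mapping an RDKit Mol to a dict of per-atom feature tensors (the real class may compute different features):

import torch

class MyNodeFeaturizer:
    """Hypothetical featurizer: atomic number and degree per atom."""
    def __call__(self, mol):
        feats = [[atom.GetAtomicNum(), atom.GetDegree()]
                 for atom in mol.GetAtoms()]
        return {'h': torch.tensor(feats, dtype=torch.float32)}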
Example #5
def moonshot():

    from dgllife.utils import mol_to_bigraph, CanonicalAtomFeaturizer
    import pandas as pd
    import os
    df = pd.read_csv(
        os.path.dirname(graca.data.collections.__file__) +
        "/covid_submissions_all_info.csv")
    df = df.dropna(subset=["f_avg_pIC50"]).reset_index(drop=True)

    from rdkit import Chem
    from rdkit.Chem import MCS

    ds = []
    for idx0, row0 in df.iterrows():
        smiles0 = row0["SMILES"]
        mol0 = Chem.MolFromSmiles(smiles0)
        for idx1, row1 in df.iloc[idx0 + 1:].iterrows():
            smiles1 = row1["SMILES"]
            mol1 = Chem.MolFromSmiles(smiles1)
            res = MCS.FindMCS([mol0, mol1])
            if res.numAtoms > 15:
                ds.append((
                    mol_to_bigraph(mol1,
                                   node_featurizer=CanonicalAtomFeaturizer(
                                       atom_data_field='feat')),
                    mol_to_bigraph(mol0,
                                   node_featurizer=CanonicalAtomFeaturizer(
                                       atom_data_field='feat')),
                    row1["f_avg_pIC50"],
                    row0["f_avg_pIC50"],
                ))

    ds_tr = ds[:500]
    ds_te = ds[500:]

    return ds_tr, ds_te
Example #6
File: point.py  Project: yuanqing-wang/malt
 def __init__(
     self,
     smiles: str,
     g: Union[dgl.DGLGraph, None] = None,
     y: Union[float, None] = None,
     extra: dict = {},
     featurizer: callable = functools.partial(
         smiles_to_bigraph,
         node_featurizer=CanonicalAtomFeaturizer(atom_data_field="h"),
     ),
 ) -> None:
     self.smiles = smiles
     self.g = g
     self.y = y
     self.extra = extra
     self.featurizer = featurizer
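A sketch of how this constructor is typically used, assuming the class is named Point as suggested by point.py: the graph is built lazily from the stored featurizer and cached on the instance.

p = Point(smiles='CCO')  # class name assumed
if p.g is None:
    p.g = p.featurizer(p.smiles)  # smiles_to_bigraph, atom features in field "h"
print(p.g.ndata['h'].shape)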
Example #7
def init_featurizer(args):
    """Initialize node/edge featurizer

    Parameters
    ----------
    args : dict
        Settings

    Returns
    -------
    args : dict
        Settings with featurizers updated
    """
    if args['model'] in [
            'gin_supervised_contextpred', 'gin_supervised_infomax',
            'gin_supervised_edgepred', 'gin_supervised_masking'
    ]:
        from dgllife.utils import PretrainAtomFeaturizer, PretrainBondFeaturizer
        args['atom_featurizer_type'] = 'pre_train'
        args['bond_featurizer_type'] = 'pre_train'
        args['node_featurizer'] = PretrainAtomFeaturizer()
        args['edge_featurizer'] = PretrainBondFeaturizer()

        return args

    if args['atom_featurizer_type'] == 'canonical':
        from dgllife.utils import CanonicalAtomFeaturizer
        args['node_featurizer'] = CanonicalAtomFeaturizer()
    elif args['atom_featurizer_type'] == 'attentivefp':
        from dgllife.utils import AttentiveFPAtomFeaturizer
        args['node_featurizer'] = AttentiveFPAtomFeaturizer()
    else:
        raise ValueError(
            "Expect node_featurizer to be in ['canonical', 'attentivefp'], "
            "got {}".format(args['atom_featurizer_type']))

    if args['model'] in ['Weave', 'MPNN', 'AttentiveFP']:
        if args['bond_featurizer_type'] == 'canonical':
            from dgllife.utils import CanonicalBondFeaturizer
            args['edge_featurizer'] = CanonicalBondFeaturizer()
        elif args['bond_featurizer_type'] == 'attentivefp':
            from dgllife.utils import AttentiveFPBondFeaturizer
            args['edge_featurizer'] = AttentiveFPBondFeaturizer()
    else:
        args['edge_featurizer'] = None

    return args
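A minimal driving sketch, using the same settings keys the function reads above:

args = {'model': 'MPNN',
        'atom_featurizer_type': 'canonical',
        'bond_featurizer_type': 'canonical'}
args = init_featurizer(args)
print(type(args['node_featurizer']).__name__)  # CanonicalAtomFeaturizer
print(type(args['edge_featurizer']).__name__)  # CanonicalBondFeaturizer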
Example #8
    def _make_atom_graph(self,
                         pdb_code=None,
                         pdb_path=None,
                         node_featurizer=None,
                         edge_featurizer=None,
                         graph_type='bigraph'):
        """
        Create atom-level graph from PDB structure

        :param graph_type:
        :param pdb_code:
        :param pdb_path:
        :param node_featurizer:
        :param edge_featurizer:
        :return:
        """

        if node_featurizer is None:
            node_featurizer = CanonicalAtomFeaturizer()
        if edge_featurizer is None:
            edge_featurizer = CanonicalBondFeaturizer()

        # Read in protein as mol, downloading the PDB file first if needed
        if pdb_code:
            pdb_path = self.pdb_dir + pdb_code + '.pdb'
            if not os.path.isfile(pdb_path):
                self._download_pdb(pdb_code)

        assert os.path.isfile(pdb_path)
        mol = MolFromPDBFile(pdb_path)

        # DGL mol to graph
        if graph_type == 'bigraph':
            g = mol_to_bigraph(mol,
                               node_featurizer=node_featurizer,
                               edge_featurizer=edge_featurizer)
        elif graph_type == 'complete':
            g = mol_to_complete_graph(
                mol,
                node_featurizer=node_featurizer,
            )
        elif graph_type == 'k_nn':
            raise NotImplementedError
        print(g)
        return g
Example #9
 def __init__(self,
              smiles,
              y,
              node_enc=CanonicalAtomFeaturizer(),
              edge_enc=None,
              graph_type=mol_to_bigraph,
              canonical_atom_order=False):
     super(graph_dataset, self).__init__()
     self.smiles = smiles
     self.y = y
     self.graph_type = graph_type
     self.node_enc = node_enc
     self.edge_enc = edge_enc
     self.canonical_atom_order = canonical_atom_order
     self.graphs = []
     self.labels = []
     self._generate()
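_generate is called but not shown; a plausible sketch given the stored fields (the real method may differ, e.g. in error handling):

def _generate(self):
    # Hypothetical: build one graph per SMILES with the stored settings.
    for smi, label in zip(self.smiles, self.y):
        mol = Chem.MolFromSmiles(smi)
        g = self.graph_type(mol,
                            node_featurizer=self.node_enc,
                            edge_featurizer=self.edge_enc,
                            canonical_atom_order=self.canonical_atom_order)
        self.graphs.append(g)
        self.labels.append(label)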
Example #10
def main(args):
    dataset = UnlabeledSMILES(args['smiles'],
                              node_featurizer=CanonicalAtomFeaturizer())
    dataloader = DataLoader(dataset,
                            batch_size=args['batch_size'],
                            collate_fn=collate_molgraphs_unlabeled)
    model = load_model(args).to(args['device'])
    checkpoint = torch.load(args['train_result_path'] + '/model.pth',
                            map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    smiles_list = []
    predictions = []

    with torch.no_grad():
        for batch_id, batch_data in enumerate(
                tqdm(dataloader, desc="Iteration")):
            batch_smiles, bg = batch_data
            smiles_list.extend(batch_smiles)
            batch_pred = predict(args, model, bg)
            if not args['soft_classification']:
                batch_pred = (batch_pred >= 0.5).float()
            predictions.append(batch_pred.detach().cpu())

    predictions = torch.cat(predictions, dim=0)

    output_data = {'canonical_smiles': smiles_list}
    if args['task_names'] is None:
        args['task_names'] = [
            'task_{:d}'.format(t) for t in range(1, args['n_tasks'] + 1)
        ]
    else:
        args['task_names'] = args['task_names'].split(',')
    for task_id, task_name in enumerate(args['task_names']):
        output_data[task_name] = predictions[:, task_id]
    df = pd.DataFrame(output_data)
    df.to_csv(args['inference_result_path'] + '/prediction.csv', index=False)
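collate_molgraphs_unlabeled comes from the accompanying utils module; a minimal sketch consistent with how it is unpacked above (a batch of SMILES plus one batched graph):

import dgl

def collate_molgraphs_unlabeled(data):
    # Hypothetical: each dataset item is a (smiles, graph) pair.
    smiles, graphs = map(list, zip(*data))
    return smiles, dgl.batch(graphs)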
Example #11
def main(args):
    """
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    df = pd.read_csv('data/covid_multitask_pIC50.smi')
    smiles_list = df['SMILES']
    y = df[['acry_class', 'chloro_class', 'rest_class', 'acry_reg', 'chloro_reg', 'rest_reg']].to_numpy()
    n_tasks = y.shape[1]
    class_inds = [0,1,2]
    reg_inds = [3,4,5]
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = np.array([mol_to_bigraph(m, node_featurizer=atom_featurizer, edge_featurizer=bond_featurizer) for m in X])

    r2_list = []
    rmse_list = []
    roc_list = []
    prc_list = []

    for i in range(args.n_trials):
        #kf = StratifiedKFold(n_splits=3, random_state=i, shuffle=True)
        #split_list = kf.split(X, y)

        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_set_size, random_state=i+5)
        X_train_acry, X_test_acry, \
        y_train_acry, y_test_acry = train_test_split(X[~np.isnan(y[:,0])], y[~np.isnan(y[:,0])],
                                                     stratify=y[:,0][~np.isnan(y[:,0])],
                                                     test_size=args.test_set_size, shuffle=True, random_state=i+5)
        X_train_chloro, X_test_chloro, \
        y_train_chloro, y_test_chloro = train_test_split(X[~np.isnan(y[:,1])], y[~np.isnan(y[:,1])],
                                                         stratify=y[:,1][~np.isnan(y[:,1])],
                                                         test_size=args.test_set_size, shuffle=True, random_state=i+5)
        X_train_rest, X_test_rest, \
        y_train_rest, y_test_rest = train_test_split(X[~np.isnan(y[:,2])], y[~np.isnan(y[:,2])],
                                                     stratify=y[:,2][~np.isnan(y[:,2])],
                                                     test_size=args.test_set_size, shuffle=True, random_state=i+5)

        X_train = np.concatenate([X_train_acry, X_train_chloro, X_train_rest])
        X_test = np.concatenate([X_test_acry, X_test_chloro, X_test_rest])

        y_train = np.concatenate([y_train_acry, y_train_chloro, y_train_rest])
        y_test = np.concatenate([y_test_acry, y_test_chloro, y_test_rest])

        writer = SummaryWriter('runs/multitask_pIC50/run_' + str(i))

        # writer = SummaryWriter('runs/multitask/run_' + str(i) + '_fold_' + str(j))


        y_train = torch.Tensor(y_train)
        y_test = torch.Tensor(y_test)

        train_data = list(zip(X_train, y_train))
        test_data = list(zip(X_test, y_test))

        train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate, drop_last=False)
        test_loader = DataLoader(test_data, batch_size=32, shuffle=True, collate_fn=collate, drop_last=False)

        process = Net(class_inds, reg_inds)
        process.to(device)

        mpnn_net = MPNNPredictor(node_in_feats=n_feats,
                                 edge_in_feats=e_feats,
                                 node_out_feats=128,
                                 n_tasks=n_tasks)
        mpnn_net.to(device)

        reg_loss_fn = MSELoss()
        class_loss_fn = BCELoss()

        optimizer = torch.optim.Adam(mpnn_net.parameters(), lr=0.001)

        for epoch in range(1, args.n_epochs+1):
            epoch_loss = 0
            preds = []
            labs = []
            mpnn_net.train()
            for batch_id, (bg, labels) in enumerate(train_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                #atom_feats, bond_feats, dcs, labels = atom_feats.to(device), bond_feats.to(device), dcs.to(device), labels.to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                y_pred = process(y_pred)
                loss = torch.tensor(0., device=device)
                for ind in reg_inds:
                    if len(labels[:,ind][~torch.isnan(labels[:,ind])])==0:
                        continue
                    loss = loss + reg_loss_fn(y_pred[:,ind][~torch.isnan(labels[:,ind])], labels[:,ind][~torch.isnan(labels[:,ind])])
                for ind in class_inds:
                    if len(labels[:,ind][~torch.isnan(labels[:,ind])])==0:
                        continue
                    loss = loss + class_loss_fn(y_pred[:,ind][~torch.isnan(labels[:,ind])], labels[:,ind][~torch.isnan(labels[:,ind])])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.detach().item()

                labels = labels.cpu().numpy()
                y_pred = y_pred.detach().cpu().numpy()


                # store labels and preds
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=0)
            preds = np.concatenate(preds, axis=0)
            rmses= []
            r2s = []
            rocs = []
            prcs = []
            for ind in reg_inds:
                rmse = np.sqrt(mean_squared_error(labs[:,ind][~np.isnan(labs[:,ind])],preds[:,ind][~np.isnan(labs[:,ind])]))
                r2 = r2_score(labs[:,ind][~np.isnan(labs[:,ind])],preds[:,ind][~np.isnan(labs[:,ind])])
                rmses.append(rmse)
                r2s.append(r2)

            for ind in class_inds:
                roc = roc_auc_score(labs[:,ind][~np.isnan(labs[:,ind])],
                                    preds[:,ind][~np.isnan(labs[:,ind])])
                precision, recall, thresholds = precision_recall_curve(labs[:,ind][~np.isnan(labs[:,ind])],
                                                                       preds[:,ind][~np.isnan(labs[:,ind])])
                prc = auc(recall, precision)
                rocs.append(roc)
                prcs.append(prc)

            writer.add_scalar('LOSS/train', epoch_loss, epoch)
            writer.add_scalar('train/acry_rocauc', rocs[0], epoch)
            writer.add_scalar('train/acry_prcauc', prcs[0], epoch)
            writer.add_scalar('train/chloro_rocauc', rocs[1], epoch)
            writer.add_scalar('train/chloro_prcauc', prcs[1], epoch)
            writer.add_scalar('train/rest_rocauc', rocs[2], epoch)
            writer.add_scalar('train/rest_prcauc', prcs[2], epoch)

            writer.add_scalar('train/acry_rmse', rmses[0], epoch)
            writer.add_scalar('train/acry_r2', r2s[0], epoch)
            writer.add_scalar('train/chloro_rmse', rmses[1], epoch)
            writer.add_scalar('train/chloro_r2', r2s[1], epoch)
            writer.add_scalar('train/rest_rmse', rmses[2], epoch)
            writer.add_scalar('train/rest_r2', r2s[2], epoch)

            if epoch % 20 == 0:
                print(f"\nepoch: {epoch}, "
                      f"LOSS: {epoch_loss:.3f}"
                      f"\n acry ROC-AUC: {rocs[0]:.3f}, "
                      f"acry PRC-AUC: {prcs[0]:.3f}"
                      f"\n chloro ROC-AUC: {rocs[1]:.3f}, "
                      f"chloro PRC-AUC: {prcs[1]:.3f}"
                      f"\n rest ROC-AUC: {rocs[2]:.3f}, "
                      f"rest PRC-AUC: {prcs[2]:.3f}"
                      f"\n acry R2: {r2s[0]:.3f}, "
                      f"acry RMSE: {rmses[0]:.3f}"
                      f"\n chloro R2: {r2s[1]:.3f}, "
                      f"chloro RMSE: {rmses[1]:.3f}"
                      f"\n rest R2: {r2s[2]:.3f}, "
                      f"rest RMSE: {rmses[2]:.3f}")

            # Evaluate
            mpnn_net.eval()
            preds = []
            labs = []
            for batch_id, (bg, labels) in enumerate(test_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                #atom_feats, bond_feats, labels = atom_feats.to(device), bond_feats.to(device), labels.to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                y_pred = process(y_pred)

                labels = labels.cpu().numpy()
                y_pred = y_pred.detach().cpu().numpy()

                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=0)
            preds = np.concatenate(preds, axis=0)
            rmses = []
            r2s = []
            rocs = []
            prcs = []
            for ind in reg_inds:

                rmse = np.sqrt(mean_squared_error(labs[:,ind][~np.isnan(labs[:,ind])],preds[:,ind][~np.isnan(labs[:,ind])]))

                r2 = r2_score(labs[:,ind][~np.isnan(labs[:,ind])],preds[:,ind][~np.isnan(labs[:,ind])])
                rmses.append(rmse)
                r2s.append(r2)
            for ind in class_inds:
                roc = roc_auc_score(labs[:, ind][~np.isnan(labs[:,ind])],
                                    preds[:, ind][~np.isnan(labs[:,ind])])
                precision, recall, thresholds = precision_recall_curve(labs[:, ind][~np.isnan(labs[:,ind])],
                                                                       preds[:, ind][~np.isnan(labs[:,ind])])
                prc = auc(recall, precision)
                rocs.append(roc)
                prcs.append(prc)
            writer.add_scalar('test/acry_rocauc', rocs[0], epoch)
            writer.add_scalar('test/acry_prcauc', prcs[0], epoch)
            writer.add_scalar('test/chloro_rocauc', rocs[1], epoch)
            writer.add_scalar('test/chloro_prcauc', prcs[1], epoch)
            writer.add_scalar('test/rest_rocauc', rocs[2], epoch)
            writer.add_scalar('test/rest_prcauc', prcs[2], epoch)

            writer.add_scalar('test/acry_rmse', rmses[0], epoch)
            writer.add_scalar('test/acry_r2', r2s[0], epoch)
            writer.add_scalar('test/chloro_rmse', rmses[1], epoch)
            writer.add_scalar('test/chloro_r2', r2s[1], epoch)
            writer.add_scalar('test/rest_rmse', rmses[2], epoch)
            writer.add_scalar('test/rest_r2', r2s[2], epoch)
            if epoch==(args.n_epochs):
                print(f"\n======================== TEST ========================"
                      f"\n acry ROC-AUC: {rocs[0]:.3f}, "
                      f"acry PRC-AUC: {prcs[0]:.3f}"
                      f"\n chloro ROC-AUC: {rocs[1]:.3f}, "
                      f"chloro PRC-AUC: {prcs[1]:.3f}"
                      f"\n rest ROC-AUC: {rocs[2]:.3f}, "
                      f"rest PRC-AUC: {prcs[2]:.3f}"
                      f"\n acry R2: {r2s[0]:.3f}, "
                      f"acry RMSE: {rmses[0]:.3f}"
                      f"\n chloro R2: {r2s[1]:.3f}, "
                      f"chloro RMSE: {rmses[1]:.3f}"
                      f"\n rest R2: {r2s[2]:.3f}, "
                      f"rest RMSE: {rmses[2]:.3f}")
                roc_list.append(rocs)
                prc_list.append(prcs)
                r2_list.append(r2s)
                rmse_list.append(rmses)
    roc_list = np.array(roc_list).T
    prc_list = np.array(prc_list).T
    r2_list = np.array(r2_list).T
    rmse_list = np.array(rmse_list).T
    print("\n ACRY")
    print("R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list[0]), np.std(r2_list[0])/np.sqrt(len(r2_list[0]))))
    print("RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list[0]), np.std(rmse_list[0])/np.sqrt(len(rmse_list[0]))))
    print("ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(roc_list[0]), np.std(roc_list[0]) / np.sqrt(len(roc_list[0]))))
    print("PRC-AUC: {:.3f} +- {:.3f}".format(np.mean(prc_list[0]), np.std(prc_list[0]) / np.sqrt(len(prc_list[0]))))
    print("\n CHLORO")
    print("R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list[1]), np.std(r2_list[1])/np.sqrt(len(r2_list[1]))))
    print("RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list[1]), np.std(rmse_list[1])/np.sqrt(len(rmse_list[1]))))
    print("ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(roc_list[1]), np.std(roc_list[1]) / np.sqrt(len(roc_list[1]))))
    print("PRC-AUC: {:.3f} +- {:.3f}".format(np.mean(prc_list[1]), np.std(prc_list[1]) / np.sqrt(len(prc_list[1]))))
    print("\n REST")
    print("R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list[2]), np.std(r2_list[2])/np.sqrt(len(r2_list[2]))))
    print("RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list[2]), np.std(rmse_list[2])/np.sqrt(len(rmse_list[2]))))
    print("ROC-AUC: {:.3f} +- {:.3f}".format(np.mean(roc_list[2]), np.std(roc_list[2]) / np.sqrt(len(roc_list[2]))))
    print("PRC-AUC: {:.3f} +- {:.3f}".format(np.mean(prc_list[2]), np.std(prc_list[2]) / np.sqrt(len(prc_list[2]))))
Example #12
def main(args):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    # data_loader = TaskDataLoader(args.task, args.path)
    # smiles_list, y = data_loader.load_property_data()

    smiles_list, y = parse_dataset(args.task, PATHS[args.task], args.reg)
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = [
        mol_to_bigraph(m,
                       node_featurizer=atom_featurizer,
                       edge_featurizer=bond_featurizer) for m in X
    ]

    r2_list = []
    rmse_list = []
    mae_list = []
    skipped_trials = 0

    for i in range(args.n_trials):

        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_set_size, random_state=i + 5)

        kf = StratifiedKFold(n_splits=args.n_folds,
                             random_state=i,
                             shuffle=True)
        split_list = kf.split(X, y)
        j = 0
        for train_ind, test_ind in split_list:
            if args.reg:
                writer = SummaryWriter('runs/' + args.task + '/mpnn/reg/run_' +
                                       str(i) + '_fold_' + str(j))
            else:
                writer = SummaryWriter('runs/' + args.task +
                                       '/mpnn/class/run_' + str(i) + '_fold_' +
                                       str(j))
            X_train, X_test = np.array(X)[train_ind], np.array(X)[test_ind]
            y_train, y_test = np.array(y)[train_ind], np.array(y)[test_ind]

            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            #  We standardise the outputs but leave the inputs unchanged
            if args.reg:
                y_scaler = StandardScaler()
                y_train_scaled = torch.Tensor(y_scaler.fit_transform(y_train))
                y_test_scaled = torch.Tensor(y_scaler.transform(y_test))
            else:
                y_train_scaled = torch.Tensor(y_train)
                y_test_scaled = torch.Tensor(y_test)

            train_data = list(zip(X_train, y_train_scaled))
            test_data = list(zip(X_test, y_test_scaled))

            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)
            test_loader = DataLoader(test_data,
                                     batch_size=32,
                                     shuffle=False,
                                     collate_fn=collate,
                                     drop_last=False)

            mpnn_net = MPNNPredictor(node_in_feats=n_feats,
                                     edge_in_feats=e_feats)
            mpnn_net.to(device)

            if args.reg:
                loss_fn = MSELoss()
            else:
                loss_fn = BCELoss()
            optimizer = torch.optim.Adam(mpnn_net.parameters(), lr=1e-4)

            mpnn_net.train()

            epoch_losses = []
            epoch_rmses = []
            for epoch in tqdm(range(1, args.n_epochs)):
                epoch_loss = 0
                preds = []
                labs = []
                for batch_id, (bg, labels) in tqdm(enumerate(train_loader)):
                    labels = labels.to(device)
                    atom_feats = bg.ndata.pop('h').to(device)
                    bond_feats = bg.edata.pop('e').to(device)
                    y_pred = mpnn_net(bg, atom_feats, bond_feats)
                    labels = labels.unsqueeze(dim=1)
                    loss = loss_fn(y_pred, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    if args.reg:
                        # Inverse transform to get RMSE
                        labels = y_scaler.inverse_transform(
                            labels.cpu().reshape(-1, 1))
                        y_pred = y_scaler.inverse_transform(
                            y_pred.detach().cpu().numpy().reshape(-1, 1))
                    else:
                        labels = labels.cpu().numpy()
                        y_pred = y_pred.detach().cpu().numpy()

                    # store labels and preds
                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=None)
                preds = np.concatenate(preds, axis=None)
                pearson, p = pearsonr(preds, labs)
                if args.reg:
                    mae = mean_absolute_error(labs, preds)
                    rmse = np.sqrt(mean_squared_error(labs, preds))
                    r2 = r2_score(labs, preds)  # r2_score is not symmetric: (y_true, y_pred)
                else:
                    # Note: in the classification branch the r2/rmse variables
                    # are reused to hold ROC-AUC and PRC-AUC respectively.
                    r2 = roc_auc_score(labs, preds)
                    precision, recall, thresholds = precision_recall_curve(
                        labs, preds)
                    rmse = auc(recall, precision)
                    mae = 0

                if args.reg:
                    writer.add_scalar('Loss/train', epoch_loss, epoch)
                    writer.add_scalar('RMSE/train', rmse, epoch)
                    writer.add_scalar('R2/train', r2, epoch)
                else:
                    writer.add_scalar('Loss/train', epoch_loss, epoch)
                    writer.add_scalar('ROC-AUC/train', r2, epoch)
                    writer.add_scalar('PRC-AUC/train', rmse, epoch)

                if epoch % 20 == 0:
                    if args.reg:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"RMSE: {rmse:.3f}, "
                              f"MAE: {mae:.3f}, "
                              f"rho: {pearson:.3f}, "
                              f"R2: {r2:.3f}")

                    else:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"ROC-AUC: {r2:.3f}, "
                              f"PRC-AUC: {rmse:.3f}, "
                              f"rho: {pearson:.3f}")
                epoch_losses.append(epoch_loss)
                epoch_rmses.append(rmse)

            # Discount trial if the train R2 finishes strongly negative (optimiser failure).
            if r2 < -1:
                skipped_trials += 1
                print('Skipped trials is {}'.format(skipped_trials))
                continue

            # Evaluate
            mpnn_net.eval()
            preds = []
            labs = []
            for batch_id, (bg, labels) in enumerate(test_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                labels = labels.unsqueeze(dim=1)

                if args.reg:
                    # Inverse transform to get RMSE
                    labels = y_scaler.inverse_transform(labels.cpu().reshape(
                        -1, 1))
                    y_pred = y_scaler.inverse_transform(
                        y_pred.detach().cpu().numpy().reshape(-1, 1))
                else:
                    labels = labels.cpu().numpy()
                    y_pred = y_pred.detach().cpu().numpy()
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=None)
            preds = np.concatenate(preds, axis=None)
            pearson, p = pearsonr(preds, labs)
            if args.reg:
                mae = mean_absolute_error(labs, preds)
                rmse = np.sqrt(mean_squared_error(labs, preds))
                r2 = r2_score(labs, preds)
                writer.add_scalar('RMSE/test', rmse)
                writer.add_scalar('R2/test', r2)
                print(
                    f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}'
                )
            else:
                r2 = roc_auc_score(labs, preds)
                precision, recall, thresholds = precision_recall_curve(
                    labs, preds)
                rmse = auc(recall, precision)
                mae = 0
                writer.add_scalar('ROC-AUC/test', r2)
                writer.add_scalar('PRC-AUC/test', rmse)
                print(
                    f'Test ROC-AUC: {r2:.3f}, PRC-AUC: {rmse:.3f}, rho: {pearson:.3f}'
                )

            r2_list.append(r2)
            rmse_list.append(rmse)
            mae_list.append(mae)
            j += 1

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)
    if args.reg:
        print("\nmean R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list),
            np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list),
            np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean MAE: {:.4f} +- {:.4f}\n".format(
            np.mean(mae_list),
            np.std(mae_list) / np.sqrt(len(mae_list))))
    else:
        print("mean ROC-AUC^2: {:.3f} +- {:.3f}".format(
            np.mean(r2_list),
            np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(rmse_list),
            np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("\nSkipped trials is {}".format(skipped_trials))
Example #13
    test_y = np.array(test_y, dtype=np.int64)
    print("Test set", len(test_g))
    print("Data loaded.")

    return train_g, train_y, test_g, test_y


def collate(sample):
    graphs, labels = map(list, zip(*sample))
    batched_graph = dgl.batch(graphs)
    batched_graph.set_n_initializer(dgl.init.zero_initializer)
    batched_graph.set_e_initializer(dgl.init.zero_initializer)
    return batched_graph, torch.tensor(labels)


atom_featurizer = CanonicalAtomFeaturizer()
bond_featurizer = CanonicalBondFeaturizer()


def featurize_edges(mol, add_self_loop=False):
    """Edge featurizer: topological (bond-path) distance for every ordered atom
    pair, intended for use with mol_to_complete_graph."""
    feats = []
    num_atoms = mol.GetNumAtoms()
    distance_matrix = Chem.GetDistanceMatrix(mol)
    for i in range(num_atoms):
        for j in range(num_atoms):
            if i != j or add_self_loop:
                feats.append(float(distance_matrix[i, j]))
    return {'e': torch.tensor(feats).reshape(-1, 1).float()}
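A sketch of plugging the custom edge featurizer in; since it features every ordered atom pair, it pairs with mol_to_complete_graph (assumed imported from dgllife.utils):

mol = Chem.MolFromSmiles('CCO')
g = mol_to_complete_graph(mol,
                          node_featurizer=atom_featurizer,
                          edge_featurizer=featurize_edges)
print(g.edata['e'].shape)  # (num_atoms * (num_atoms - 1), 1)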

Example #14
def _get_collection(collection):
    from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer
    ds = getattr(dgllife.data, collection)(smiles_to_bigraph,
                                           CanonicalAtomFeaturizer())
    return _dataset_from_dgllife(ds)
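Hypothetical usage; collection must name a dataset class in dgllife.data, e.g. Tox21:

ds = _get_collection('Tox21')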
Example #15
                        help='Print the training progress every X mini-batches')
    parser.add_argument('-p', '--result-path', type=str, default='classification_results',
                        help='Path to save training results (default: classification_results)')
    parser.add_argument('-ne', '--num-evals', type=int, default=None,
                        help='Number of trials for hyperparameter search (default: None)')
    args = vars(parser.parse_args())

    if torch.cuda.is_available():
        args['device'] = torch.device('cuda:0')
    else:
        args['device'] = torch.device('cpu')

    if args['task_names'] is not None:
        args['task_names'] = args['task_names'].split(',')

    args['node_featurizer'] = CanonicalAtomFeaturizer()
    df = pd.read_csv(args['csv_path'])
    mkdir_p(args['result_path'])
    dataset = MoleculeCSVDataset(df=df,
                                 smiles_to_graph=smiles_to_bigraph,
                                 node_featurizer=args['node_featurizer'],
                                 edge_featurizer=None,
                                 smiles_column=args['smiles_column'],
                                 cache_file_path=args['result_path'] + '/graph.bin',
                                 task_names=args['task_names'])
    args['n_tasks'] = dataset.n_tasks
    train_set, val_set, test_set = split_dataset(args, dataset)

    if args['num_evals'] is not None:
        assert args['num_evals'] > 0, 'Expect the number of hyperparameter search trials to ' \
                                      'be greater than 0, got {:d}'.format(args['num_evals'])
Example #16
def main(path, task, n_trials, test_set_size):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Collate Function for Dataloader
    def collate(sample):
        graphs, labels = map(list, zip(*sample))
        batched_graph = dgl.batch(graphs)
        batched_graph.set_n_initializer(dgl.init.zero_initializer)
        batched_graph.set_e_initializer(dgl.init.zero_initializer)
        return batched_graph, torch.tensor(labels)

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = [
        mol_to_bigraph(m,
                       node_featurizer=atom_featurizer,
                       edge_featurizer=bond_featurizer) for m in X
    ]

    r2_list = []
    rmse_list = []
    mae_list = []
    skipped_trials = 0

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i + 5)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        y_scaler = StandardScaler()
        y_train_scaled = torch.Tensor(y_scaler.fit_transform(y_train))
        y_test_scaled = torch.Tensor(y_scaler.transform(y_test))

        train_data = list(zip(X_train, y_train_scaled))
        test_data = list(zip(X_test, y_test_scaled))

        train_loader = DataLoader(train_data,
                                  batch_size=32,
                                  shuffle=True,
                                  collate_fn=collate,
                                  drop_last=False)
        test_loader = DataLoader(test_data,
                                 batch_size=32,
                                 shuffle=False,
                                 collate_fn=collate,
                                 drop_last=False)

        gat_net = GATPredictor(in_feats=n_feats)

        gat_net.to(device)

        loss_fn = MSELoss()
        optimizer = torch.optim.Adam(gat_net.parameters(), lr=0.001)

        gat_net.train()

        epoch_losses = []
        epoch_rmses = []
        for epoch in range(1, 201):
            epoch_loss = 0
            preds = []
            labs = []
            for batch_id, (bg, labels) in enumerate(train_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)  # popped but unused by the GAT
                y_pred = gat_net(bg, atom_feats)
                labels = labels.unsqueeze(dim=1)
                loss = loss_fn(y_pred, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.detach().item()

                # Inverse transform to get RMSE
                labels = y_scaler.inverse_transform(labels.cpu().reshape(-1, 1))
                y_pred = y_scaler.inverse_transform(
                    y_pred.detach().cpu().numpy().reshape(-1, 1))

                # store labels and preds
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=None)
            preds = np.concatenate(preds, axis=None)
            pearson, p = pearsonr(preds, labs)
            mae = mean_absolute_error(preds, labs)
            rmse = np.sqrt(mean_squared_error(preds, labs))
            r2 = r2_score(labs, preds)

            epoch_loss /= (batch_id + 1)
            if epoch % 20 == 0:
                print(f"epoch: {epoch}, "
                      f"LOSS: {epoch_loss:.3f}, "
                      f"RMSE: {rmse:.3f}, "
                      f"MAE: {mae:.3f}, "
                      f"R: {pearson:.3f}, "
                      f"R2: {r2:.3f}")
            epoch_losses.append(epoch_loss)
            epoch_rmses.append(rmse)

        # Discount trial if the train R2 finishes negative (optimiser failure).
        if r2 < 0:
            skipped_trials += 1
            print('Skipped trials is {}'.format(skipped_trials))
            continue

        # Evaluate
        gat_net.eval()
        preds = []
        labs = []
        for batch_id, (bg, labels) in enumerate(test_loader):
            labels = labels.to(device)
            atom_feats = bg.ndata.pop('h').to(device)
            bond_feats = bg.edata.pop('e').to(device)  # popped but unused by the GAT
            y_pred = gat_net(bg, atom_feats)
            labels = labels.unsqueeze(dim=1)

            # Inverse transform to get RMSE
            labels = y_scaler.inverse_transform(labels.cpu().reshape(-1, 1))
            y_pred = y_scaler.inverse_transform(
                y_pred.detach().cpu().numpy().reshape(-1, 1))

            preds.append(y_pred)
            labs.append(labels)

        labs = np.concatenate(labs, axis=None)
        preds = np.concatenate(preds, axis=None)

        pearson, p = pearsonr(preds, labs)
        mae = mean_absolute_error(preds, labs)
        rmse = np.sqrt(mean_squared_error(preds, labs))
        r2 = r2_score(labs, preds)

        r2_list.append(r2)
        rmse_list.append(rmse)
        mae_list.append(mae)

        print(
            f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}'
        )

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
    print("\nSkipped trials is {}".format(skipped_trials))
Example #17
def name2g(data_path, name):
    path = os.path.join(data_path, name + '.sdf')
    # Assumes a single-molecule SDF; with multiple records only the last graph is kept.
    for mol in Chem.SDMolSupplier(path):
        g = mol_to_bigraph(mol, node_featurizer=CanonicalAtomFeaturizer())
    return g
Example #18
def main(args):
    """
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    df = pd.read_csv('data/covid_multitask_HTS.smi')
    if args.dry:
        df = df[:2000]
    smiles_list = df['SMILES'].values
    y = df[[
        'acry_reg', 'chloro_reg', 'rest_reg', 'acry_class', 'chloro_class',
        'rest_class', 'activity'
    ]].to_numpy()
    n_tasks = y.shape[1]
    reg_inds = [0, 1, 2]
    class_inds = [3, 4, 5, 6]
    # print(smiles_list)
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = np.array([
        mol_to_bigraph(m,
                       node_featurizer=atom_featurizer,
                       edge_featurizer=bond_featurizer) for m in X
    ])

    r2_list = []
    rmse_list = []
    roc_list = []
    prc_list = []

    for i in range(args.n_trials):
        writer = SummaryWriter('runs/' + args.savename)

        if args.test:
            X_train_acry, X_test_acry, \
            y_train_acry, y_test_acry = train_test_split(X[~np.isnan(y[:,3])],
                                                         y[~np.isnan(y[:,3])], stratify=y[:,3][~np.isnan(y[:,3])],
                                                         test_size=args.test_set_size, shuffle=True, random_state=i+5)
            X_train_chloro, X_test_chloro, \
            y_train_chloro, y_test_chloro = train_test_split(X[~np.isnan(y[:,4])],
                                                             y[~np.isnan(y[:,4])], stratify=y[:,4][~np.isnan(y[:,4])],
                                                              test_size=args.test_set_size, shuffle=True, random_state=i+5)
            X_train_rest, X_test_rest, \
            y_train_rest, y_test_rest = train_test_split(X[~np.isnan(y[:,5])],
                                                         y[~np.isnan(y[:,5])], stratify=y[:,5][~np.isnan(y[:,5])],
                                                          test_size=args.test_set_size, shuffle=True, random_state=i+5)

            X_train = np.concatenate([
                X_train_acry, X_train_chloro, X_train_rest,
                X[~np.isnan(y[:, 6])]
            ])
            X_test = np.concatenate([X_test_acry, X_test_chloro, X_test_rest])
            y_train = np.concatenate([
                y_train_acry, y_train_chloro, y_train_rest,
                y[~np.isnan(y[:, 6])]
            ])
            y_test = np.concatenate([y_test_acry, y_test_chloro, y_test_rest])

            y_train = torch.Tensor(y_train)
            y_test = torch.Tensor(y_test)

            train_data = list(zip(X_train, y_train))
            test_data = list(zip(X_test, y_test))

            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)
            test_loader = DataLoader(test_data,
                                     batch_size=32,
                                     shuffle=True,
                                     collate_fn=collate,
                                     drop_last=False)
        else:
            y = torch.Tensor(y)
            train_data = list(zip(X, y))

            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)

        process = Net(class_inds, reg_inds)
        process = process.to(device)

        mpnn_net = MPNNPredictor(node_in_feats=n_feats,
                                 edge_in_feats=e_feats,
                                 node_out_feats=128,
                                 n_tasks=n_tasks)
        mpnn_net = mpnn_net.to(device)
        # try:
        #     mpnn_net.load_state_dict(torch.load('/rds-d2/user/wjm41/hpc-work/models/' + args.savename +
        #                        '/model_epoch_20.pt'))
        # except: pass
        reg_loss_fn = MSELoss()
        class_loss_fn = BCELoss()

        optimizer = torch.optim.Adam(mpnn_net.parameters(), lr=1e-4)

        for epoch in range(1, args.n_epochs + 1):
            epoch_loss = 0
            preds = []
            labs = []
            mpnn_net.train()
            for batch_id, (bg, labels) in enumerate(train_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                y_pred = process(y_pred)
                loss = torch.tensor(0., device=device)

                if args.debug:
                    print('label: {}'.format(labels))
                    print('y_pred: {}'.format(y_pred))
                for ind in reg_inds:
                    if len(labels[:, ind][~torch.isnan(labels[:, ind])]) == 0:
                        continue
                    loss = loss + reg_loss_fn(
                        y_pred[:, ind][~torch.isnan(labels[:, ind])],
                        labels[:, ind][~torch.isnan(labels[:, ind])])
                if args.debug:
                    print('reg loss: {}'.format(loss))
                for ind in class_inds:
                    if len(labels[:, ind][~torch.isnan(labels[:, ind])]) == 0:
                        continue
                    loss = loss + class_loss_fn(
                        y_pred[:, ind][~torch.isnan(labels[:, ind])],
                        labels[:, ind][~torch.isnan(labels[:, ind])])
                if args.debug:
                    print('class + reg loss: {}'.format(loss))
                optimizer.zero_grad()
                loss.backward()

                optimizer.step()
                epoch_loss += loss.detach().item()

                labels = labels.cpu().numpy()
                y_pred = y_pred.detach().cpu().numpy()

                # store labels and preds
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=0)
            preds = np.concatenate(preds, axis=0)
            rmses = []
            r2s = []
            rocs = []
            prcs = []
            for ind in reg_inds:
                rmse = np.sqrt(
                    mean_squared_error(labs[:, ind][~np.isnan(labs[:, ind])],
                                       preds[:, ind][~np.isnan(labs[:, ind])]))
                r2 = r2_score(labs[:, ind][~np.isnan(labs[:, ind])],
                              preds[:, ind][~np.isnan(labs[:, ind])])
                rmses.append(rmse)
                r2s.append(r2)

            for ind in class_inds:
                roc = roc_auc_score(labs[:, ind][~np.isnan(labs[:, ind])],
                                    preds[:, ind][~np.isnan(labs[:, ind])])
                precision, recall, thresholds = precision_recall_curve(
                    labs[:, ind][~np.isnan(labs[:, ind])],
                    preds[:, ind][~np.isnan(labs[:, ind])])
                prc = auc(recall, precision)
                rocs.append(roc)
                prcs.append(prc)

            writer.add_scalar('LOSS/train', epoch_loss, epoch)
            writer.add_scalar('train/acry_rocauc', rocs[0], epoch)
            writer.add_scalar('train/acry_prcauc', prcs[0], epoch)
            writer.add_scalar('train/chloro_rocauc', rocs[1], epoch)
            writer.add_scalar('train/chloro_prcauc', prcs[1], epoch)
            writer.add_scalar('train/rest_rocauc', rocs[2], epoch)
            writer.add_scalar('train/rest_prcauc', prcs[2], epoch)
            writer.add_scalar('train/HTS_rocauc', rocs[3], epoch)
            writer.add_scalar('train/HTS_prcauc', prcs[3], epoch)

            writer.add_scalar('train/acry_rmse', rmses[0], epoch)
            writer.add_scalar('train/acry_r2', r2s[0], epoch)
            writer.add_scalar('train/chloro_rmse', rmses[1], epoch)
            writer.add_scalar('train/chloro_r2', r2s[1], epoch)
            writer.add_scalar('train/rest_rmse', rmses[2], epoch)
            writer.add_scalar('train/rest_r2', r2s[2], epoch)

            if epoch % 20 == 0:
                print(f"\nepoch: {epoch}, "
                      f"LOSS: {epoch_loss:.3f}"
                      f"\n acry ROC-AUC: {rocs[0]:.3f}, "
                      f"acry PRC-AUC: {prcs[0]:.3f}"
                      f"\n chloro ROC-AUC: {rocs[1]:.3f}, "
                      f"chloro PRC-AUC: {prcs[1]:.3f}"
                      f"\n rest ROC-AUC: {rocs[2]:.3f}, "
                      f"rest PRC-AUC: {prcs[2]:.3f}"
                      f"\n HTS ROC-AUC: {rocs[3]:.3f}, "
                      f"HTS PRC-AUC: {prcs[3]:.3f}"
                      f"\n acry R2: {r2s[0]:.3f}, "
                      f"acry RMSE: {rmses[0]:.3f}"
                      f"\n chloro R2: {r2s[1]:.3f}, "
                      f"chloro RMSE: {rmses[1]:.3f}"
                      f"\n rest R2: {r2s[2]:.3f}, "
                      f"rest RMSE: {rmses[2]:.3f}")
                model_dir = '/rds-d2/user/wjm41/hpc-work/models/' + args.savename
                os.makedirs(model_dir, exist_ok=True)
                torch.save(mpnn_net.state_dict(),
                           model_dir + '/model_epoch_' + str(epoch) + '.pt')
            if args.test:
                # Evaluate
                mpnn_net.eval()
                preds = []
                labs = []
                for batch_id, (bg, labels) in enumerate(test_loader):
                    labels = labels.to(device)
                    atom_feats = bg.ndata.pop('h').to(device)
                    bond_feats = bg.edata.pop('e').to(device)
                    y_pred = mpnn_net(bg, atom_feats, bond_feats)
                    y_pred = process(y_pred)

                    labels = labels.cpu().numpy()
                    y_pred = y_pred.detach().cpu().numpy()

                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=0)
                preds = np.concatenate(preds, axis=0)
                rmses = []
                r2s = []
                rocs = []
                prcs = []
                for ind in reg_inds:

                    rmse = np.sqrt(
                        mean_squared_error(
                            labs[:, ind][~np.isnan(labs[:, ind])],
                            preds[:, ind][~np.isnan(labs[:, ind])]))

                    r2 = r2_score(labs[:, ind][~np.isnan(labs[:, ind])],
                                  preds[:, ind][~np.isnan(labs[:, ind])])
                    rmses.append(rmse)
                    r2s.append(r2)
                for ind in class_inds[:3]:
                    roc = roc_auc_score(labs[:, ind][~np.isnan(labs[:, ind])],
                                        preds[:, ind][~np.isnan(labs[:, ind])])
                    precision, recall, thresholds = precision_recall_curve(
                        labs[:, ind][~np.isnan(labs[:, ind])],
                        preds[:, ind][~np.isnan(labs[:, ind])])
                    prc = auc(recall, precision)
                    rocs.append(roc)
                    prcs.append(prc)
                writer.add_scalar('test/acry_rocauc', rocs[0], epoch)
                writer.add_scalar('test/acry_prcauc', prcs[0], epoch)
                writer.add_scalar('test/chloro_rocauc', rocs[1], epoch)
                writer.add_scalar('test/chloro_prcauc', prcs[1], epoch)
                writer.add_scalar('test/rest_rocauc', rocs[2], epoch)
                writer.add_scalar('test/rest_prcauc', prcs[2], epoch)

                writer.add_scalar('test/acry_rmse', rmses[0], epoch)
                writer.add_scalar('test/acry_r2', r2s[0], epoch)
                writer.add_scalar('test/chloro_rmse', rmses[1], epoch)
                writer.add_scalar('test/chloro_r2', r2s[1], epoch)
                writer.add_scalar('test/rest_rmse', rmses[2], epoch)
                writer.add_scalar('test/rest_r2', r2s[2], epoch)
                if epoch == args.n_epochs:
                    print(
                        f"\n======================== TEST ========================"
                        f"\n acry ROC-AUC: {rocs[0]:.3f}, "
                        f"acry PRC-AUC: {prcs[0]:.3f}"
                        f"\n chloro ROC-AUC: {rocs[1]:.3f}, "
                        f"chloro PRC-AUC: {prcs[1]:.3f}"
                        f"\n rest ROC-AUC: {rocs[2]:.3f}, "
                        f"rest PRC-AUC: {prcs[2]:.3f}"
                        f"\n acry R2: {r2s[0]:.3f}, "
                        f"acry RMSE: {rmses[0]:.3f}"
                        f"\n chloro R2: {r2s[1]:.3f}, "
                        f"chloro RMSE: {rmses[1]:.3f}"
                        f"\n rest R2: {r2s[2]:.3f}, "
                        f"rest RMSE: {rmses[2]:.3f}")
                    roc_list.append(rocs)
                    prc_list.append(prcs)
                    r2_list.append(r2s)
                    rmse_list.append(rmses)
        torch.save(
            mpnn_net.state_dict(), '/rds-d2/user/wjm41/hpc-work/models/' +
            args.savename + '/model_epoch_final.pt')
    if args.test:
        roc_list = np.array(roc_list).T
        prc_list = np.array(prc_list).T
        r2_list = np.array(r2_list).T
        rmse_list = np.array(rmse_list).T
        print("\n ACRY")
        print("R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list[0]),
            np.std(r2_list[0]) / np.sqrt(len(r2_list[0]))))
        print("RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list[0]),
            np.std(rmse_list[0]) / np.sqrt(len(rmse_list[0]))))
        print("ROC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(roc_list[0]),
            np.std(roc_list[0]) / np.sqrt(len(roc_list[0]))))
        print("PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(prc_list[0]),
            np.std(prc_list[0]) / np.sqrt(len(prc_list[0]))))
        print("\n CHLORO")
        print("R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list[1]),
            np.std(r2_list[1]) / np.sqrt(len(r2_list[1]))))
        print("RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list[1]),
            np.std(rmse_list[1]) / np.sqrt(len(rmse_list[1]))))
        print("ROC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(roc_list[1]),
            np.std(roc_list[1]) / np.sqrt(len(roc_list[1]))))
        print("PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(prc_list[1]),
            np.std(prc_list[1]) / np.sqrt(len(prc_list[1]))))
        print("\n REST")
        print("R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list[2]),
            np.std(r2_list[2]) / np.sqrt(len(r2_list[2]))))
        print("RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list[2]),
            np.std(rmse_list[2]) / np.sqrt(len(rmse_list[2]))))
        print("ROC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(roc_list[2]),
            np.std(roc_list[2]) / np.sqrt(len(roc_list[2]))))
        print("PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(prc_list[2]),
            np.std(prc_list[2]) / np.sqrt(len(prc_list[2]))))
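
The repeated mean +- standard-error blocks above could be collapsed into a small helper; a minimal sketch (the helper name print_summary and the (n_targets, n_runs) list layout are assumptions, not part of the original script):

def print_summary(name, values):
    # Mean +/- standard error of the mean over independent runs
    values = np.asarray(values)
    print("{}: {:.4f} +- {:.4f}".format(
        name, values.mean(), values.std() / np.sqrt(len(values))))

for idx, target in enumerate(['ACRY', 'CHLORO', 'REST']):
    print('\n ' + target)
    print_summary('R^2', r2_list[idx])
    print_summary('RMSE', rmse_list[idx])
    print_summary('ROC-AUC', roc_list[idx])
    print_summary('PRC-AUC', prc_list[idx])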
Example #19
def main(path, task, graph_type):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param graph_type: str. either 'bigraph' or 'complete'
    """

    data_loader = TaskDataLoader(task, path)
    X, y = data_loader.load_property_data()
    X = [Chem.MolFromSmiles(m) for m in X]

    # Collate Function for Dataloader
    def collate(sample):
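        # 'sample' is a list of (graph, label) pairs; batch the graphs into one DGLGraph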
        graphs, labels = map(list, zip(*sample))
        batched_graph = dgl.batch(graphs)
        batched_graph.set_n_initializer(dgl.init.zero_initializer)
        batched_graph.set_e_initializer(dgl.init.zero_initializer)
        return batched_graph, torch.tensor(labels)

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X_full, _, y_full, _ = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            random_state=30)
    y_full = y_full.reshape(-1, 1)

    #  We standardise the outputs but leave the inputs unchanged

    y_scaler = StandardScaler()
    y_full = torch.Tensor(y_scaler.fit_transform(y_full))

    # Set up cross-validation splits

    n_splits = 5
    kf = KFold(n_splits=n_splits)

    X_train_splits = []
    y_train_splits = []
    X_val_splits = []
    y_val_splits = []

    for train_index, test_index in kf.split(X_full):
        X_train, X_val = np.array(X_full)[train_index], np.array(
            X_full)[test_index]
        y_train, y_val = y_full[train_index], y_full[test_index]
        # Create graphs and labels
        if graph_type == 'complete':
            X_train = [
                mol_to_complete_graph(m, node_featurizer=atom_featurizer)
                for m in X_train
            ]
            X_val = [
                mol_to_complete_graph(m, node_featurizer=atom_featurizer)
                for m in X_val
            ]
        elif graph_type == 'bigraph':
            X_train = [
                mol_to_bigraph(m, node_featurizer=atom_featurizer)
                for m in X_train
            ]
            X_val = [
                mol_to_bigraph(m, node_featurizer=atom_featurizer)
                for m in X_val
            ]
        X_train_splits.append(X_train)
        X_val_splits.append(X_val)
        y_train_splits.append(y_train)
        y_val_splits.append(y_val)

    def lognuniform(low=1, high=5, size=None, base=10):
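        """Sample log-uniformly from [base**-high, base**-low], i.e. 1e-5 to 1e-1 by default."""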
        return np.power(base, -np.random.uniform(low, high, size))

    best_rmse = np.inf

    for i in range(1000):

        num_layers = np.random.randint(1, 4)
        classifier_hidden_feats = np.random.randint(1, 128)
        hidden_feats = [np.random.choice([16, 32, 64])] * num_layers
        dropout = [np.random.uniform(0, 0.5)] * num_layers
        batchnorm = [np.random.choice([True, False])] * num_layers
        learning_rate = lognuniform()

        param_set = {
            'num_layers': num_layers,
            'classifier_hidden_feats': classifier_hidden_feats,
            'hidden_feats': hidden_feats,
            'dropout': dropout,
            'batchnorm': batchnorm,
            'lr': learning_rate
        }

        print(f'\nParameter set in trial {i} is \n')
        print(param_set)
        print('\n')

        cv_rmse_list = []

        for j in range(n_splits):

            X_train = X_train_splits[j]
            y_train = y_train_splits[j]
            X_val = X_val_splits[j]
            y_val = y_val_splits[j]

            train_data = list(zip(X_train, y_train))
            test_data = list(zip(X_val, y_val))

            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)
            test_loader = DataLoader(test_data,
                                     batch_size=32,
                                     shuffle=False,
                                     collate_fn=collate,
                                     drop_last=False)

            gcn_net = GCNPredictor(
                in_feats=n_feats,
                hidden_feats=hidden_feats,
                batchnorm=batchnorm,
                dropout=dropout,
                classifier_hidden_feats=classifier_hidden_feats,
            )
            gcn_net.to(device)

            loss_fn = MSELoss()
            optimizer = torch.optim.Adam(gcn_net.parameters(),
                                         lr=learning_rate)

            gcn_net.train()

            epoch_losses = []
            epoch_rmses = []
            for epoch in range(1, 501):
                epoch_loss = 0
                preds = []
                labs = []
                for batch_idx, (bg, labels) in enumerate(train_loader):
                    bg = bg.to(device)  # move the batched graph to the model's device (DGL >= 0.5)
                    labels = labels.to(device)
                    atom_feats = bg.ndata.pop('h').to(device)
                    y_pred = gcn_net(bg, atom_feats)
                    labels = labels.unsqueeze(dim=1)
                    loss = loss_fn(y_pred, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    # Inverse transform to get RMSE
                    labels = y_scaler.inverse_transform(
                        labels.cpu().numpy().reshape(-1, 1))
                    y_pred = y_scaler.inverse_transform(
                        y_pred.detach().cpu().numpy().reshape(-1, 1))
                    # store labels and preds
                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=None)
                preds = np.concatenate(preds, axis=None)
                pearson, p = pearsonr(labs, preds)
                mae = mean_absolute_error(labs, preds)
                rmse = np.sqrt(mean_squared_error(labs, preds))
                # r2_score is asymmetric: pass (y_true, y_pred) in that order
                r2 = r2_score(labs, preds)

                epoch_loss /= (batch_idx + 1)
                if epoch % 20 == 0:
                    print(f"epoch: {epoch}, "
                          f"LOSS: {epoch_loss:.3f}, "
                          f"RMSE: {rmse:.3f}, "
                          f"MAE: {mae:.3f}, "
                          f"R: {pearson:.3f}, "
                          f"R2: {r2:.3f}")
                epoch_losses.append(epoch_loss)
                epoch_rmses.append(rmse)

            # Evaluate
            gcn_net.eval()
            preds = []
            labs = []
            for bg, labels in test_loader:
                bg = bg.to(device)  # move the batched graph to the model's device (DGL >= 0.5)
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                y_pred = gcn_net(bg, atom_feats)
                labels = labels.unsqueeze(dim=1)

                # Inverse transform to get RMSE
                labels = y_scaler.inverse_transform(
                    labels.cpu().numpy().reshape(-1, 1))
                y_pred = y_scaler.inverse_transform(
                    y_pred.detach().cpu().numpy().reshape(-1, 1))

                preds.append(y_pred)
                labs.append(labels)

            preds = np.concatenate(preds, axis=None)
            labs = np.concatenate(labs, axis=None)

            pearson, p = pearsonr(labs, preds)
            mae = mean_absolute_error(labs, preds)
            rmse = np.sqrt(mean_squared_error(labs, preds))
            cv_rmse_list.append(rmse)
            # r2_score is asymmetric: pass (y_true, y_pred) in that order
            r2 = r2_score(labs, preds)

            print(
                f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}'
            )

        param_rmse = np.mean(cv_rmse_list)
        if param_rmse < best_rmse:
            best_rmse = param_rmse
            best_params = param_set

    print('Best RMSE and best params \n')
    print(best_rmse)
    print(best_params)
    # np.savetxt cannot serialise a dict; np.save (which pickles objects) can
    np.save('saved_hypers/GCN.npy', best_params)
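
A hypothetical invocation of the function above (the dataset path below is an assumption, not from the source):

if __name__ == '__main__':
    main(path='data/photoswitches.csv', task='e_iso_pi', graph_type='bigraph')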
Example #20
df_train = pd.read_csv('reddb_reaction_train.csv')
# df = pd.read_csv('reddb-smiles.csv')
train = df_train.head(1000)
train_smiles = train['reactantSmiles']
train_y = torch.tensor(train['reactionEnergy'].values).reshape(-1, 1).float()

if torch.cuda.is_available():
    print('use GPU')
    device = 'cuda'
else:
    print('use CPU')
    device = 'cpu'

train_mols = [Chem.MolFromSmiles(s) for s in train_smiles]

atom_featurizer = CanonicalAtomFeaturizer(atom_data_field='h')
n_feats = atom_featurizer.feat_size('h')
bond_featurizer = CanonicalBondFeaturizer(bond_data_field='h')
b_feat = bond_featurizer.feat_size('h')

train_graph = [mol_to_bigraph(mol, node_featurizer=atom_featurizer,
                              edge_featurizer=bond_featurizer) for mol in train_mols]


ncls = 1
model = GCNPredictor(in_feats=n_feats,
                     hidden_feats=[60,20],
                     n_tasks=ncls,
                     predictor_dropout=0.2)
#model = AttentiveFPGNN(n_feats,b_feat,2,200)
model = model.to(device)
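
The snippet stops after moving the model to the device; a minimal full-batch training sketch under the same setup (the epoch count and learning rate are assumptions, not from the source):

import dgl
import torch.nn.functional as F

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
bg = dgl.batch(train_graph).to(device)  # one batched graph for all 1000 molecules
feats = bg.ndata['h']                   # atom features stored under 'h' above
labels = train_y.to(device)

model.train()
for epoch in range(100):
    optimizer.zero_grad()
    pred = model(bg, feats)             # shape (1000, 1), matching train_y
    loss = F.mse_loss(pred, labels)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'epoch {epoch}: MSE {loss.item():.4f}')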
Example #21
GCN_Tox21 = {
    'random_seed': 2,
    'batch_size': 128,
    'lr': 1e-3,
    'num_epochs': 100,
    'node_data_field': 'h',
    'frac_train': 0.8,
    'frac_val': 0.1,
    'frac_test': 0.1,
    'in_feats': 74,
    'gcn_hidden_feats': [64, 64],
    'classifier_hidden_feats': 64,
    'patience': 10,
    'smiles_to_graph': smiles_to_bigraph,
    'node_featurizer': CanonicalAtomFeaturizer(),
    'metric_name': 'roc_auc_score'
}
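
As a hedged illustration, such a config dict could be consumed like the sketch below (the 12-task Tox21 setting and the helper name build_from_config are assumptions, not from the source):

from dgllife.model import GCNPredictor

def build_from_config(config, n_tasks=12):
    # Field names taken from the GCN_Tox21 dict above
    return GCNPredictor(in_feats=config['in_feats'],
                        hidden_feats=config['gcn_hidden_feats'],
                        classifier_hidden_feats=config['classifier_hidden_feats'],
                        n_tasks=n_tasks)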

GAT_Tox21 = {
    'random_seed': 2,
    'batch_size': 128,
    'lr': 1e-3,
    'num_epochs': 100,
    'node_data_field': 'h',
    'frac_train': 0.8,
    'frac_val': 0.1,
    'frac_test': 0.1,
    'in_feats': 74,
    'gat_hidden_feats': [32, 32],
    'classifier_hidden_feats': 64,
}

Example #22
from dgllife.utils import CanonicalAtomFeaturizer, CanonicalBondFeaturizer, smiles_to_bigraph

from data.util import is_valid_smiles

_atom_featurizer = CanonicalAtomFeaturizer(atom_data_field="x")
_bond_featurizer = CanonicalBondFeaturizer(bond_data_field="w")


class MoleculeRepresentations(object):
    def __init__(self, smi, graph):
        self.smi = smi
        self.graph = graph


def smi2mol_repr(smi):
    if not is_valid_smiles(smi):
        return None

    graph = smiles_to_bigraph(smi,
                              node_featurizer=_atom_featurizer,
                              edge_featurizer=_bond_featurizer)
    return MoleculeRepresentations(smi, graph)


def smi2molgraph(smi):
    graph = smiles_to_bigraph(smi,
                              node_featurizer=_atom_featurizer,
                              edge_featurizer=_bond_featurizer)
    return graph
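
Example usage of the helpers above (the SMILES string is arbitrary):

if __name__ == "__main__":
    mol_repr = smi2mol_repr("CCO")  # returns None for invalid SMILES
    if mol_repr is not None:
        g = mol_repr.graph
        print(g.num_nodes(), g.ndata["x"].shape, g.edata["w"].shape)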