Example #1
    def _load(self, mol_to_graph, node_featurizer, edge_featurizer):
        # Reuse graphs, labels and SMILES cached by a previous preprocessing run
        if self.load:
            self.graphs, label_dict = load_graphs(
                osp.join(self.file_dir, "{}_graphs.bin".format(self.mode)))
            self.labels = label_dict['labels']
            with open(
                    osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)),
                    'r') as f:
                smiles_ = f.readlines()
                self.smiles = [s.strip() for s in smiles_]
        else:
            # Preprocess from scratch: read targets from CSV and build graphs with RDKit
            print('Start preprocessing dataset...')
            target_file = pathlib.Path(self.file_dir,
                                       "{}_target.csv".format(self.mode))
            self.target = pd.read_csv(
                target_file,
                index_col=0,
                usecols=[
                    'gdb_idx',
                ] + ['property_{:d}'.format(x) for x in range(12)])
            self.target = self.target[[
                'property_{:d}'.format(x) for x in range(12)
            ]]
            self.graphs, self.labels, self.smiles = [], [], []

            supp = Chem.SDMolSupplier(
                osp.join(self.file_dir, self.mode + ".sdf"))
            cnt = 0
            dataset_size = len(self.target)
            for mol, label in zip(supp, self.target.iterrows()):
                cnt += 1
                print('Processing molecule {:d}/{:d}'.format(
                    cnt, dataset_size))
                graph = mol_to_graph(mol,
                                     node_featurizer=node_featurizer,
                                     edge_featurizer=edge_featurizer)
                smiles = Chem.MolToSmiles(mol)
                self.smiles.append(smiles)
                self.graphs.append(graph)
                label = F.tensor(
                    np.array(label[1].tolist()).astype(np.float32))
                self.labels.append(label)

            save_graphs(osp.join(self.file_dir,
                                 "{}_graphs.bin".format(self.mode)),
                        self.graphs,
                        labels={'labels': F.stack(self.labels, dim=0)})
            with open(
                    osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)),
                    'w') as f:
                for s in self.smiles:
                    f.write(s + '\n')

        self.set_mean_and_std()
        print(len(self.graphs), "loaded!")
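
The method above caches its work with DGL's save_graphs/load_graphs so that later runs skip the RDKit pass entirely. A minimal sketch of that round-trip, with an illustrative file name and a toy graph standing in for the real molecule graphs:

import dgl
import torch
from dgl.data.utils import load_graphs, save_graphs

# Toy graph plus a (1, 12)-shaped label tensor, mimicking the 12 Alchemy targets.
g = dgl.graph(([0, 1], [1, 2]), num_nodes=3)
labels = torch.randn(1, 12)

# Persist the graph list together with a dict of label tensors ...
save_graphs("train_graphs.bin", [g], labels={'labels': labels})

# ... and restore both later without re-running the featurization.
graphs, label_dict = load_graphs("train_graphs.bin")
assert torch.equal(label_dict['labels'], labels)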
Example #2
def batcher_dev(batch):
    """Batch datapoints

    Parameters
    ----------
    batch : list
        batch[i][0] gives the DGLGraph for the ith datapoint,
        and batch[i][1] gives the label for the ith datapoint.

    Returns
    -------
    AlchemyBatcher
        An object holding the batch of data
    """
    graphs, labels = zip(*batch)
    batch_graphs = dgl.batch(graphs)
    labels = F.stack(labels, 0)

    return AlchemyBatcher(graph=batch_graphs, label=labels)
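
batcher_dev has the shape of a PyTorch collate_fn: it takes a list of (graph, label) pairs and merges them into a single batched DGLGraph plus a stacked label tensor. A usage sketch, assuming dataset yields (DGLGraph, label tensor) pairs and that AlchemyBatcher is a simple (graph, label) container as in the DGL Alchemy example:

from torch.utils.data import DataLoader

# dataset[i] is assumed to return (DGLGraph, label tensor) for the i-th molecule.
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=batcher_dev)

for batch in loader:
    bg, labels = batch.graph, batch.label  # one batched graph, (batch_size, n_tasks) labels
    # ... run the model on bg and compute the loss against labels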
Example #3
    def _load_data(self):
        # Reuse cached graphs, labels and masks if they were preprocessed before
        if self.load and self.preprocessed:
            self.data_list, label_dict = load_graphs(
                osp.join(self.data_path, f"{self.split}.bin"))
            all_label_list = label_dict['labels']
            all_mask_list = label_dict['masks']
            with open(osp.join(self.data_path, f'{self.split}_smiles.txt'),
                      'r') as f:
                smiles_ = f.readlines()
                smiles_list = [s.strip() for s in smiles_]
        else:
            print('preprocessing data ...')
            data_file = pathlib.Path(self.data_path, f"{self.split}.csv")
            all_data = pd.read_csv(data_file,
                                   usecols=['smiles'] + self.all_tasks)
            smiless = all_data['smiles'].values.tolist()
            targets = all_data[self.all_tasks]
            self.data_list, all_label_list, smiles_list = [], [], []
            all_mask_list, length_list = [], []
            for smiles, label in zip(smiless, targets.iterrows()):
                try:
                    mol = Chem.MolFromSmiles(smiles)
                    cano_smiles = Chem.MolToSmiles(mol)
                    length = F.tensor(
                        np.array(len(cano_smiles)).astype(np.int64))
                    data = smiles_to_bigraph(
                        cano_smiles,
                        node_featurizer=get_node_featurizer(),
                        edge_featurizer=None)

                    label = np.array(label[1].tolist())
                    # Missing targets are NaN: record them in a 0/1 mask and
                    # zero them out so they do not contribute to the loss.
                    mask = np.ones_like(label)
                    mask[np.isnan(label)] = 0
                    mask = F.tensor(mask.astype(np.float32))
                    label[np.isnan(label)] = 0
                    label = F.tensor(label.astype(np.float32))
                except Exception as e:
                    print(e)
                else:
                    self.data_list.append(data)
                    all_label_list.append(label)
                    all_mask_list.append(mask)
                    smiles_list.append(cano_smiles)
                    length_list.append(length)
            all_label_list = F.stack(all_label_list, dim=0)
            all_mask_list = F.stack(all_mask_list, dim=0)
            self.length_list = torch.stack(length_list)
            save_graphs(osp.join(self.data_path, f"{self.split}.bin"),
                        self.data_list,
                        labels={
                            'labels': all_label_list,
                            'masks': all_mask_list
                        })
            with open(osp.join(self.data_path, f"{self.split}_smiles.txt"),
                      'w') as f:
                for smiles in smiles_list:
                    f.write(smiles + '\n')
        label_list, mask_list = [], []
        for task in self.tasks:
            label_list.append(all_label_list[:, self.all_tasks.index(task)])
            mask_list.append(all_mask_list[:, self.all_tasks.index(task)])
        self.smiles_list = np.array(smiles_list)
        self.label_list = torch.stack(label_list, dim=-1)
        self.mask_list = torch.stack(mask_list, dim=-1)
        if len(self.tasks) == 1:
            # For a single task, drop molecules whose only label is missing
            remain = (self.mask_list == 1.0).squeeze(-1)
            self.label_list = self.label_list[remain]
            self.smiles_list = self.smiles_list[remain.numpy() == 1]
            self.data_list = np.array(
                self.data_list)[remain.numpy() == 1].tolist()
            self.mask_list = torch.ones_like(self.label_list)
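
The key step in the preprocessing branch is the label/mask pair: missing targets show up as NaN in the CSV, so a 0/1 mask records which entries are real and the NaNs are zeroed so they cannot poison the loss. A standalone sketch of just that step, with made-up numbers:

import numpy as np
import torch

# Two tasks for one molecule; the second target is missing in the CSV.
raw = np.array([0.73, np.nan])

mask = np.ones_like(raw)
mask[np.isnan(raw)] = 0    # 1 where a label exists, 0 where it is missing
raw[np.isnan(raw)] = 0     # zero the NaNs so downstream arithmetic stays finite

label = torch.tensor(raw, dtype=torch.float32)  # tensor([0.7300, 0.0000])
mask = torch.tensor(mask, dtype=torch.float32)  # tensor([1., 0.])

# Typical use: loss = (criterion(pred, label) * mask).sum() / mask.sum()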
Example #4
def alchemy_nodes(mol):
    """Featurization for all atoms in a molecule. The atom indices
    will be preserved.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object

    Returns
    -------
    atom_feats_dict : dict
        Dictionary for atom features
    """
    atom_feats_dict = defaultdict(list)
    is_donor = defaultdict(int)
    is_acceptor = defaultdict(int)

    fdef_name = osp.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    mol_featurizer = ChemicalFeatures.BuildFeatureFactory(fdef_name)
    mol_feats = mol_featurizer.GetFeaturesForMol(mol)
    mol_conformers = mol.GetConformers()
    assert len(mol_conformers) == 1

    for i in range(len(mol_feats)):
        if mol_feats[i].GetFamily() == 'Donor':
            node_list = mol_feats[i].GetAtomIds()
            for u in node_list:
                is_donor[u] = 1
        elif mol_feats[i].GetFamily() == 'Acceptor':
            node_list = mol_feats[i].GetAtomIds()
            for u in node_list:
                is_acceptor[u] = 1

    num_atoms = mol.GetNumAtoms()
    for u in range(num_atoms):
        atom = mol.GetAtomWithIdx(u)
        atom_type = atom.GetAtomicNum()
        num_h = atom.GetTotalNumHs()
        atom_feats_dict['node_type'].append(atom_type)

        h_u = []
        h_u += atom_type_one_hot(atom, ['H', 'C', 'N', 'O', 'F', 'S', 'Cl'])
        h_u.append(atom_type)
        h_u.append(is_acceptor[u])
        h_u.append(is_donor[u])
        h_u += atom_is_aromatic(atom)
        h_u += atom_hybridization_one_hot(atom, [
            Chem.rdchem.HybridizationType.SP,
            Chem.rdchem.HybridizationType.SP2,
            Chem.rdchem.HybridizationType.SP3
        ])
        h_u.append(num_h)
        atom_feats_dict['n_feat'].append(
            F.tensor(np.array(h_u).astype(np.float32)))

    atom_feats_dict['n_feat'] = F.stack(atom_feats_dict['n_feat'], dim=0)
    atom_feats_dict['node_type'] = F.tensor(
        np.array(atom_feats_dict['node_type']).astype(np.int64))

    return atom_feats_dict
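
alchemy_nodes asserts that the molecule carries exactly one conformer, so a bare MolFromSmiles result is not enough: hydrogens and a 3D embedding have to be added first. A usage sketch with an arbitrary SMILES, assuming the featurizer helpers used above (atom_type_one_hot and friends) are importable as in dgllife.utils:

from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles('CCO')
mol = Chem.AddHs(mol)       # add explicit hydrogens so H atoms are featurized too
AllChem.EmbedMolecule(mol)  # generates the single conformer the assert expects

feats = alchemy_nodes(mol)
print(feats['n_feat'].shape)   # (num_atoms, feature_dim)
print(feats['node_type'])      # atomic numbers, one entry per atom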