Пример #1
0
def get_convmol_features(protein_pdb_file,
                         ligand_pdb_file,
                         data_dir='./data/pdbbind/v2018'):
    """Compute graph-convolution inputs for a protein/ligand pair.

    Both file names are joined onto ``data_dir``. The two structures are
    loaded with ``get_molecules_from_pdb``, the third item it returns is
    turned into a graph, wrapped in a ``ConvMol`` and agglomerated as a
    batch of one.

    Returns
    -------
    tuple
        ``(node_feat, deg_slice, membership, deg_adj_list)`` read from the
        agglomerated ``ConvMol``; ``membership`` is converted to a numpy
        array.

    Raises
    ------
    IOError
        If either joined path does not exist (protein is checked first).
    """
    protein_pdb_file = os.path.join(data_dir, protein_pdb_file)
    ligand_pdb_file = os.path.join(data_dir, ligand_pdb_file)

    # Fail early, before any parsing, on the first missing input.
    for pdb_path in (protein_pdb_file, ligand_pdb_file):
        if not os.path.exists(pdb_path):
            raise IOError(".pdb file not found in " + pdb_path)

    _, _, compl = get_molecules_from_pdb(protein_pdb_file, ligand_pdb_file)

    # Same preparation as the default generator of
    # deepchem.models.graph_models.GraphConv.
    nodes, adj_list = build_graph_from_molecule(compl)
    convmol = ConvMol(nodes, adj_list)
    merged = convmol.agglomerate_mols([convmol])

    node_feat = merged.get_atom_features()
    deg_slice = merged.deg_slice
    membership = np.array(merged.membership)
    deg_adj_list = merged.get_deg_adjacency_lists()
    return (node_feat, deg_slice, membership, deg_adj_list)
Пример #2
0
    def test_agglomerate_molecules(self):
        """Test ConvMol.agglomerate_mols on three hand-built molecules.

        The molecules have 3, 4 and 5 atoms. The assertions check the atom
        and molecule counts of the merged object, selected rows of its atom
        feature matrix, and its adjacency lists grouped by degree.
        """
        molecules = []

        #### First example molecule
        # Artificial feature array.
        atom_features = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
        adj_list = [[1], [0, 2], [1]]
        molecules.append(ConvMol(atom_features, adj_list))

        #### Second example molecule
        atom_features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                                  [28, 29, 30, 31], [32, 33, 34, 35]])
        adj_list = [[1, 2], [0, 3], [0, 3], [1, 2]]
        molecules.append(ConvMol(atom_features, adj_list))

        ### Third example molecule
        atom_features = np.array([[40, 41, 42, 43], [44, 45, 46, 47],
                                  [48, 49, 50, 51], [52, 53, 54, 55],
                                  [56, 57, 58, 59]])
        adj_list = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
        molecules.append(ConvMol(atom_features, adj_list))

        # Test agglomerate molecule method
        concat_mol = ConvMol.agglomerate_mols(molecules)

        assert concat_mol.get_num_atoms() == 12
        assert concat_mol.get_num_molecules() == 3

        # The expected rows below put the three degree-1 atoms first
        # (rows 0-2), the eight degree-2 atoms next and the single degree-3
        # atom last (row 11).
        atom_features = concat_mol.get_atom_features()
        assert np.array_equal(atom_features[0, :], [1, 2, 3, 4])
        assert np.array_equal(atom_features[2, :], [56, 57, 58, 59])
        assert np.array_equal(atom_features[11, :], [52, 53, 54, 55])
        assert np.array_equal(atom_features[4, :], [20, 21, 22, 23])

        deg_adj_lists = concat_mol.get_deg_adjacency_lists()
        # No atoms of degree 0
        assert np.array_equal(deg_adj_lists[0], np.zeros([0, 0]))
        # 3 atoms of degree 1
        assert np.array_equal(deg_adj_lists[1], [[3], [3], [11]])
        # 8 atoms of degree 2
        assert np.array_equal(deg_adj_lists[2],
                              [[0, 1], [5, 6], [4, 7], [4, 7], [5, 6], [9, 10],
                               [8, 11], [8, 11]])
        # 1 atom of degree 3
        assert np.array_equal(deg_adj_lists[3], [[9, 10, 2]])
        # 0 atoms of degree 4
        assert np.array_equal(deg_adj_lists[4], np.zeros([0, 4]))
        # 0 atoms of degree 5
        assert np.array_equal(deg_adj_lists[5], np.zeros([0, 5]))
Пример #3
0
 def test_get_adjacency_list(self):
   """Check that get_adjacency_list returns neighbors in the reordered indexing."""
   features = np.array([
       [40, 41, 42, 43],
       [44, 45, 46, 47],
       [48, 49, 50, 51],
       [52, 53, 54, 55],
       [56, 57, 58, 59],
   ])
   input_adj_list = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
   conv_mol = ConvMol(features, input_adj_list)

   # Atoms are sorted by degree, so original atom 4 moves to position 0 and
   # original atoms 0-3 shift to positions 1-4. Every neighbor index in the
   # expected list is rewritten with that same mapping.
   expected_adj_list = [[4], [2, 3], [1, 4], [1, 4], [2, 3, 0]]
   assert conv_mol.get_adjacency_list() == expected_adj_list
Пример #4
0
 def test_get_adjacency_list(self):
     """Verify the adjacency list ConvMol reports after degree sorting."""
     # Five atoms with four features each, holding the values 40..59.
     feature_matrix = np.arange(40, 60).reshape(5, 4)
     neighbors = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
     molecule = ConvMol(feature_matrix, neighbors)

     # Degree sorting places the original atoms in the order 4, 0, 1, 2, 3.
     # Old index -> new index is therefore 4->0, 0->1, 1->2, 2->3, 3->4, and
     # the returned neighbor lists must use the new indices.
     reordered = [
         [4],
         [2, 3],
         [1, 4],
         [1, 4],
         [2, 3, 0],
     ]
     assert molecule.get_adjacency_list() == reordered
Пример #5
0
  def batch_to_feed_dict(self, batch):
    """Build a tensorflow feed_dict from a batch of ConvMol objects.

    The molecules are merged with ``ConvMol.agglomerate_mols`` and the merged
    graph data is keyed by this object's placeholder tensors.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    merged_mol = ConvMol.agglomerate_mols(batch)
    atom_feats = merged_mol.get_atom_features()

    # Only degrees 1..max_deg are fed; degree 0 is skipped.
    adj_by_degree = []
    for degree in range(1, self.max_deg + 1):
      adj_by_degree.append(merged_mol.deg_adj_lists[degree])

    adj_feed = dict(zip(self.deg_adj_lists_placeholders, adj_by_degree))
    atom_feed = {
        self.atom_features_placeholder: atom_feats,
        self.deg_slice_placeholder: merged_mol.deg_slice,
        self.membership_placeholder: merged_mol.membership,
    }
    return merge_dicts([atom_feed, adj_feed])
Пример #6
0
  def _featurize(self, mol):
    """Convert ``mol`` into a ConvMol (atom feature matrix plus adjacency list).

    Each atom's feature row is the concatenation of ``atom_features`` and
    ``self._get_atom_properties``. When ``self.master_atom`` is set, an extra
    row holding the mean of all atom rows is appended and every real atom
    gets that extra row's index added to its neighbor list.
    """
    # Pair every atom index with its feature vector.
    indexed_features = []
    for atom in mol.GetAtoms():
      atom_idx = atom.GetIdx()
      features = np.concatenate(
          (atom_features(atom, use_chirality=self.use_chirality),
           self._get_atom_properties(atom)))
      indexed_features.append((atom_idx, features))

    # Order rows by atom index so they match the rdkit ordering.
    indexed_features.sort()
    _, feature_rows = list(zip(*indexed_features))

    nodes = np.vstack(feature_rows)
    if self.master_atom:
      mean_row = np.mean(nodes, axis=0)
      master_atom_features = np.expand_dims(mean_row, axis=0)
      nodes = np.concatenate([nodes, master_atom_features], axis=0)

    # Each bond is listed once here; both directions are added below.
    bonds = []
    for bond in mol.GetBonds():
      bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

    n_nodes = len(nodes)
    canon_adj_list = [[] for _ in range(n_nodes)]
    for begin, end in bonds:
      canon_adj_list[begin].append(end)
      canon_adj_list[end].append(begin)

    if self.master_atom:
      # The master atom is the last row; its own neighbor list stays empty.
      fake_atom_index = n_nodes - 1
      for real_atom in range(fake_atom_index):
        canon_adj_list[real_atom].append(fake_atom_index)

    return ConvMol(nodes, canon_adj_list)
Пример #7
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       mode='fit',
                       deterministic=True,
                       pad_batches=True):
     """Yield ``(inputs, [labels], [weights])`` triples for graph-conv batches.

     For every batch from ``dataset.iterbatches`` the ConvMol objects are
     agglomerated and flattened into a list: atom features, degree slice,
     membership, sample count, a dropout switch, then the adjacency lists for
     every degree except 0.

     Parameters
     ----------
     dataset
         Object providing ``iterbatches(batch_size, deterministic,
         pad_batches)``.
     epochs : int
         Number of passes over ``dataset``.
     mode : str
         ``'predict'`` sets the dropout switch to 0.0; any other value sets
         it to 1.0. This is separate from ``self.mode``, which decides
         whether labels are one-hot encoded.
     deterministic, pad_batches
         Forwarded to ``dataset.iterbatches``.
     """
     for _ in range(epochs):
         for X_b, y_b, w_b, _ids_b in dataset.iterbatches(
                 batch_size=self.batch_size,
                 deterministic=deterministic,
                 pad_batches=pad_batches):
             if self.mode == 'classification':
                 y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                     -1, self.n_tasks, self.n_classes)
             multiConvMol = ConvMol.agglomerate_mols(X_b)
             n_samples = np.array(X_b.shape[0])
             dropout = np.array(0.0 if mode == 'predict' else 1.0)
             inputs = [
                 multiConvMol.get_atom_features(), multiConvMol.deg_slice,
                 np.array(multiConvMol.membership), n_samples, dropout
             ]
             # Fetch the adjacency lists once per batch rather than once per
             # degree; the degree-0 entry (index 0) is not fed.
             deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
             for degree in range(1, len(deg_adj_lists)):
                 inputs.append(deg_adj_lists[degree])
             yield (inputs, [y_b], [w_b])
Пример #8
0
  def batch_to_feed_dict(self, batch):
    """Turn an array of ConvMol objects into a tensorflow feed_dict.

    All molecules are agglomerated into one ConvMol, whose atom features,
    degree slice, membership and per-degree adjacency lists are mapped onto
    the corresponding placeholders of this object.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    agglomerated = ConvMol.agglomerate_mols(batch)
    atoms = agglomerated.get_atom_features()

    # Adjacency lists for degrees 1..max_deg, in placeholder order.
    degrees = range(1, self.max_deg+1)
    per_degree = [agglomerated.deg_adj_lists[deg] for deg in degrees]

    adjacency_feed = {
        placeholder: adj
        for placeholder, adj in zip(self.deg_adj_lists_placeholders, per_degree)
    }
    atoms_feed = {
        self.atom_features_placeholder: atoms,
        self.deg_slice_placeholder: agglomerated.deg_slice,
        self.membership_placeholder: agglomerated.membership,
    }
    return merge_dicts([atoms_feed, adjacency_feed])
Пример #9
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Yield one feed dict per batch of ``dataset``.

   Each dict maps every entry of ``self.my_labels`` to its label column
   (one-hot for classification, a trailing unit axis for regression), plus
   the task weights, atom features, degree slice, membership and the
   adjacency lists of every degree except 0.

   Parameters
   ----------
   dataset
       Object providing ``iterbatches(batch_size, pad_batches,
       deterministic)``.
   epochs : int
       Number of passes over ``dataset``.
   predict : bool
       When false, a progress line is printed at the start of each epoch.
   deterministic, pad_batches
       Forwarded to ``dataset.iterbatches``. ``pad_batches`` used to be
       ignored (always ``True``); it is now honored, and its default keeps
       the old behavior.
   """
   for epoch in range(epochs):
     if not predict:
       print('Starting epoch %i' % epoch)
     for X_b, y_b, w_b, _ids_b in dataset.iterbatches(
         self.batch_size, pad_batches=pad_batches,
         deterministic=deterministic):
       d = {}
       for index, label in enumerate(self.my_labels):
         if self.mode == 'classification':
           d[label] = to_one_hot(y_b[:, index])
         if self.mode == 'regression':
           d[label] = np.expand_dims(y_b[:, index], -1)
       d[self.my_task_weights] = w_b
       multiConvMol = ConvMol.agglomerate_mols(X_b)
       d[self.atom_features] = multiConvMol.get_atom_features()
       d[self.degree_slice] = multiConvMol.deg_slice
       d[self.membership] = multiConvMol.membership
       # Fetch the adjacency lists once per batch; self.deg_adjs[i - 1] holds
       # the placeholder for degree i (degree 0 is not fed).
       deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
       for i in range(1, len(deg_adj_lists)):
         d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
       yield d
Пример #10
0
  def test_graph_gather(self):
    """Smoke-test GraphGather on a two-molecule batch and check its output shape."""
    batch_size = 2
    n_features = 75
    # In CCC and C, there are 4 atoms in total.
    smiles_batch = ['CCC', 'C']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smiles) for smiles in smiles_batch]
    conv_mols = ConvMolFeaturizer().featurize(rdkit_mols)
    multi_mol = ConvMol.agglomerate_mols(conv_mols)
    atom_features = multi_mol.get_atom_features()
    degree_slice = multi_mol.deg_slice
    membership = multi_mol.membership
    # The degree-0 adjacency list is not passed to the layer.
    deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
      gather_inputs = [
          tf.convert_to_tensor(atom_features, dtype=tf.float32),
          tf.convert_to_tensor(degree_slice, dtype=tf.int32),
          tf.convert_to_tensor(membership, dtype=tf.int32),
      ]
      gather_inputs += [
          tf.convert_to_tensor(deg_adj, dtype=tf.int32) for deg_adj in deg_adjs
      ]
      gathered = GraphGather(batch_size)(*gather_inputs)
      sess.run(tf.global_variables_initializer())
      result = gathered.eval()
      # TODO(rbharath): Why is it 2*n_features instead of n_features?
      assert result.shape == (batch_size, 2 * n_features)
Пример #11
0
def test_mol_ordering():
    """Check how agglomerated atoms are ordered by degree and by molecule.

    Each atom's degree is prepended as an extra first feature column before
    agglomeration. Two things are then checked for every degree block:
    molecule memberships are non-decreasing, and the prepended degree column
    equals the block's degree.
    """
    mols = get_molecules()
    featurizer = ConvMolFeaturizer()
    featurized_mols = featurizer.featurize(mols)
    for mol in featurized_mols:
        degree_column = np.expand_dims(mol.degree_list, axis=1)
        mol.atom_features = np.concatenate(
            [degree_column, mol.atom_features], axis=1)

    conv_mol = ConvMol.agglomerate_mols(featurized_mols)
    deg_slice = conv_mol.deg_slice.tolist()

    # NOTE(review): each deg_slice row is read as (start, size), not
    # (start, end); the null-molecule test expects [[0, 1], [1, 1], [2, 1],
    # ...] for one atom per degree. Slicing [start:size] would skip most of
    # the higher-degree blocks.
    for start, size in deg_slice:
        members = np.array(conv_mol.membership[start:start + size])
        assert np.all(np.sort(members) == members)

    conv_mol_atom_features = conv_mol.get_atom_features()

    # Row i of deg_slice describes the atoms of degree i (counting from 0).
    for degree, (start, size) in enumerate(deg_slice):
        adj_number_array = conv_mol_atom_features[start:start + size, 0]
        assert np.all(adj_number_array == degree)
Пример #12
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
     """Yield one feed dict per batch of ``dataset`` for ``epochs`` passes.

     Labels are stored under ``self.labels[0]`` (one-hot and reshaped to
     ``(-1, n_tasks, n_classes)`` in classification mode, unchanged
     otherwise) and weights under ``self.task_weights[0]``. The agglomerated
     ConvMol supplies atom features, degree slice, membership and the
     adjacency lists of every degree except 0. ``predict`` is accepted but
     not used here.
     """
     for _ in range(epochs):
         batches = dataset.iterbatches(self.batch_size,
                                       pad_batches=pad_batches,
                                       deterministic=deterministic)
         for X_b, y_b, w_b, _ids_b in batches:
             feed = {}
             if self.mode != 'classification':
                 batch_labels = y_b
             else:
                 one_hot = to_one_hot(y_b.flatten(), self.n_classes)
                 batch_labels = one_hot.reshape(-1, self.n_tasks,
                                                self.n_classes)
             feed[self.labels[0]] = batch_labels
             feed[self.task_weights[0]] = w_b

             merged = ConvMol.agglomerate_mols(X_b)
             feed[self.atom_features] = merged.get_atom_features()
             feed[self.degree_slice] = merged.deg_slice
             feed[self.membership] = merged.membership
             # self.deg_adjs[k] receives the adjacency list of degree k + 1.
             for degree in range(1, len(merged.get_deg_adjacency_lists())):
                 feed[self.deg_adjs[degree - 1]] = (
                     merged.get_deg_adjacency_lists()[degree])
             yield feed
Пример #13
0
 def test_construct_conv_mol(self):
     """Smoke test: building a ConvMol from a three-atom chain must not raise."""
     # Three atoms with four made-up features each.
     features = np.array([
         [1, 2, 3, 4],
         [5, 6, 7, 8],
         [9, 10, 11, 12],
     ])
     # Atom 1 is bonded to atoms 0 and 2.
     neighbors = [[1], [0, 2], [1]]
     ConvMol(features, neighbors)
Пример #14
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Generate feed dicts, one per batch, over ``epochs`` passes of ``dataset``.

   In classification mode the labels are one-hot encoded and reshaped to
   ``(-1, n_tasks, n_classes)``; otherwise they are passed through. Graph
   inputs come from agglomerating the batch's ConvMol objects. ``predict``
   is accepted but not used here.
   """
   for _epoch in range(epochs):
     for X_b, y_b, w_b, _ids_b in dataset.iterbatches(
         self.batch_size,
         pad_batches=pad_batches,
         deterministic=deterministic):
       if self.mode == 'classification':
         one_hot = to_one_hot(y_b.flatten(), self.n_classes)
         batch_labels = one_hot.reshape(-1, self.n_tasks, self.n_classes)
       else:
         batch_labels = y_b

       feed_dict = {}
       feed_dict[self.labels[0]] = batch_labels
       feed_dict[self.task_weights[0]] = w_b

       mol_batch = ConvMol.agglomerate_mols(X_b)
       feed_dict[self.atom_features] = mol_batch.get_atom_features()
       feed_dict[self.degree_slice] = mol_batch.deg_slice
       feed_dict[self.membership] = mol_batch.membership
       # Degree 0 has no placeholder: self.deg_adjs starts at degree 1.
       for deg in range(1, len(mol_batch.get_deg_adjacency_lists())):
         feed_dict[self.deg_adjs[deg - 1]] = (
             mol_batch.get_deg_adjacency_lists()[deg])
       yield feed_dict
Пример #15
0
    def test_graph_gather(self):
        """Invoke GraphGather on featurized 'CCC' and 'C' and check the shape."""
        batch_size = 2
        n_features = 75
        # In CCC and C, there are 4 atoms
        raw_smiles = ['CCC', 'C']

        rdkit_mols = []
        for smiles in raw_smiles:
            rdkit_mols.append(rdkit.Chem.MolFromSmiles(smiles))
        featurizer = ConvMolFeaturizer()
        multi_mol = ConvMol.agglomerate_mols(featurizer.featurize(rdkit_mols))

        atom_features = multi_mol.get_atom_features()
        degree_slice = multi_mol.deg_slice
        membership = multi_mol.membership
        # Everything but the degree-0 adjacency list is fed to the layer.
        deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

        def as_int_tensor(values):
            return tf.convert_to_tensor(values, dtype=tf.int32)

        with self.session() as sess:
            atom_tensor = tf.convert_to_tensor(atom_features,
                                               dtype=tf.float32)
            slice_tensor = as_int_tensor(degree_slice)
            membership_tensor = as_int_tensor(membership)
            adj_tensors = [as_int_tensor(deg_adj) for deg_adj in deg_adjs]

            layer_args = [atom_tensor, slice_tensor, membership_tensor]
            layer_args = layer_args + adj_tensors
            out_tensor = GraphGather(batch_size)(*layer_args)
            sess.run(tf.global_variables_initializer())
            out_value = out_tensor.eval()
            # TODO(rbharath): Why is it 2*n_features instead of n_features?
            assert out_value.shape == (batch_size, 2 * n_features)
Пример #16
0
    def test_conv_mol_deg_slice(self):
        """Check get_deg_slice for a molecule whose four atoms all have degree 2."""
        features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                             [28, 29, 30, 31], [32, 33, 34, 35]])
        ring_adj_list = [[1, 2], [0, 3], [0, 3], [1, 2]]
        mol = ConvMol(features, ring_adj_list)

        # One row per degree 0..6; only the degree-2 row is non-empty.
        expected = np.array([
            [0, 0],  # 0 atoms of degree 0
            [0, 0],  # 0 atoms of degree 1
            [0, 4],  # 4 atoms of degree 2
            [0, 0],  # 0 atoms of degree 3
            [0, 0],  # 0 atoms of degree 4
            [0, 0],  # 0 atoms of degree 5
            [0, 0],  # 0 atoms of degree 6
        ])
        assert np.array_equal(mol.get_deg_slice(), expected)
Пример #17
0
 def test_get_atom_features(self):
     """Check that get_atom_features returns the rows sorted by atom degree."""
     features = np.array([
         [40, 41, 42, 43],
         [44, 45, 46, 47],
         [48, 49, 50, 51],
         [52, 53, 54, 55],
         [56, 57, 58, 59],
     ])
     # Neighbor counts from the adjacency list below:
     #   atom 4 has 1 neighbor
     #   atoms 0, 1 and 2 have 2 neighbors each
     #   atom 3 has 3 neighbors
     neighbors = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
     mol = ConvMol(features, neighbors)

     # Sorted by degree, atom 4's row comes first and atom 3's row last.
     expected = np.array([
         [56, 57, 58, 59],
         [40, 41, 42, 43],
         [44, 45, 46, 47],
         [48, 49, 50, 51],
         [52, 53, 54, 55],
     ])
     assert np.array_equal(mol.get_atom_features(), expected)
Пример #18
0
 def test_get_atom_features(self):
   """Verify that ConvMol reorders atom feature rows by atom degree."""
   original_rows = np.array([[40, 41, 42, 43], [44, 45, 46, 47],
                             [48, 49, 50, 51], [52, 53, 54, 55],
                             [56, 57, 58, 59]])
   canon_adj_list = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
   conv_mol = ConvMol(original_rows, canon_adj_list)

   # Degrees per the adjacency list: atom 4 -> 1, atoms 0-2 -> 2, atom 3 -> 3.
   # The expected row order is therefore original atoms 4, 0, 1, 2, 3.
   degree_sorted_order = [4, 0, 1, 2, 3]
   expected_rows = original_rows[degree_sorted_order]
   assert np.array_equal(conv_mol.get_atom_features(), expected_rows)
Пример #19
0
  def test_load_pretrained_subclassed_model(self):
    """Check that load_from_pretrained copies GraphConvModel variables.

    A source model is fitted on three tiny molecules. A second model with the
    same architecture then loads from it with ``include_top=False``. All
    trainable variables except the last two must match.
    """
    from rdkit import Chem

    def model_kwargs():
      # Built fresh for each model so no argument object is shared.
      return dict(
          n_tasks=len(bi_tasks),
          graph_conv_layers=[128, 128],
          dense_layer_size=512,
          dropout=0,
          mode='regression',
          learning_rate=0.001,
          batch_size=8)

    bi_tasks = ['a', 'b']
    y = np.ones((3, 2))
    smiles = ['C', 'CC', 'CCC']
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    X = dc.feat.ConvMolFeaturizer().featurize(mols)
    dataset = dc.data.NumpyDataset(X, y, ids=smiles)

    source_model = dc.models.GraphConvModel(model_dir="model", **model_kwargs())
    source_model.fit(dataset)

    dest_model = dc.models.GraphConvModel(**model_kwargs())

    # One padded batch supplies the example inputs passed to
    # load_from_pretrained.
    batch_iter = dataset.iterbatches(
        batch_size=8, deterministic=True, pad_batches=True)
    X_b, _y_b, _w_b, _ids_b = next(batch_iter)
    merged = ConvMol.agglomerate_mols(X_b)
    n_samples = np.array(X_b.shape[0])
    inputs = [
        merged.get_atom_features(),
        merged.deg_slice,
        np.array(merged.membership),
        n_samples,
    ]
    for degree in range(1, len(merged.get_deg_adjacency_lists())):
      inputs.append(merged.get_deg_adjacency_lists()[degree])

    dest_model.load_from_pretrained(
        source_model=source_model,
        assignment_map=None,
        value_map=None,
        include_top=False,
        inputs=inputs)

    # The last two trainable variables are left out of the comparison.
    source_vars = source_model.model.trainable_variables[:-2]
    dest_vars = dest_model.model.trainable_variables[:-2]
    assert len(source_vars) == len(dest_vars)

    for source_var, dest_var in zip(source_vars, dest_vars):
      np.testing.assert_array_almost_equal(source_var.numpy(),
                                           dest_var.numpy())
Пример #20
0
  def test_conv_mol_deg_slice(self):
    """Verify the degree slice of a four-atom molecule with only degree-2 atoms."""
    atom_features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                              [28, 29, 30, 31], [32, 33, 34, 35]])
    adj_list = [[1, 2], [0, 3], [0, 3], [1, 2]]
    mol = ConvMol(atom_features, adj_list)

    # Seven rows, one per degree 0..6. Every row is [0, 0] except degree 2,
    # which holds all 4 atoms.
    expected_slice = np.zeros((7, 2), dtype=int)
    expected_slice[2] = [0, 4]

    assert np.array_equal(mol.get_deg_slice(), expected_slice)
Пример #21
0
    def data_generator(self, dataset, batch_size:int, epochs=1):
        """Yield a feed dict for every padded, deterministic batch of ``dataset``.

        Labels are the one-hot encoding of the first label column. Graph
        inputs (atom features, degree slice, membership, adjacency lists of
        every degree except 0) come from agglomerating the batch's ConvMol
        objects.
        """
        for _ in range(epochs):
            batches = dataset.iterbatches(batch_size, pad_batches=True, deterministic=True)
            for X, y, w, _idx in batches:
                feed = {self.label: to_one_hot(y[:, 0]), self.weight: w}
                merged = ConvMol.agglomerate_mols(X)
                feed[self.atom_features] = merged.get_atom_features()
                feed[self.indexing] = merged.deg_slice
                feed[self.membership] = merged.membership
                adjacency = merged.get_deg_adjacency_lists()
                # self.deg_adj_list[k] is the placeholder for degree k + 1.
                for degree in range(1, len(adjacency)):
                    feed[self.deg_adj_list[degree - 1]] = adjacency[degree]

                yield feed
Пример #22
0
  def test_agglomerate_molecules(self):
    """Test ConvMol.agglomerate_mols on three hand-built molecules.

    The molecules have 3, 4 and 5 atoms. The assertions check the atom and
    molecule counts of the merged object, selected rows of its atom feature
    matrix, and its adjacency lists grouped by degree.
    """
    molecules = []

    #### First example molecule
    # Artificial feature array.
    atom_features = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    adj_list = [[1], [0, 2], [1]]
    molecules.append(ConvMol(atom_features, adj_list))

    #### Second example molecule
    atom_features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                              [28, 29, 30, 31], [32, 33, 34, 35]])
    adj_list = [[1, 2], [0, 3], [0, 3], [1, 2]]
    molecules.append(ConvMol(atom_features, adj_list))

    ### Third example molecule
    atom_features = np.array([[40, 41, 42, 43], [44, 45, 46, 47],
                              [48, 49, 50, 51], [52, 53, 54, 55],
                              [56, 57, 58, 59]])
    adj_list = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
    molecules.append(ConvMol(atom_features, adj_list))

    # Test agglomerate molecule method
    concat_mol = ConvMol.agglomerate_mols(molecules)

    assert concat_mol.get_num_atoms() == 12
    assert concat_mol.get_num_molecules() == 3

    # The expected rows below put the three degree-1 atoms first (rows 0-2),
    # the eight degree-2 atoms next and the single degree-3 atom last (row 11).
    atom_features = concat_mol.get_atom_features()
    assert np.array_equal(atom_features[0, :], [1, 2, 3, 4])
    assert np.array_equal(atom_features[2, :], [56, 57, 58, 59])
    assert np.array_equal(atom_features[11, :], [52, 53, 54, 55])
    assert np.array_equal(atom_features[4, :], [20, 21, 22, 23])

    deg_adj_lists = concat_mol.get_deg_adjacency_lists()
    # No atoms of degree 0
    assert np.array_equal(deg_adj_lists[0], np.zeros([0, 0]))
    # 3 atoms of degree 1
    assert np.array_equal(deg_adj_lists[1], [[3], [3], [11]])
    # 8 atoms of degree 2
    assert np.array_equal(
        deg_adj_lists[2],
        [[0, 1], [5, 6], [4, 7], [4, 7], [5, 6], [9, 10], [8, 11], [8, 11]])
    # 1 atom of degree 3
    assert np.array_equal(deg_adj_lists[3], [[9, 10, 2]])
    # 0 atoms of degree 4
    assert np.array_equal(deg_adj_lists[4], np.zeros([0, 4]))
    # 0 atoms of degree 5
    assert np.array_equal(deg_adj_lists[5], np.zeros([0, 5]))
Пример #23
0
    def test_null_conv_mol(self):
        """Check the adjacency lists and degree slice of ConvMol.get_null_mol.

        The assertions assume the null molecule covers degrees 0 through 10
        with exactly one atom per degree.
        """
        n_features = 4
        null_mol = ConvMol.get_null_mol(n_features)
        adjacency_by_degree = null_mol.get_deg_adjacency_lists()

        # Every atom lists only itself as neighbor, once per unit of degree.
        highest_degree = 10
        assert np.array_equal(adjacency_by_degree[highest_degree],
                              [[highest_degree] * highest_degree])
        assert np.array_equal(adjacency_by_degree[1], [[1]])

        # One atom per degree: row d of the slice is [d, 1].
        one_atom_per_degree = [[degree, 1] for degree in range(11)]
        assert np.array_equal(null_mol.get_deg_slice(), one_atom_per_degree)
Пример #24
0
 def feed_dict_generator(dataset, batch_size, epochs=1):
   """Yield a feed dict for each padded batch of ``dataset``.

   The placeholders (``labels``, ``task_weights``, ``atom_features``,
   ``degree_slice``, ``membership``, ``deg_adjs``) are taken from the
   enclosing scope. Each label column is one-hot encoded. Graph inputs come
   from agglomerating the batch's ConvMol objects.
   """
   for _ in range(epochs):
     for X_b, y_b, w_b, _ids_b in dataset.iterbatches(batch_size,
                                                      pad_batches=True):
       feed = {}
       for task_idx, label in enumerate(labels):
         feed[label] = to_one_hot(y_b[:, task_idx])
       feed[task_weights] = w_b

       merged = ConvMol.agglomerate_mols(X_b)
       feed[atom_features] = merged.get_atom_features()
       feed[degree_slice] = merged.deg_slice
       feed[membership] = merged.membership
       # deg_adjs[k] is the placeholder for degree k + 1; degree 0 is skipped.
       for degree in range(1, len(merged.get_deg_adjacency_lists())):
         feed[deg_adjs[degree - 1]] = merged.get_deg_adjacency_lists()[degree]
       yield feed
Пример #25
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
   feed_dict = dict()
   if y_b is not None:
     for index, label in enumerate(self.labels):
       feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
   if self.task_weights is not None and w_b is not None:
     feed_dict[self.task_weights[0].out_tensor] = w_b
   if self.features is not None:
     multiConvMol = ConvMol.agglomerate_mols(X_b)
     feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
     feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
     feed_dict[self.features[2].out_tensor] = multiConvMol.membership
     for i in range(self.max_degree):
       feed_dict[self.features[i + 3]
                 .out_tensor] = multiConvMol.get_deg_adjacency_lists()[i + 1]
   return feed_dict
Пример #26
0
  def test_null_conv_mol(self):
    """Test the molecule returned by ConvMol.get_null_mol.

    The assertions index degree 10 and expect eleven degree-slice rows, so
    they only hold when the null molecule covers degrees 0 through 10.
    """
    num_feat = 4
    null_mol = ConvMol.get_null_mol(num_feat)

    deg_adj_lists = null_mol.get_deg_adjacency_lists()

    # Check that atoms are only connected to themselves.
    assert np.array_equal(deg_adj_lists[10],
                          [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]])
    assert np.array_equal(deg_adj_lists[1], [[1]])
    # Check that there's one atom of each degree.
    assert np.array_equal(null_mol.get_deg_slice(),
                          [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1],
                           [6, 1], [7, 1], [8, 1], [9, 1], [10, 1]])
Пример #27
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
   feed_dict = dict()
   if y_b is not None:
     for index, label in enumerate(self.labels):
       feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
   if self.task_weights is not None and w_b is not None:
     feed_dict[self.task_weights[0].out_tensor] = w_b
   if self.features is not None:
     multiConvMol = ConvMol.agglomerate_mols(X_b)
     feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
     feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
     feed_dict[self.features[2].out_tensor] = multiConvMol.membership
     for i in range(self.max_degree):
       feed_dict[self.features[i + 3]
                 .out_tensor] = multiConvMol.get_deg_adjacency_lists()[i + 1]
   return feed_dict
Пример #28
0
def data_generator(dataset, n_epoch=1, predict=False):
    """Yield a feed dict per padded, deterministic batch for ``n_epoch`` passes.

    Relies on module-level names: ``n_batch`` (batch size), ``label15``
    (label placeholders), ``atom_features``, ``degree_slice``,
    ``membership`` and ``deg_adjs``. Adjacency lists for degrees 1 through
    10 are fed. Batch weights are not added to the dict. A progress line is
    printed per epoch unless ``predict`` is true.
    """
    for epoch_idx in range(n_epoch):
        if not predict:
            print('Starting epoch %i' % epoch_idx)
        batches = dataset.iterbatches(n_batch,
                                      pad_batches=True,
                                      deterministic=True)
        for X_b, y_b, _w_b, _ids_b in batches:
            feed = {}
            for task_idx, label_placeholder in enumerate(label15):
                feed[label_placeholder] = to_one_hot(y_b[:, task_idx])

            merged = ConvMol.agglomerate_mols(X_b)
            feed[atom_features] = merged.get_atom_features()
            feed[degree_slice] = merged.deg_slice
            feed[membership] = merged.membership

            adjacency = merged.get_deg_adjacency_lists()
            # deg_adjs[k] is the placeholder for degree k + 1.
            for degree in range(1, 11):
                feed[deg_adjs[degree - 1]] = adjacency[degree]
            yield feed
Пример #29
0
def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    """Yield a feed dict per deterministic batch of ``dataset``.

    Relies on module-level names: ``batch_size``, ``labels``, ``weights``,
    ``atom_features``, ``degree_slice``, ``membership`` and ``deg_adjs``.
    Each label column is one-hot encoded. Graph inputs come from
    agglomerating the batch's ConvMol objects. A progress line is printed
    per epoch unless ``predict`` is true.
    """
    for epoch_idx in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch_idx)
        batch_iter = dataset.iterbatches(batch_size,
                                         pad_batches=pad_batches,
                                         deterministic=True)
        for X_b, y_b, w_b, _ids_b in batch_iter:
            feed = {}
            for task_idx, label in enumerate(labels):
                feed[label] = to_one_hot(y_b[:, task_idx])
            feed[weights] = w_b

            merged = ConvMol.agglomerate_mols(X_b)
            feed[atom_features] = merged.get_atom_features()
            feed[degree_slice] = merged.deg_slice
            feed[membership] = merged.membership
            # Degree 0 has no placeholder; deg_adjs starts at degree 1.
            for degree in range(1, len(merged.get_deg_adjacency_lists())):
                feed[deg_adjs[degree - 1]] = (
                    merged.get_deg_adjacency_lists()[degree])
            yield feed
Пример #30
0
    def _featurize(self, mol):
        """Build a ConvMol from ``mol``'s atom features and bond connectivity."""
        # Collect (atom index, feature vector) pairs, then order them by
        # index so the rows match the rdkit ordering.
        indexed_features = []
        for atom in mol.GetAtoms():
            indexed_features.append((atom.GetIdx(), atom_features(atom)))
        indexed_features.sort()
        _, feature_rows = list(zip(*indexed_features))

        # One row per atom.
        nodes = np.vstack(feature_rows)

        # Each bond is listed once here; both directions are added below.
        bonds = []
        for bond in mol.GetBonds():
            bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

        canon_adj_list = [[] for _ in range(len(nodes))]
        for begin, end in bonds:
            canon_adj_list[begin].append(end)
            canon_adj_list[end].append(begin)

        return ConvMol(nodes, canon_adj_list)
Пример #31
0
  def amino_acid_embedding(self, name=None):
    """Build the graph-convolution sub-network that embeds the amino acids.

    Every SMILES string in ``AminoAcid_SMILES`` is featurized as a ConvMol
    and all of them are agglomerated with ``max_deg=3``. The result is
    wrapped as constant tensors and passed through two GraphConv / BatchNorm
    / GraphPool stages, a Dense layer, a GraphGather readout and
    AminoAcidPad.

    Parameters
    ----------
    name : str, optional
      Prefix for all layer names. When omitted, a prefix is derived from
      ``self.module_count``, which is then incremented.

    Returns
    -------
    The ``AminoAcidPad`` layer at the end of the sub-network.
    """
    if name is None:
      name = 'AAEmbedding_'+str(self.module_count)+'_'
      self.module_count += 1
    feat = ConvMolFeaturizer()
    featurized_AA = [feat._featurize(Chem.MolFromSmiles(smile)) for smile in AminoAcid_SMILES]
    multiConvMol = ConvMol.agglomerate_mols(featurized_AA, max_deg=3)
    atom_features = TensorWrapper(tf.constant(multiConvMol.get_atom_features(), dtype=tf.float32), name=name+'atom_features')
    degree_slice = TensorWrapper(tf.constant(multiConvMol.deg_slice, dtype=tf.int32), name=name+'degree')
    membership = TensorWrapper(tf.constant(multiConvMol.membership, dtype=tf.int32), name=name+'membership')

    # Fetch the adjacency lists once; degrees 1..3 become inputs 'deg_0'..'deg_2'.
    deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
    deg_adjs = []
    for i in range(0, 3):
      deg_adjs.append(TensorWrapper(tf.constant(deg_adj_lists[i+1], dtype=tf.int32), name=name+'deg_'+str(i)))

    graph_inputs = [degree_slice, membership] + deg_adjs
    gc1 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[atom_features] + graph_inputs, name=name+'gc1')
    batch_norm1 = BatchNorm(in_layers=[gc1, self.training_placeholder], name=name+'bn1')
    gp1 = GraphPool(max_degree=3, in_layers=[batch_norm1] + graph_inputs, name=name+'gp1')
    gc2 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[gp1] + graph_inputs, name=name+'gc2')
    batch_norm2 = BatchNorm(in_layers=[gc2, self.training_placeholder], name=name+'bn2')
    gp2 = GraphPool(max_degree=3, in_layers=[batch_norm2] + graph_inputs, name=name+'gp2')
    # Floor division keeps the channel count an int on Python 3; plain `/`
    # would pass a float.
    dense = Dense(out_channels=self.embedding_length // 2, activation_fn=tf.nn.relu, in_layers=[gp2], name=name+'dense1')
    batch_norm3 = BatchNorm(in_layers=[dense, self.training_placeholder], name=name+'bn3')
    # NOTE(review): batch_size=21 presumably equals len(AminoAcid_SMILES) - confirm.
    readout = GraphGather(
        batch_size=21,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3] + graph_inputs, name=name+'gg')
    padding = AminoAcidPad(
        embedding_length=self.embedding_length,
        in_layers=[readout], name=name+'pad')
    return padding
def data_generator(dataset, predict=False, pad_batches=True):
    """Yield one ``(inputs, labels, weights)`` tuple per minibatch of ``dataset``.

    Relies on the names ``batch_size`` and ``n_tasks`` being available from
    the enclosing scope.

    Parameters
    ----------
    dataset:
        Object exposing ``iterbatches(batch_size, pad_batches=...,
        deterministic=...)`` that yields ``(X_b, y_b, w_b, ids_b)`` tuples.
        ``X_b`` is handed to ``ConvMol.agglomerate_mols``.
    predict: bool
        Accepted for interface compatibility; this function does not read it.
    pad_batches: bool
        Forwarded unchanged to ``dataset.iterbatches``.

    Yields
    ------
    tuple
        ``inputs``: list holding the atom features, the degree slice, the
        membership array and then every degree adjacency list from index 1
        onward.
        ``labels``: one-element list with the two-class one-hot labels
        reshaped to ``(-1, n_tasks, 2)``.
        ``weights``: one-element list holding ``w_b``.
    """
    batches = dataset.iterbatches(batch_size,
                                  pad_batches=pad_batches,
                                  deterministic=True)
    for X_b, y_b, w_b, ids_b in batches:
        # Merge the batch's ConvMol objects into a single combined molecule.
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        inputs = [
            multiConvMol.get_atom_features(),
            multiConvMol.deg_slice,
            np.array(multiConvMol.membership),
        ]

        # Fetch the adjacency lists once instead of once per loop iteration.
        # Entry 0 is skipped (presumably degree-0 atoms, which have no
        # neighbours -- confirm against ConvMol).
        deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
        inputs.extend(deg_adj_lists[1:])

        labels = [to_one_hot(y_b.flatten(), 2).reshape(-1, n_tasks, 2)]
        weights = [w_b]
        yield (inputs, labels, weights)
Пример #33
0
    def _featurize(self, mol):
        """Build the ConvMol representation of ``mol``.

        Returns a single ``ConvMol`` by default.  When
        ``self.per_atom_fragmentation`` is set, returns a list of ``ConvMol``
        objects instead: one per node, each with that node removed.
        """
        def per_atom(n, a):
            """Yield ``(nodes, adjacency)`` for every single-node deletion.

            Parameters
            ----------
            n: np.array of nodes (number_of_nodes X number_of_features)
            a: list with one list of neighbour indices per node
            """
            for removed in range(n.shape[0]):
                kept_nodes = np.delete(n, removed, axis=0)
                kept_adj = []
                for position, neighbours in enumerate(a):
                    if position == removed:
                        # The deleted node's own neighbour list disappears.
                        continue
                    # Drop links to the deleted node; every index above it
                    # shifts down by one to close the gap.
                    kept_adj.append([
                        nbr - 1 if nbr > removed else nbr
                        for nbr in neighbours
                        if nbr != removed
                    ])
                yield kept_nodes, kept_adj

        def describe(atom):
            """Return the atom's feature vector followed by its extra properties."""
            return np.concatenate(
                (atom_features(atom, use_chirality=self.use_chirality),
                 self._get_atom_properties(atom)))

        # Order the per-atom rows by atom index so they line up with RDKit.
        indexed_rows = sorted(
            (atom.GetIdx(), describe(atom)) for atom in mol.GetAtoms())
        _, feature_rows = zip(*indexed_rows)
        nodes = np.vstack(feature_rows)

        if self.master_atom:
            # The master atom's features are the mean over all real atoms.
            centroid = np.mean(nodes, axis=0, keepdims=True)
            nodes = np.concatenate((nodes, centroid), axis=0)

        # Undirected adjacency: each bond is recorded on both of its ends.
        canon_adj_list = [[] for _ in range(len(nodes))]
        for bond in mol.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()
            canon_adj_list[begin].append(end)
            canon_adj_list[end].append(begin)

        if self.master_atom:
            # Every real atom gains the master atom (last row) as a neighbour.
            master_index = len(nodes) - 1
            for neighbours in canon_adj_list[:-1]:
                neighbours.append(master_index)

        if self.per_atom_fragmentation:
            return [
                ConvMol(frag_nodes, frag_adj)
                for frag_nodes, frag_adj in per_atom(nodes, canon_adj_list)
            ]
        return ConvMol(nodes, canon_adj_list)
Пример #34
0
    def data_generator(self, dataset, prior_label, task=None, num_prior=0,
                       epochs=1, pad_batches=True):
        """Yield one feed dictionary per minibatch, for training and evaluation.

        Parameters
        ----------
        dataset:
            Object exposing ``iterbatches(batch_size, pad_batches=...,
            deterministic=...)`` that yields ``(X_b, y_b, w_b, ids_b)``.
            Column 0 of ``X_b`` is passed to ``ConvMol.agglomerate_mols``;
            the remaining columns are stored under ``self.circular_feat``.
        prior_label:
            When truthy, a vector of length ``2 * self.num_task`` is appended
            to the features of every atom.  For each "revealed" task it holds
            a one-hot pair: slot ``2*t`` for a label other than 1, slot
            ``2*t + 1`` for a label equal to 1.
        task: int or None
            ``None``: for every row, with probability 0.5 a random subset of
            between 0 and ``num_task - 1`` tasks is revealed; otherwise none.
            Otherwise: ``num_prior`` tasks other than ``task`` are revealed
            and the weight of ``task`` is forced to 1.0.
        num_prior: int
            Number of tasks to reveal when ``task`` is given.
        epochs: int
            Number of passes over ``dataset``.
        pad_batches: bool
            Forwarded unchanged to ``dataset.iterbatches``.

        Yields
        ------
        dict
            Keyed by the entries of ``self.labels`` and ``self.deg_adjs`` and
            by ``self.circular_feat``, ``self.atom_features``,
            ``self.weights``, ``self.degree_slice`` and ``self.membership``.

        Raises
        ------
        ValueError
            From ``random.sample`` when ``task`` is given and ``num_prior``
            exceeds the number of remaining tasks.
        """
        def reveal(prior, labels, weights, row, known_tasks):
            """One-hot encode the known labels of one row and down-weight them."""
            for t in known_tasks:
                # Tasks with zero weight are left out of the prior entirely.
                if weights[row, t] != 0:
                    column = 2 * t + 1 if labels[row, t] == 1 else 2 * t
                    prior[row, column] = 1
                    # A revealed label should barely contribute to the loss.
                    weights[row, t] = weights[row, t] * 0.001

        for _ in range(epochs):
            batches = dataset.iterbatches(self.batch_size,
                                          pad_batches=pad_batches,
                                          deterministic=True)
            for X_b, y_b, w_b, ids_b in batches:
                d = {}
                for index, label in enumerate(self.labels):
                    d[label] = to_one_hot(y_b[:, index])

                # Rescale the weights; entries with y == 1 get an extra term
                # proportional to self.scaled_w.  This builds a new array, so
                # the in-place edits below act on the copy.
                w_b = w_b * (0.1) + (w_b * y_b * self.scaled_w) / 10.0

                multiConvMol = ConvMol.agglomerate_mols(X_b[:, 0])
                d[self.circular_feat] = X_b[:, 1:]
                atom_feat = multiConvMol.get_atom_features()

                if prior_label:
                    # Use the real number of rows rather than self.batch_size:
                    # with pad_batches=False the final batch can be shorter,
                    # and indexing by batch_size would run past its end.
                    n_rows = w_b.shape[0]
                    prior = np.zeros((n_rows, self.num_task * 2))
                    for e in range(n_rows):
                        if task is None:
                            if random.random() < 0.5:
                                known = random.sample(
                                    range(self.num_task),
                                    random.randint(0, self.num_task - 1))
                            else:
                                known = []
                        else:
                            candidates = list(range(self.num_task))
                            candidates.pop(task)
                            known = random.sample(candidates, num_prior)

                        reveal(prior, y_b, w_b, e, known)

                        if task is not None:
                            # The target task keeps full weight and must
                            # never leak into the prior.
                            w_b[e, task] = 1.0
                            prior[e, 2 * task] = 0.
                            prior[e, 2 * task + 1] = 0.

                    # Append each molecule's prior vector to all of its atoms.
                    member = np.asarray(multiConvMol.membership, dtype=int)
                    d[self.atom_features] = np.concatenate(
                        (atom_feat, prior[member]), axis=1)
                else:
                    d[self.atom_features] = atom_feat

                d[self.weights] = w_b
                d[self.degree_slice] = multiConvMol.deg_slice
                d[self.membership] = multiConvMol.membership

                # Adjacency list k (k >= 1) feeds placeholder k - 1; entry 0
                # is skipped.  Fetch the lists once, not once per iteration.
                deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
                for slot, adjacency in enumerate(deg_adj_lists[1:]):
                    d[self.deg_adjs[slot]] = adjacency
                yield d