Пример #1
0
def _sample_ordering(mol: Chem.Mol,
                     scaffold_nodes: np.ndarray,
                     k: int,
                     p: float,
                     ms: MoleculeSpec = MoleculeSpec.get_default()
                     ) -> t.Tuple[np.ndarray,
                                  np.ndarray,
                                  np.ndarray]:
    """Sample `k` decoding routes for the molecule `mol`.

    The molecule is turned into an undirected graph (one node per atom, one
    edge per bond) and `_traverse` is called `k` times on it.

    Args:
        mol (Chem.Mol):
            The given molecule
        scaffold_nodes (np.ndarray):
            The nodes marked as scaffold (forwarded to `_traverse`)
        k (int):
            The number of importance samples
        p (float):
            Degree of uncertainty during route sampling, should be in (0, 1)
        ms (mol_spec.MoleculeSpec):
            Accepted for interface compatibility only. Sampling uses just the
            bond topology and the canonical atom ranks, so the specification
            is not consulted.

    Returns:
        route_list (np.ndarray):
            route_list[i][j]
            the index of the atom reached at step j in sample i
            (dtype - np.int32)
        step_ids_list (np.ndarray):
            step_ids_list[i][j]
            the step at which atom j is reached in sample i
            (dtype - np.int32)
        logp_list (np.ndarray):
            logp_list[i] - the log-likelihood value of route i, as reported
            by `_traverse` (dtype - np.float32)
    """
    # canonical rank of every atom, indexed by atom id
    atom_ranks = np.array(list(Chem.CanonicalRankAtoms(mol)))

    # build nx graph: nodes are atom ids, edges are bonds
    graph = nx.Graph()
    graph.add_nodes_from(range(len(atom_ranks)))
    graph.add_edges_from((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
                         for bond in mol.GetBonds())

    route_list, step_ids_list, logp_list = [], [], []
    for _ in range(k):
        step_ids, log_p = _traverse(graph, atom_ranks, scaffold_nodes, p)
        step_ids_list.append(step_ids)
        # step_ids maps atom -> step; argsort inverts it into step -> atom
        route_list.append(np.argsort(step_ids))
        logp_list.append(log_p)

    # cast to numpy array
    route_list = np.array(route_list, dtype=np.int32)
    step_ids_list = np.array(step_ids_list, dtype=np.int32)
    logp_list = np.array(logp_list, dtype=np.float32)

    return route_list, step_ids_list, logp_list
Пример #2
0
def get_array_from_mol(mol: Chem.Mol,
                       scaffold_nodes: t.Iterable,
                       nh_nodes: t.Iterable,
                       np_nodes: t.Iterable,
                       k: int,
                       p: float,
                       ms: MoleculeSpec = MoleculeSpec.get_default()
                       ) -> t.Tuple[np.ndarray, np.ndarray]:
    """
    Represent the molecule using `np.ndarray`

    Note:
        `mol` is modified in place: atoms listed in `nh_nodes` gain one
        explicit hydrogen and atoms listed in `np_nodes` have their formal
        charge decreased by one. Routes are sampled before this modification
        is applied.

    Args:
        mol (Chem.Mol):
            The input molecule
        scaffold_nodes (Iterable):
            The location of scaffold represented as `list`/`np.ndarray`
        nh_nodes (Iterable):
            Indices of atoms that receive one additional explicit hydrogen
        np_nodes (Iterable):
            Indices of atoms whose formal charge is decreased by one
        k (int):
            The number of importance samples
        p (float):
            Degree of uncertainty during route sampling, should be in (0, 1)
        ms (mol_spec.MoleculeSpec):
            Specification used to encode atom and bond types; also forwarded
            to the route sampler

    Returns:
        mol_array (np.ndarray):
            The numpy representation of the molecule
            dtype - np.int32, shape - [k, num_bonds + 1, 5]
            columns - [atom_type_added, start_id, end_id, bond_type,
            is_scaffold]
        logp (np.ndarray):
            The log-likelihood of each route
            dtype - np.float32, shape - [k, ]
    """
    num_bonds = mol.GetNumBonds()

    # Materialise the iterables once: membership tests below become O(1) and
    # one-shot iterators are not silently consumed by repeated `in` checks.
    scaffold_nodes = np.array(list(scaffold_nodes), dtype=np.int32)
    nh_nodes, np_nodes = set(nh_nodes), set(np_nodes)

    # sample route (on the unmodified molecule, with the caller's `ms`)
    route_list, step_ids_list, logp = _sample_ordering(mol,
                                                       scaffold_nodes,
                                                       k,
                                                       p,
                                                       ms)

    # apply the requested atom modifications, then read the atom types
    atom_types = []
    for atom_id, atom in enumerate(mol.GetAtoms()):
        if atom_id in nh_nodes:
            atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
        if atom_id in np_nodes:
            atom.SetFormalCharge(atom.GetFormalCharge() - 1)
        atom_types.append(ms.get_atom_type(atom))

    bond_info = [[bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx(),
                  ms.get_bond_type(bond)]
                 for bond in mol.GetBonds()]

    # shape:
    # atom_types: num_atoms
    # bond_info: num_bonds x 3 (kept 2-D even when there are no bonds)
    atom_types = np.array(atom_types, dtype=np.int32)
    bond_info = np.array(bond_info, dtype=np.int32).reshape(-1, 3)

    # packed molecule array data, one entry per sampled route
    mol_array = []

    for route_i, step_ids_i in zip(route_list, step_ids_list):
        # reorder atom types and bond info according to the sampled route
        # note: bond_info_i columns are [start_ids, end_ids, bond_type]
        (atom_types_i,
         bond_info_i,
         is_append) = _reorder(atom_types,
                               bond_info,
                               route_i,
                               step_ids_i)

        # atom type added at each step
        # -1 if the current step is connect
        atom_types_added = np.full([num_bonds, ],
                                   -1,
                                   dtype=np.int32)
        atom_types_added[is_append] = \
            atom_types_i[bond_info_i[:, 1]][is_append]

        # pack into mol_array_i
        # size: num_bonds x 4
        # note: [atom_types_added, start_ids, end_ids, bond_type]
        mol_array_i = np.concatenate([atom_types_added[:, np.newaxis],
                                      bond_info_i],
                                     axis=-1)

        # initialization step: place the first atom, no bond involved
        init_step = np.array([[atom_types_i[0], -1, 0, -1]], dtype=np.int32)

        # size: (num_bonds + 1) x 4
        mol_array_i = np.concatenate([init_step, mol_array_i], axis=0)

        # Mark up scaffold bonds: both end points must have an id below the
        # number of scaffold nodes
        # NOTE(review): presumably routes visit scaffold atoms first so that
        # they occupy the lowest ids - confirm against `_traverse`
        num_scaffold = len(scaffold_nodes)
        is_scaffold = np.logical_and(mol_array_i[:, 1] < num_scaffold,
                                     mol_array_i[:, 2] < num_scaffold)
        is_scaffold = is_scaffold.astype(np.int32)

        # size: (num_bonds + 1) x 5
        mol_array_i = np.concatenate((mol_array_i,
                                      is_scaffold[:, np.newaxis]),
                                     axis=-1)

        mol_array.append(mol_array_i)

    # Output size:
    # mol_array: k x (num_bonds + 1) x 5
    # logp: k
    mol_array = np.stack(mol_array, axis=0)

    return mol_array, logp