예제 #1
0
def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str, ...]) -> bool:
    """Checks whether a SMILES atom symbol should be a node in the pi
    subgraph, based on its bonds.

    More specifically, an atom should be a node in the pi subgraph if it has
    an unpaired valence electron, and thus, is able to make a double bond.
    The check is a parity test: the electrons consumed by the atom's bonds
    (and by at most one attached hydrogen) are subtracted from its
    charge-adjusted aromatic valence, and the atom qualifies when the
    remainder is odd.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    :raises ValueError: if the atom carries more than one hydrogen, which
        this routine does not support.
    """

    atom, h_count, charge = parse_atom_symbol(atom_symbol)

    # every bond consumes as many valence electrons as its multiplicity
    used_electrons = sum(get_num_from_bond(b) for b in bonds)

    # e.g. c1ccccc1
    # A neutral, hydrogen-free aromatic carbon with exactly two bonds is
    # assumed to carry one implied hydrogen. That assumption must NOT be
    # made when one of the two bonds is a triple bond: the carbon is then
    # already saturated, and counting a phantom hydrogen would flip the
    # parity below and wrongly place the atom in the pi subgraph.
    if (atom == 'c') and (h_count == charge == 0) \
            and (len(bonds) == 2) and ('#' not in bonds):
        h_count += 1  # implied bonded hydrogen

    if h_count > 1:
        raise ValueError(f"Kekulization Failed: {atom_symbol} not supported.")

    if h_count == 1:  # e.g. [nH]
        used_electrons += 1  # the hydrogen occupies one valence electron

    valence = _aromatic_valences[atom] - charge
    free_electrons = valence - used_electrons

    # an odd number of leftover electrons means one of them is unpaired
    return free_electrons % 2 != 0
예제 #2
0
def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
    """Decides whether an aromatic SMILES atom belongs to the pi subgraph.

    An atom is a pi-subgraph node when, after accounting for the electrons
    spent on its bonds and on at most one attached hydrogen, it is left with
    an odd number of valence electrons (i.e. one unpaired electron that can
    take part in a double bond).

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    :raises ValueError: if the atom ends up with more than one hydrogen.
    """

    element, hydrogens, formal_charge = parse_atom_symbol(atom_symbol)

    # each bond costs as many electrons as its multiplicity
    spent = sum(get_num_from_bond(bond) for bond in bonds)

    # A neutral aromatic carbon written without hydrogens (e.g. c1ccccc1)
    # that has exactly two bonds, none of them triple, gets one implied
    # hydrogen. The neutral carbon radical (e.g. C1=[C]NC=C1) takes the same
    # path and is therefore treated like its 1-H form (e.g. C1=[CH]NC=C1).
    bare_neutral_carbon = (element == 'c') and (hydrogens == formal_charge == 0)
    if bare_neutral_carbon and (len(bonds) == 2) and ('#' not in bonds):
        hydrogens += 1

    if hydrogens > 1:
        raise ValueError(
            "unrecognized aromatic symbol '{}'".format(atom_symbol))

    if hydrogens == 1:  # e.g. [nH]
        spent += 1

    leftover = (_aromatic_valences[element] - formal_charge) - spent
    return leftover % 2 != 0
예제 #3
0
def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
                             rings: Dict[int, Tuple[str, int]],
                             counter: List[int]) -> Tuple[str, int]:
    """Recursive helper for _translate_smiles.

    Derives the SELFIES from a SMILES, and returns a tuple of (1) the
    translated SELFIES and (2) the symbol length of the translated SELFIES.

    :param smiles_gen: an iterable of the symbols (and their types)
        of the SMILES to be translated, created by ``_parse_smiles``.
    :param rings: See ``rings`` in ``_translate_smiles``.
    :param counter: a one-element list that serves as a mutable counter.
        See ``derived_counter`` in ``_translate_smiles``.
    :return: A tuple of the translated SELFIES and its symbol length.
    """

    selfies = ""
    selfies_len = 0
    prev_idx = -1

    for bond, symbol, symbol_type in smiles_gen:

        if bond == '-':  # ignore explicit single bonds
            bond = ''

        if symbol_type == ATOM_TYPE:
            if symbol[0] == '[':
                selfies += "[{}{}expl]".format(bond, symbol[1:-1])
            else:
                selfies += "[{}{}]".format(bond, symbol)
            prev_idx = counter[0]
            counter[0] += 1
            selfies_len += 1

        elif symbol_type == BRANCH_TYPE:
            if symbol == '(':

                # NOTE: looping inside a loop on a generator will produce
                # expected behaviour in this case.

                branch, branch_len = \
                    _translate_smiles_derive(smiles_gen, rings, counter)

                N_as_symbols = get_symbols_from_n(branch_len - 1)
                bond_num = get_num_from_bond(bond)

                selfies += "[Branch{}_{}]".format(len(N_as_symbols), bond_num)
                selfies += ''.join(N_as_symbols) + branch
                selfies_len += 1 + len(N_as_symbols) + branch_len

            else:  # symbol == ')'
                break

        else:  # symbol_type == RING_TYPE
            ring_id = int(symbol)

            if ring_id in rings:
                left_bond, left_end = rings.pop(ring_id)
                right_bond, right_end = bond, prev_idx

                ring_len = right_end - left_end
                N_as_symbols = get_symbols_from_n(ring_len - 1)

                if left_bond != '':
                    selfies += "[Expl{}Ring{}]".format(left_bond,
                                                       len(N_as_symbols))
                elif right_bond != '':
                    selfies += "[Expl{}Ring{}]".format(right_bond,
                                                       len(N_as_symbols))
                else:
                    selfies += "[Ring{}]".format(len(N_as_symbols))

                selfies += ''.join(N_as_symbols)
                selfies_len += 1 + len(N_as_symbols)

            else:
                rings[ring_id] = (bond, prev_idx)

    return selfies, selfies_len
예제 #4
0
def _form_rings_bilocally(derived: List[List[Union[str, int]]],
                          rings: List[Tuple[int, int, str]]) -> None:
    """Forms all the rings specified by the rings list, in first-to-last order,
    by updating derived.

    :param derived: see ``derived`` in ``_translate_selfies``.
    :param rings: see ``rings`` in ``_translate_selfies``.
    :return: ``None``.
    """

    # due to the behaviour of allowing multiple rings between the same atom
    # pair, or rings between already bonded atoms, we first resolve all rings
    # so that only valid rings are left and placed into <ring_locs>.
    ring_locs = OrderedDict()

    for left_idx, right_idx, bond_symbol in rings:

        if left_idx == right_idx:  # ring to the same atom forbidden
            continue

        left_end = derived[left_idx]
        right_end = derived[right_idx]
        bond_num = get_num_from_bond(bond_symbol)

        if left_end[1] <= 0 or right_end[1] <= 0:
            continue  # no room for bond

        if bond_num > min(left_end[1], right_end[1]):
            bond_num = min(left_end[1], right_end[1])
            bond_symbol = get_bond_from_num(bond_num)

        # ring is formed between two atoms that are already bonded
        # e.g. CC1C1C --> CC=CC
        if left_idx == right_end[2]:

            right_symbol = right_end[0]

            if right_symbol[0] in {'-', '/', '\\', '=', '#'}:
                old_bond = right_symbol[0]
            else:
                old_bond = ''

            # update bond multiplicity and symbol
            new_bond_num = min(bond_num + get_num_from_bond(old_bond), 3)
            new_bond_symbol = get_bond_from_num(new_bond_num)

            right_end[0] = new_bond_symbol + right_end[0][len(old_bond):]

        # ring is formed between two atoms that are not bonded, e.g. C1CC1C
        else:
            loc = (left_idx, right_idx)

            if loc in ring_locs:
                # a ring is formed between two atoms that are have previously
                # been bonded by a ring, so ring bond multiplicity is updated

                new_bond_num = min(
                    bond_num + get_num_from_bond(ring_locs[loc]), 3)
                new_bond_symbol = get_bond_from_num(new_bond_num)
                ring_locs[loc] = new_bond_symbol

            else:
                ring_locs[loc] = bond_symbol

        left_end[1] -= bond_num
        right_end[1] -= bond_num

    # finally, use <ring_locs> to add all the rings into <derived>

    ring_counter = 1
    for (left_idx, right_idx), bond_symbol in ring_locs.items():

        ring_id = str(ring_counter)
        if len(ring_id) == 2:
            ring_id = "%" + ring_id
        ring_counter += 1  # increment

        derived[left_idx][0] += bond_symbol + ring_id
        derived[right_idx][0] += bond_symbol + ring_id
예제 #5
0
def _translate_selfies_derive(selfies_gen: Iterable[str], init_state: int,
                              derived: List[List[Union[str, int]]],
                              prev_idx: int, branches: Dict[int, int],
                              rings: List[Tuple[int, int, str]]) -> None:
    """Recursive helper for _translate_selfies.

    Derives the SMILES symbols one-by-one from a SELFIES, and
    populates derived, branches, and rings. The main chain and side branches
    of the SELFIES are translated recursively. Rings are not actually
    translated, but saved to the rings list to be added later.

    :param selfies_gen: an iterable of the symbols of the SELFIES to be
        translated, created by ``_parse_selfies``.
    :param init_state: the initial derivation state.
    :param derived: see ``derived`` in ``_translate_selfies``.
    :param prev_idx: the index of the previously derived atom, or -1,
        if no atoms have been derived yet.
    :param branches: see ``branches`` in ``_translate_selfies``.
    :param rings: see ``rings`` in ``_translate_selfies``.
    :return: ``None``.
    """

    curr_symbol = next(selfies_gen)
    state = init_state

    while curr_symbol != '' and state >= 0:

        # Case 1: Branch symbol (e.g. [Branch1_2])
        if 'Branch' in curr_symbol:

            branch_init_state, new_state = \
                get_next_branch_state(curr_symbol, state)

            if state <= 1:  # state = 0, 1
                pass  # ignore no symbols

            else:
                L = int(curr_symbol[-4])  # corresponds to [BranchL_X]
                L_symbols = []
                for _ in range(L):
                    L_symbols.append(next(selfies_gen))

                N = get_n_from_symbols(*L_symbols)

                branch_symbols = []
                for _ in range(N + 1):
                    branch_symbols.append(next(selfies_gen))
                branch_gen = _parse_selfies_symbols(branch_symbols)

                branch_start = len(derived)
                _translate_selfies_derive(branch_gen, branch_init_state,
                                          derived, prev_idx, branches, rings)
                branch_end = len(derived) - 1

                # resolve C((C)Cl)C --> C(C)(Cl)C
                while branch_start in branches:
                    branch_start = branches[branch_start] + 1

                # finally, register the branch in branches
                if branch_start <= branch_end:
                    branches[branch_start] = branch_end

        # Case 2: Ring symbol (e.g. [Ring2])
        elif 'Ring' in curr_symbol:

            new_state = state

            if state == 0:
                pass  # ignore no symbols

            else:
                L = int(curr_symbol[-2])  # corresponds to [RingL]
                L_symbols = []
                for _ in range(L):
                    L_symbols.append(next(selfies_gen))

                N = get_n_from_symbols(*L_symbols)

                left_idx = max(0, prev_idx - (N + 1))
                right_idx = prev_idx

                bond_symbol = ''
                if curr_symbol[1:5] == 'Expl':
                    bond_symbol = curr_symbol[5]

                rings.append((left_idx, right_idx, bond_symbol))

        # Case 3: regular symbol (e.g. [N], [=C], [F])
        else:
            new_symbol, new_state = get_next_state(curr_symbol, state)

            if new_symbol != '':  # in case of [epsilon]
                derived.append([new_symbol, new_state, prev_idx])

                if prev_idx >= 0:
                    bond_num = get_num_from_bond(new_symbol[0])
                    derived[prev_idx][1] -= bond_num

                prev_idx = len(derived) - 1

        curr_symbol = next(selfies_gen)  # update symbol and state
        state = new_state