Пример #1
0
def test_change_constraints_cache_clear():
    """Checks that updating the semantic constraints invalidates any cached
    alphabet and changes decoding behavior, then restores the defaults.
    """
    original_alphabet = sf.get_semantic_robust_alphabet()
    # repeated calls under the same constraints must agree
    assert original_alphabet == sf.get_semantic_robust_alphabet()
    assert sf.decoder("[C][#C]") == "C#C"

    # restrict carbon to a single bond
    constraints = sf.get_semantic_constraints()
    constraints["C"] = 1
    sf.set_semantic_constraints(constraints)

    # the alphabet cache must have been cleared by the constraint change
    assert sf.get_semantic_robust_alphabet() != original_alphabet
    assert sf.decoder("[C][#C]") == "CC"

    sf.set_semantic_constraints()  # restore default constraints
Пример #2
0
def test_roundtrip_translation(test_path, dataset_samples):
    """Tests SMILES -> SELFIES -> SMILES translation on various datasets.

    For every sampled SMILES that RDKit can parse, encodes it to SELFIES,
    decodes it back, and logs any mismatch to a per-dataset CSV error file.
    """

    # very relaxed constraints
    constraints = sf.get_preset_constraints("hypervalent")
    constraints.update({"P": 7, "P-1": 8, "P+1": 6, "?": 12})
    sf.set_semantic_constraints(constraints)

    error_path = ERROR_LOG_DIR / "{}.csv".format(test_path.stem)
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")

    error_data = []
    error_found = False

    # count data rows, excluding the header (the with-block closes the
    # handle; the original generator expression leaked it)
    with open(test_path) as f:
        n_lines = sum(1 for _ in f) - 1
    n_keep = dataset_samples if (0 < dataset_samples <= n_lines) else n_lines
    # randomly skip rows so that exactly n_keep samples are read
    skip = random.sample(range(1, n_lines + 1), n_lines - n_keep)
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    for chunk in reader:

        for in_smiles in chunk["smiles"]:
            in_smiles = in_smiles.strip()

            # only test SMILES that RDKit accepts and that lack wildcards
            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_data.append((in_smiles, ""))
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_data.append((in_smiles, out_smiles))

        # append this chunk's failures to the error log
        with open(error_path, "a") as error_log:
            for entry in error_data:
                error_log.write(",".join(entry) + "\n")

        # keep the flag a proper bool (it was previously assigned a list)
        error_found = error_found or bool(error_data)
        error_data = []

    sf.set_semantic_constraints()  # restore constraints

    assert not error_found
Пример #3
0
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.
    """

    selfies = "[Xe-2]" + "[Branch1][C][F]" * 8
    # by default, an unconstrained symbol may form up to 8 bonds
    assert decode_eq(selfies, "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF")

    # cap unconstrained symbols ('?') at 2 bonds
    relaxed = sf.get_semantic_constraints()
    relaxed["?"] = 2
    sf.set_semantic_constraints(relaxed)

    assert decode_eq(selfies, "[Xe-2](F)CF")

    sf.set_semantic_constraints()  # restore defaults
Пример #4
0
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.
    """

    custom = sf.get_semantic_constraints()
    custom.update({"Sn+4": 1, "O-2": 2})
    sf.set_semantic_constraints(custom)

    # the molecules below are chemically nonsensical; they only exercise
    # the constraint logic, so RDKit verification is not possible
    assert decode_eq("[Sn+4][=C]", "[Sn+4]C")
    assert decode_eq("[O-2][#C]", "[O-2]=C")

    # a symbol mixing isotope, chirality, hydrogen count, and charge
    assert decode_eq("[17O@@H1-2][#C]", "[17O@@H1-2]C")

    sf.set_semantic_constraints()  # restore defaults
Пример #5
0
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.
    """

    custom = sf.get_semantic_constraints()
    custom.update({'Sn+4': 1, 'O-2': 2})
    sf.set_semantic_constraints(custom)

    # these molecules are chemically nonsensical; they only exercise the
    # constraint logic, so RDKit cannot verify them
    assert sf.decoder("[Sn++++expl][=C]") == "[Sn++++]C"
    assert sf.decoder("[Sn+4expl][=C]") == "[Sn+4]C"
    assert sf.decoder("[O--expl][#C]") == "[O--]=C"
    assert sf.decoder("[O-2expl][#C]") == "[O-2]=C"

    # a symbol mixing isotope, chirality, hydrogen, and charge notation
    assert sf.decoder("[17O@@H-2expl][#C]") == "[17O@@H-2]C"

    sf.set_semantic_constraints()  # restore defaults
Пример #6
0
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.
    """

    fluoro_branch = "[Branch1_1][C][F]"
    selfies = "[Xe-2expl]" + fluoro_branch * 8

    # by default, an unconstrained atom may form up to 8 bonds
    assert sf.decoder(selfies) == "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF"

    # change default semantic constraints: cap '?' symbols at 2 bonds
    relaxed = sf.get_semantic_constraints()
    relaxed['?'] = 2
    sf.set_semantic_constraints(relaxed)

    assert sf.decoder(selfies) == "[Xe-2](F)CF"

    sf.set_semantic_constraints()  # restore defaults
Пример #7
0
def test_nop_symbol_decoder(max_len, hard_alphabet):
    """Tests that the '[nop]' symbol is decoded properly, i.e., it is
    always skipped over.
    """

    sf.set_semantic_constraints()

    symbols = list(hard_alphabet)
    symbols.remove('[nop]')

    for _ in range(1000):

        # build a random SELFIES, then pad it with as many [nop] symbols
        # as it has real symbols, scattered at random positions
        n_symbols = random.randint(1, max_len)
        tokens = [random.choice(symbols) for _ in range(n_symbols)]
        tokens.extend(['[nop]'] * n_symbols)
        random.shuffle(tokens)

        with_nops = ''.join(tokens)
        without_nops = with_nops.replace('[nop]', '')

        # [nop] symbols must have no effect on the decoded result
        assert sf.decoder(with_nops) == sf.decoder(without_nops)
Пример #8
0
def test_random_selfies_decoder(trials, max_len, hard_alphabet):
    """Tests if SELFIES that are generated by randomly stringing together
    symbols from the SELFIES alphabet are decoded into valid SMILES.
    """

    sf.set_semantic_constraints()  # re-set alphabet
    symbols = tuple(hard_alphabet)

    for _ in range(trials):

        # assemble a random SELFIES string and decode it
        rand_len = random.randint(1, max_len)
        rand_mol = ''.join(random.choices(symbols, k=rand_len))
        smiles = sf.decoder(rand_mol)

        # decoding must always yield an RDKit-parsable SMILES
        try:
            is_valid = MolFromSmiles(smiles, sanitize=True) is not None
        except Exception:
            is_valid = False

        assert is_valid, f"Invalid SMILES {smiles} decoded from {rand_mol}."
Пример #9
0
def roundtrip_translation():
    """Round-trips each SMILES in the test set (SMILES -> SELFIES -> SMILES)
    under relaxed "hypervalent" constraints, logging failures to a text file.
    """
    sf.set_semantic_constraints("hypervalent")

    # a first pass over the dataset only sizes the progress bar
    n_entries = 0
    for chunk in make_reader():
        n_entries += len(chunk)
    pbar = tqdm(total=n_entries)

    curr_idx = 0
    # the with-block guarantees the log is closed even if a test raises
    # (the original handle leaked on an exception)
    with open(ERROR_LOG_DIR / f"{TEST_SET_PATH.stem}.txt", "a+") as error_log:
        for chunk in make_reader():
            for in_smiles in chunk[args.col_name]:
                pbar.update(1)
                curr_idx += 1
                # allow resuming a partially completed run
                if curr_idx < args.start_from:
                    continue

                in_smiles = in_smiles.strip()

                # skip SMILES that RDKit rejects or that contain wildcards
                mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
                if (mol is None) or ("*" in in_smiles):
                    continue

                try:
                    selfies = sf.encoder(in_smiles, strict=True)
                    out_smiles = sf.decoder(selfies)
                except (sf.EncoderError, sf.DecoderError):
                    error_log.write(in_smiles + "\n")
                    tqdm.write(in_smiles)
                    continue

                if not is_same_mol(in_smiles, out_smiles):
                    error_log.write(in_smiles + "\n")
                    tqdm.write(in_smiles)
Пример #10
0
def reset_alphabet():
    """Installs a custom, relaxed set of semantic bond constraints.

    Symbols not listed explicitly fall under the '?' (default) entry.
    """
    sf.set_semantic_constraints({
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    })
Пример #11
0
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    Progress is checkpointed per chunk so an interrupted run can resume;
    failing SMILES pairs are appended to a CSV error log.
    """

    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    # check if a previous checkpoint exists to continue tests
    if os.path.exists(ckpt_path):
        with open(ckpt_path, 'r') as ckpt_file:
            # the first line holds the index of the last completed chunk
            checkpoint = int(ckpt_file.readline())

    # if no path to a checkpoint exists,
    # create a new directory for error logging and checkpoints
    else:
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        os.makedirs(os.path.dirname(error_path), exist_ok=True)

        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        checkpoint = -1

    error_list = []
    error_found_flag = False

    # make pandas reader
    reader = pd.read_csv(EMOL_PATH,
                         chunksize=10000,
                         compression='gzip',
                         delimiter=' ',
                         header=0)

    # roundtrip testing
    for chunk_idx, chunk in enumerate(reader):

        # skip chunks completed in a previous run
        if chunk_idx <= checkpoint:
            continue

        for in_smiles in chunk[COL_NAME]:

            # all inputted SMILES must be valid RDKit Mol objects
            # (and must not contain wildcard atoms) to be encoded
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode selfies
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take encoded SELFIES and decode
            out_smiles = sf.decoder(selfies)

            # compare original SMILES to decoded SELFIES string;
            # if not the same molecule, record as an error
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_emolecule.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        # keep the flag a proper bool (it was previously assigned a list)
        error_found_flag = error_found_flag or bool(error_list)
        error_list = []

        # create checkpoint from the current pandas reader chunk,
        # to load from and continue testing.
        with open(ckpt_path, 'w+') as ckpt_file:
            ckpt_file.write(str(chunk_idx))

    sf.set_semantic_constraints()  # restore defaults
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
Пример #12
0
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    A random subset of ``dataset_samples`` rows is drawn from the test set;
    failing SMILES pairs are appended to a per-dataset CSV error log.
    """

    # modify semantic bond constraints
    sf.set_semantic_constraints({
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    })

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir,
                              'error_sets',
                              "errors_{}.csv".format(test_name))

    # create error directory
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    error_list = []

    # add header in error log text file
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_found_flag = False

    # count data rows, excluding the header (the with-block closes the
    # handle; the original generator expression leaked it)
    with open(test_path) as f:
        N = sum(1 for _ in f) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N
    # randomly skip rows so that exactly S samples are read
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path,
                         chunksize=10000,
                         header=0,
                         skiprows=skip)

    # roundtrip testing
    for chunk in reader:
        for in_smiles in chunk[column_name]:
            # all inputted SMILES must be valid RDKit Mol objects
            # (and must not contain wildcard atoms) to be encoded
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode SELFIES string
            selfies = sf.encoder(in_smiles)

            # if unable to encode SMILES, write to list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take encoded SELFIES and decode
            out_smiles = sf.decoder(selfies)

            # compare original SMILES to decoded SELFIES string;
            # if not the same molecule, record as an error
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, str(out_smiles)))

        # open and write all errors to errors_{test_name}.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        # keep the flag a proper bool (it was previously assigned a list)
        error_found_flag = error_found_flag or bool(error_list)
        error_list = []

    sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag