def test_change_constraints_cache_clear():
    """Tests that derived caches (alphabet, decoder state) are refreshed
    when the semantic constraints are changed.
    """
    original_alphabet = sf.get_semantic_robust_alphabet()
    assert original_alphabet == sf.get_semantic_robust_alphabet()
    assert sf.decoder("[C][#C]") == "C#C"

    # tighten the carbon constraint and re-install it
    updated = sf.get_semantic_constraints()
    updated["C"] = 1
    sf.set_semantic_constraints(updated)

    # both the alphabet and decoding behavior must reflect the change
    assert sf.get_semantic_robust_alphabet() != original_alphabet
    assert sf.decoder("[C][#C]") == "CC"

    sf.set_semantic_constraints()  # re-set alphabet
def test_roundtrip_translation(test_path, dataset_samples):
    """Tests SMILES -> SELFIES -> SMILES translation on various datasets.

    :param test_path: path to the dataset file (CSV with a 'smiles' column).
    :param dataset_samples: number of rows to sample; <= 0 or larger than
        the dataset means "use all rows".
    """
    # very relaxed constraints
    constraints = sf.get_preset_constraints("hypervalent")
    constraints.update({"P": 7, "P-1": 8, "P+1": 6, "?": 12})
    sf.set_semantic_constraints(constraints)

    error_path = ERROR_LOG_DIR / "{}.csv".format(test_path.stem)
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")

    error_data = []
    error_found = False

    # count data rows; use a context manager so the handle is closed
    # (the original leaked an unclosed open(test_path))
    with open(test_path) as f:
        n_lines = sum(1 for _ in f) - 1
    n_keep = dataset_samples if (0 < dataset_samples <= n_lines) else n_lines
    # rows NOT sampled are skipped by the reader (row 0 is the header)
    skip = random.sample(range(1, n_lines + 1), n_lines - n_keep)

    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)
    for chunk in reader:
        for in_smiles in chunk["smiles"]:
            in_smiles = in_smiles.strip()

            # all inputs must be valid RDKit molecules without wildcards
            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_data.append((in_smiles, ""))
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_data.append((in_smiles, out_smiles))

        # flush this chunk's errors to the log
        with open(error_path, "a") as error_log:
            for entry in error_data:
                error_log.write(",".join(entry) + "\n")

        error_found = error_found or error_data
        error_data = []

    sf.set_semantic_constraints()  # restore constraints

    assert not error_found
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained."""
    f_branch = "[Branch1][C][F]"
    s = "[Xe-2]" + "".join([f_branch] * 8)

    # default '?' constraint allows 8 bonds on [Xe-2]
    assert decode_eq(s, "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF")

    # change default semantic constraints
    relaxed = sf.get_semantic_constraints()
    relaxed["?"] = 2
    sf.set_semantic_constraints(relaxed)

    # with '?' capped at 2, only one branch plus the chain survives
    assert decode_eq(s, "[Xe-2](F)CF")

    sf.set_semantic_constraints()
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly."""
    updated = sf.get_semantic_constraints()
    updated["Sn+4"] = 1
    updated["O-2"] = 2
    sf.set_semantic_constraints(updated)

    # the following molecules don't make sense, but we use them to test
    # selfies. Hence, we can't verify them with RDKit
    cases = [
        ("[Sn+4][=C]", "[Sn+4]C"),
        ("[O-2][#C]", "[O-2]=C"),
        # mixing many symbol types
        ("[17O@@H1-2][#C]", "[17O@@H1-2]C"),
    ]
    for selfies_in, smiles_out in cases:
        assert decode_eq(selfies_in, smiles_out)

    sf.set_semantic_constraints()
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly."""
    updated = sf.get_semantic_constraints()
    updated['Sn+4'] = 1
    updated['O-2'] = 2
    sf.set_semantic_constraints(updated)

    # the following molecules don't make sense, but we use them to test
    # selfies. Hence, we can't verify them with RDKit
    cases = [
        ("[Sn++++expl][=C]", "[Sn++++]C"),
        ("[Sn+4expl][=C]", "[Sn+4]C"),
        ("[O--expl][#C]", "[O--]=C"),
        ("[O-2expl][#C]", "[O-2]=C"),
        # mixing many symbol types
        ("[17O@@H-2expl][#C]", "[17O@@H-2]C"),
    ]
    for selfies_in, smiles_out in cases:
        assert sf.decoder(selfies_in) == smiles_out

    sf.set_semantic_constraints()
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained."""
    f_branch = "[Branch1_1][C][F]"
    s = "[Xe-2expl]" + f_branch * 8

    # default '?' constraint lets [Xe-2] carry all eight substituents
    assert sf.decoder(s) == "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF"

    # change default semantic constraints
    relaxed = sf.get_semantic_constraints()
    relaxed['?'] = 2
    sf.set_semantic_constraints(relaxed)

    # now only one branch plus the chain atom can bond
    assert sf.decoder(s) == "[Xe-2](F)CF"

    sf.set_semantic_constraints()
def test_nop_symbol_decoder(max_len, hard_alphabet):
    """Tests that the '[nop]' symbol is decoded properly, i.e., it is
    always skipped over.
    """
    sf.set_semantic_constraints()

    symbols = list(hard_alphabet)
    symbols.remove('[nop]')

    for _ in range(1000):
        # create random SELFIES with and without [nop]
        length = random.randint(1, max_len)
        tokens = [random.choice(symbols) for _ in range(length)]
        tokens.extend(['[nop]'] * length)
        random.shuffle(tokens)

        with_nops = ''.join(tokens)
        without_nops = with_nops.replace('[nop]', '')

        assert sf.decoder(with_nops) == sf.decoder(without_nops)
def test_random_selfies_decoder(trials, max_len, hard_alphabet):
    """Tests if SELFIES that are generated by randomly stringing together
    symbols from the SELFIES alphabet are decoded into valid SMILES.
    """
    sf.set_semantic_constraints()  # re-set alphabet
    symbols = tuple(hard_alphabet)

    for _ in range(trials):
        # create random SELFIES and decode
        length = random.randint(1, max_len)
        rand_mol = ''.join(random.choices(symbols, k=length))
        smiles = sf.decoder(rand_mol)

        # check if SMILES is valid
        try:
            is_valid = MolFromSmiles(smiles, sanitize=True) is not None
        except Exception:
            is_valid = False

        assert is_valid, f"Invalid SMILES {smiles} decoded from {rand_mol}."
def roundtrip_translation():
    """Runs a SMILES -> SELFIES -> SMILES roundtrip over the configured
    test set, appending every failing input SMILES to a per-dataset log.
    """
    sf.set_semantic_constraints("hypervalent")

    # first pass over the data just counts entries for the progress bar
    n_entries = 0
    for chunk in make_reader():
        n_entries += len(chunk)
    pbar = tqdm(total=n_entries)

    reader = make_reader()
    curr_idx = 0

    # context manager guarantees the log is closed even if an iteration
    # raises (the original only closed it on the happy path)
    with open(ERROR_LOG_DIR / f"{TEST_SET_PATH.stem}.txt", "a+") as error_log:
        for chunk_idx, chunk in enumerate(reader):
            for in_smiles in chunk[args.col_name]:
                pbar.update(1)
                curr_idx += 1
                if curr_idx < args.start_from:
                    continue

                in_smiles = in_smiles.strip()

                # skip unparseable molecules and wildcard atoms
                mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
                if (mol is None) or ("*" in in_smiles):
                    continue

                try:
                    selfies = sf.encoder(in_smiles, strict=True)
                    out_smiles = sf.decoder(selfies)
                except (sf.EncoderError, sf.DecoderError):
                    error_log.write(in_smiles + "\n")
                    tqdm.write(in_smiles)
                    continue

                if not is_same_mol(in_smiles, out_smiles):
                    error_log.write(in_smiles + "\n")
                    tqdm.write(in_smiles)
def reset_alphabet():
    """Installs a custom, relaxed set of semantic bond constraints."""
    custom_constraints = {
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    }
    sf.set_semantic_constraints(custom_constraints)
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """
    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints.update({'N': 6, 'Br': 7, 'Cl': 7, 'I': 7})
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    if os.path.exists(ckpt_path):
        # resume from a previous checkpoint
        with open(ckpt_path, 'r') as ckpt_file:
            checkpoint = int(ckpt_file.readlines()[0])
    else:
        # no checkpoint: create fresh directories for logging/checkpoints
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        os.makedirs(os.path.dirname(error_path), exist_ok=True)
        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        checkpoint = -1

    error_list = []
    error_found_flag = False

    # make pandas reader
    reader = pd.read_csv(EMOL_PATH, chunksize=10000, compression='gzip',
                         delimiter=' ', header=0)

    # roundtrip testing
    for chunk_idx, chunk in enumerate(reader):
        if chunk_idx <= checkpoint:
            continue

        for in_smiles in chunk[COL_NAME]:
            # all inputted SMILES must be valid RDKit Mol objects to be
            # encoded; skip anything unparseable or containing wildcards
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            selfies = sf.encoder(in_smiles)
            if selfies is None:
                # unable to encode SMILES: record it as an error
                error_list.append((in_smiles, ''))
                continue

            out_smiles = sf.decoder(selfies)

            # a decoded SMILES that is not the same molecule is an error
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_emolecule.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")

        error_found_flag = error_found_flag or error_list
        error_list = []

        # checkpoint the current chunk index, to load from and continue
        with open(ckpt_path, 'w+') as ckpt_file:
            ckpt_file.write(str(chunk_idx))

    sf.set_semantic_constraints()  # restore defaults
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    :param test_name: stem of the test-set file in 'test_sets'.
    :param column_name: name of the SMILES column in the test set.
    :param dataset_samples: number of rows to sample; a value outside
        (0, N] means "use all N rows".
    """
    # modify semantic bond constraints
    sf.set_semantic_constraints({
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    })

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir, 'error_sets',
                              "errors_{}.csv".format(test_name))

    # create error directory
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    error_list = []

    # add header in error log text file
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_found_flag = False

    # count data rows with a context manager (bug fix: the original
    # leaked an unclosed open(test_path) handle here)
    with open(test_path) as f:
        N = sum(1 for _ in f) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N
    # rows NOT sampled are skipped by the reader (row 0 is the header)
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    # roundtrip testing
    for chunk in reader:
        for in_smiles in chunk[column_name]:
            # all inputted SMILES must be valid RDKit Mol objects to be
            # encoded; skip anything unparseable or containing wildcards
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode SELFIES string
            selfies = sf.encoder(in_smiles)
            if selfies is None:
                # unable to encode SMILES: record it as an error
                error_list.append((in_smiles, ''))
                continue

            # take encoded SELFIES and decode
            out_smiles = sf.decoder(selfies)

            # compare original SMILES to decoded SELFIES string;
            # if not the same molecule, write to list of errors
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, str(out_smiles)))

        # open and write all errors to errors_{test_name}.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")

        error_found_flag = error_found_flag or error_list
        error_list = []

    sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag