def test_kekulize_parser(test_name, column_name, dataset_samples):
    """Tests the kekulization of SMILES, which is the first step of
    selfies.encoder().

    The data set is read from ``test_sets/<test_name>.txt`` next to this
    file. Every SMILES whose kekulized form is not judged equivalent by
    ``is_same_mol`` is logged, one per line, to
    ``error_sets/errors_kekulize_<test_name>.csv``. The test fails if at
    least one such SMILES was found.

    :param test_name: base name (without ``.txt``) of the data set file.
        Presumably supplied by pytest fixtures/parametrization -- confirm
        against the test configuration.
    :param column_name: name of the column of the data set that holds the
        SMILES strings.
    :param dataset_samples: number of rows to randomly sample from the data
        set. If it is not in the range ``(0, N]``, where ``N`` is the number
        of data rows, all rows are used.
    """

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir, 'error_sets',
                              "errors_kekulize_{}.csv".format(test_name))

    # (re)create the error log with just its header line
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    with open(error_path, "w+") as error_log:
        error_log.write("In\n")
    error_found_flag = False

    # make pandas reader
    # count the data rows (all lines minus the header); the file is opened
    # in a with-block so the handle is closed deterministically
    with open(test_path) as test_file:
        N = sum(1 for _ in test_file) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N

    # randomly choose the N - S data rows to skip (row 0 is the header)
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path,
                         chunksize=10000,
                         header=0,
                         skiprows=skip)

    # kekulize testing
    for chunk in reader:
        error_list = []

        for smiles in chunk[column_name]:

            # skip entries that MolFromSmiles cannot parse and entries
            # containing the wildcard symbol
            if (MolFromSmiles(smiles) is None) or ('*' in smiles):
                continue

            # build kekulized SMILES, one dot-separated fragment at a time
            kekule_fragments = []

            for fragment in smiles.split("."):

                kekule_gen = kekulize_parser(_parse_smiles(fragment))

                k = []
                for bond, symbol, symbol_type in kekule_gen:
                    # the bond attached to a branch symbol is not emitted
                    k.append('' if symbol_type == BRANCH_TYPE else bond)

                    # two-character ring numbers are written as %nn
                    if symbol_type == RING_TYPE and len(symbol) == 2:
                        k.append('%')
                    k.append(symbol)

                kekule_fragments.append(''.join(k))

            kekule_smiles = '.'.join(kekule_fragments)

            if not is_same_mol(smiles, kekule_smiles):
                error_list.append(smiles)

        # flush this chunk's failures; every entry is newline-terminated so
        # that entries of consecutive chunks never share a line
        if error_list:
            with open(error_path, "a") as error_log:
                error_log.writelines(smiles + "\n" for smiles in error_list)
            error_found_flag = True

    assert not error_found_flag
def _translate_smiles(smiles: str) -> str:
    """Translates a single dot-free SMILES into SELFIES; this is a helper
    for ``selfies.encoder``.

    :param smiles: the SMILES to be translated (assumed to contain no dots).
    :return: the SELFIES translation of ``smiles``.
    :raises ValueError: if ring entries are still pending once the
        derivation has finished.
    """

    tokens = _parse_smiles(smiles)

    # only route the token stream through the kekulizer when at least one
    # of these lowercase characters occurs somewhere in the input
    if not set(smiles).isdisjoint("cnopas"):
        tokens = kekulize_parser(tokens)

    # bookkeeping for the rings that still have to be closed: when ring id X
    # joins the i-th and j-th derived atoms (i < j) with bond symbol s, the
    # entry pending_rings[X] = (s, i) is stored once atom i is derived and is
    # removed again (and the ring made) as soon as atom j is derived.
    pending_rings = {}

    # one-element list used as a mutable counter of the atoms derived so far
    atom_counter = [0]

    selfies, _ = _translate_smiles_derive(tokens, pending_rings, atom_counter)

    # any leftover entry means some ring was opened but never closed
    if not pending_rings:
        return selfies

    raise ValueError("malformed ring numbering or ring numbering "
                     "across a dot symbol")
def time_kekulize(file_path: str, sample_size: int = -1) -> None:
    """Times the selfies kekulization against the RDKit kekulization on the
    SMILES stored in a file, and prints both durations.

    :param file_path: path of the data file, resolved relative to the
        directory of this module. The first line of the file is discarded
        (treated as a header); every following line is used as one SMILES.
    :param sample_size: number of SMILES to randomly sample from the file.
        If it is non-positive, or not smaller than the number of SMILES
        available, all SMILES are used.
    :return: ``None``; the timings are printed to standard output.
    """

    curr_dir = os.path.dirname(__file__)
    file_path = os.path.join(curr_dir, file_path)

    # load data
    with open(file_path, 'r') as file:
        smiles = [line.rstrip() for line in file]

    # drop the header line; slicing (unlike pop) tolerates an empty file
    smiles = smiles[1:]

    # random.sample raises ValueError when asked for more items than exist,
    # so only sample when the request is a strict subset of the data
    if 0 < sample_size < len(smiles):
        smiles = random.sample(smiles, sample_size)

    print(f"Timing Kekulization of {len(smiles)} SMILES from {file_path}")

    # time selfies kekulization
    # perf_counter is monotonic, which makes it the appropriate clock for
    # measuring elapsed intervals
    start = time.perf_counter()
    for s in smiles:
        # kekulize_parser's result is consumed with list() so that the
        # whole kekulization is actually carried out inside the timed loop
        list(kekulize_parser(_parse_smiles(s)))
    selfies_time = time.perf_counter() - start
    print(f"--> selfies kekulize: {selfies_time:0.7f}s")

    # time RDKit kekulization
    start = time.perf_counter()
    for s in smiles:
        m = MolFromSmiles(s)
        Kekulize(m)
        MolToSmiles(m, kekuleSmiles=True)
    rdkit_time = time.perf_counter() - start
    print(f"--> RDKit kekulize: {rdkit_time:0.7f}s")