def get_selfie_and_smiles_encodings_for_dataset(smiles_list):
    """
    Returns encodings, alphabets and the lengths of the largest molecule
    in SMILES and SELFIES, given a list of SMILES molecules.

    input: list of SMILES strings.
    output:
        - selfies encoding
        - selfies alphabet
        - longest selfies string
        - smiles encoding (equivalent to the input list)
        - smiles alphabet (character based)
        - longest smiles string
    """
    # originally read from a csv file whose column had to be named 'smiles':
    # df = pd.read_csv(file_path)
    # smiles_list = np.asanyarray(df.smiles)

    smiles_alphabet = list(set(''.join(smiles_list)))
    smiles_alphabet.append(' ')  # for padding

    largest_smiles_len = len(max(smiles_list, key=len))

    print('--> Translating SMILES to SELFIES...')
    selfies_list = list(map(sf.encoder, smiles_list))

    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')  # padding symbol
    selfies_alphabet = list(all_selfies_symbols)

    largest_selfies_len = max(sf.len_selfies(s) for s in selfies_list)

    print('Finished translating SMILES to SELFIES.')

    return selfies_list, selfies_alphabet, largest_selfies_len, \
        smiles_list, smiles_alphabet, largest_smiles_len
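A minimal usage sketch for the function above, assuming `selfies` is imported as `sf` and the function is in scope; the SMILES strings are illustrative placeholders, not part of the original snippet:

import selfies as sf

# toy SMILES input, for illustration only
smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']

(selfies_list, selfies_alphabet, largest_selfies_len,
 smiles_list, smiles_alphabet, largest_smiles_len) = \
    get_selfie_and_smiles_encodings_for_dataset(smiles)

print(largest_selfies_len)       # symbol count of the longest SELFIES
print(sorted(selfies_alphabet))  # SELFIES symbols, including '[nop]' for padding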
def test_get_alphabet_from_selfies(dataset):
    entries, (vocab_stoi, _, _) = dataset
    selfies = [entry.selfies for entry in entries]

    alphabet = sf.get_alphabet_from_selfies(selfies)
    alphabet.add("[nop]")
    alphabet.add(".")

    assert alphabet == set(vocab_stoi.keys())
def get_selfies_alphabet(smiles_list):
    """Returns a sorted list of all SELFIES tokens required to build a
    SELFIES string for each molecule."""
    selfies_list = list(map(sf.encoder, smiles_list))
    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')
    selfies_alphabet = list(all_selfies_symbols)
    selfies_alphabet.sort()
    return selfies_alphabet
def main(args):
    # Load Tokenizer
    print("Loading SELFIES in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)

    print("Extracting alphabet from SELFIES samples")
    print("The longest sample in the dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))

    alphabet = sf.get_alphabet_from_selfies(selfies)
    # add special tokens on top of the symbols found in the dataset
    alphabet.add('[start]')
    alphabet.add('[end]')
    alphabet.add('[pad]')
    alphabet.add('[unk]')
    alphabet = list(alphabet)

    symbol_to_idx = {s: i for i, s in enumerate(alphabet)}
    idx_to_symbol = {i: s for i, s in enumerate(alphabet)}

    # write one symbol per line, in index order
    with open(args.output_file, 'w') as w:
        for i in range(len(idx_to_symbol)):
            w.write("{}\n".format(idx_to_symbol[i]))
    print("Alphabet written")
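A hedged companion sketch, not part of the original script: it reads the alphabet file written by main() back into a symbol-to-index map and uses sf.split_selfies to turn a SELFIES string into integer indices. The names encode_selfies and vocab.txt are illustrative assumptions.

import selfies as sf

def encode_selfies(selfies_string, vocab_path='vocab.txt'):
    # rebuild the symbol -> index map from the line-delimited alphabet file
    with open(vocab_path) as f:
        symbol_to_idx = {line.strip(): i for i, line in enumerate(f)}
    unk = symbol_to_idx['[unk]']
    # map each SELFIES symbol to its index, falling back to '[unk]'
    return [symbol_to_idx.get(symbol, unk)
            for symbol in sf.split_selfies(selfies_string)]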
def __init__(self, selfies=None, selfies_file=None, vocab_file=None):
    """
    Can be initiated from either a list of SELFIES, or a line-delimited
    SELFIES file.

    Args:
        selfies (list): the complete set of SELFIES that constitute the
          training dataset
        selfies_file (string): line-delimited file containing the complete
          set of SELFIES that constitute the training dataset
        vocab_file (string): line-delimited file containing all tokens to
          be used in the vocabulary
    """
    if vocab_file is not None:
        # read tokens from file, and add to vocabulary
        all_chars = read_smiles(vocab_file)
        # prevent chain() from popping open multi-character tokens
        self.characters = list(set(chain(*[[char] for char in all_chars])))
    else:
        # read SELFIES
        if selfies is not None:
            self.selfies = selfies
        elif selfies_file is not None:
            self.selfies = read_smiles(selfies_file)
        else:
            raise ValueError("must provide SELFIES list or file to "
                             "instantiate Vocabulary")

        # tokenize all SELFIES in the input and add all tokens to vocabulary
        alphabet = sorted(list(sf.get_alphabet_from_selfies(self.selfies)))
        self.characters = alphabet
        # add padding token
        self.characters.append('<PAD>')
        # add SOS/EOS tokens
        self.characters.append('SOS')
        self.characters.append('EOS')

    # create dictionaries mapping tokens to indices and back
    self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
    self.reverse_dictionary = {value: key for key, value in
                               self.dictionary.items()}
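A minimal sketch of how this constructor might be called, assuming the enclosing class is named Vocabulary (as the error message suggests); the two SELFIES strings are placeholders:

# illustrative only: a tiny SELFIES list standing in for the training set
vocab = Vocabulary(selfies=['[C][O][C]', '[F][C][F]'])

print(vocab.characters)         # sorted SELFIES symbols plus '<PAD>', 'SOS', 'EOS'
print(vocab.dictionary['[C]'])  # integer index assigned to the '[C]' symbol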
import os
import argparse

import selfies as sf

# toy dataset used to demonstrate alphabet extraction
dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
alphabet = sf.get_alphabet_from_selfies(dataset)


def load_selfies(directory):
    selfies = []
    for filename in os.listdir(directory):
        if filename.endswith(".selfies"):
            print("Found file {}. Loading samples".format(filename))
            with open(os.path.join(directory, filename), 'r') as f:
                for l in f:
                    l = l.strip()
                    selfies.append(l)
        else:
            continue
    print("All files in directory loaded.\nThere are {} molecules.".format(
        len(selfies)))
    return selfies


def main(args):
    # Load Tokenizer
    print("Loading SELFIES in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)

    print("Extracting alphabet from SELFIES samples")
    print("The longest sample in the dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))
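For the module-level toy dataset at the top of this snippet, the extracted alphabet is simply the set of symbols that occur across the four strings:

# alphabet computed from the module-level toy dataset above
assert alphabet == {'[C]', '[O]', '[F]', '[=O]'}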
def test_len_selfies(test_cases):
    for case in test_cases[0]:
        assert sf.len_selfies(case.selfies) == case.length


def test_split_selfies(test_cases):
    for case in test_cases[0]:
        assert list(sf.split_selfies(case.selfies)) == case.symbols


def test_get_alphabet_from_selfies(test_cases):
    case_list, (vocab_stoi, _, _) = test_cases
    selfies = [case.selfies for case in case_list]

    alphabet = sf.get_alphabet_from_selfies(selfies)
    alphabet.add("[nop]")
    alphabet.add(".")

    assert alphabet == set(vocab_stoi.keys())


def test_selfies_to_encoding(test_cases):
    case_list, (vocab_stoi, vocab_itos, pad_to_len) = test_cases

    for case in case_list:
        label, one_hot = sf.selfies_to_encoding(case.selfies,
                                                vocab_stoi,
                                                pad_to_len=pad_to_len,
                                                enc_type='both')
        assert label == case.label
def test_get_alphabet_from_selfies(test_cases, test_cases_alphabet):
    alphabet = sf.get_alphabet_from_selfies(test_cases.keys())
    assert alphabet == test_cases_alphabet