Example #1
import selfies as sf


def get_selfie_and_smiles_encodings_for_dataset(smiles_list):
    """
    Returns encodings, alphabets and the length of the largest molecule in
    SMILES and SELFIES, given a list of SMILES strings.
    input:
        smiles_list: list of SMILES strings (e.g. the 'smiles' column of a CSV).
    output:
        - selfies encoding
        - selfies alphabet
        - longest selfies string
        - smiles encoding (the input list, unchanged)
        - smiles alphabet (character based)
        - longest smiles string
    """

    # df = pd.read_csv(file_path)
    # smiles_list = np.asanyarray(df.smiles)
    smiles_alphabet = list(set(''.join(smiles_list)))
    smiles_alphabet.append(' ')  # for padding
    largest_smiles_len = len(max(smiles_list, key=len))

    print('--> Translating SMILES to SELFIES...')

    selfies_list = list(map(sf.encoder, smiles_list))

    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')
    selfies_alphabet = list(all_selfies_symbols)

    largest_selfies_len = max(sf.len_selfies(s) for s in selfies_list)

    print('Finished translating SMILES to SELFIES.')

    return selfies_list, selfies_alphabet, largest_selfies_len, \
           smiles_list, smiles_alphabet, largest_smiles_len
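A minimal usage sketch of the helper above, assuming a small in-memory SMILES list rather than a CSV file; the toy inputs and the commented results are illustrative.

# Hedged usage sketch; the toy SMILES list is illustrative.
smiles = ['CO', 'FCF', 'O=O']
(selfies_list, selfies_alphabet, largest_selfies_len,
 smiles_list, smiles_alphabet, largest_smiles_len) = \
    get_selfie_and_smiles_encodings_for_dataset(smiles)
# selfies_list        -> ['[C][O]', '[F][C][F]', '[O][=O]']
# selfies_alphabet    -> contains '[C]', '[O]', '[F]', '[=O]' and '[nop]'
# largest_selfies_len -> 3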
Example #2
def test_get_alphabet_from_selfies(dataset):
    entries, (vocab_stoi, _, _) = dataset

    selfies = [entry.selfies for entry in entries]
    alphabet = sf.get_alphabet_from_selfies(selfies)
    alphabet.add("[nop]")
    alphabet.add(".")

    assert alphabet == set(vocab_stoi.keys())
Example #3
import selfies as sf


def get_selfies_alphabet(smiles_list):
    """Returns a sorted list of all SELFIES tokens required to build a
    SELFIES string for each molecule."""

    selfies_list = list(map(sf.encoder, smiles_list))
    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')
    selfies_alphabet = list(all_selfies_symbols)
    selfies_alphabet.sort()
    return selfies_alphabet
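A hedged usage sketch of get_selfies_alphabet; the toy SMILES list is illustrative, and the expected output follows from sorting the SELFIES symbols plus '[nop]'.

# Hedged usage sketch; inputs and the expected output are illustrative.
print(get_selfies_alphabet(['CO', 'FCF']))
# ['[C]', '[F]', '[O]', '[nop]']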
Example #4
def main(args):
    # Load SELFIES samples and build the alphabet
    print("Loading SELFIES in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)
    print("Extracting alphabet from SELFIES samples")
    print("The longest sample in dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))
    alphabet = sf.get_alphabet_from_selfies(selfies)
    # add special tokens used by the sequence model
    alphabet.add('[start]')
    alphabet.add('[end]')
    alphabet.add('[pad]')
    alphabet.add('[unk]')
    alphabet = list(alphabet)
    symbol_to_idx = {s: i for i, s in enumerate(alphabet)}
    idx_to_symbol = {i: s for i, s in enumerate(alphabet)}
    with open(args.output_file, 'w') as w:
        for i in range(len(idx_to_symbol)):
            w.write("{}\n".format(idx_to_symbol[i]))
    print("Alphabet written")
Example #5
from itertools import chain

import selfies as sf


class Vocabulary:
    def __init__(self, selfies=None, selfies_file=None, vocab_file=None):
        """
        Can be initiated from either a list of SELFIES, or a line-delimited
        SELFIES file.

        Args:
            selfies (list): the complete set of SELFIES that constitute the
              training dataset
            selfies_file (string): line-delimited file containing the complete
              set of SELFIES that constitute the training dataset
            vocab_file (string): line-delimited file containing all tokens to
              be used in the vocabulary
        """
        if vocab_file is not None:
            # read tokens from file, and add to vocabulary
            all_chars = read_smiles(vocab_file)
            # wrap each token in its own list so chain() does not split
            # multi-character tokens into single characters
            self.characters = list(set(chain(*[[char] for char in all_chars])))
        else:
            # read SELFIES from the given list or file
            if selfies is not None:
                self.selfies = selfies
            elif selfies_file is not None:
                self.selfies = read_smiles(selfies_file)
            else:
                raise ValueError("must provide SELFIES list or file to" + \
                                 " instantiate Vocabulary")
            # tokenize all SELFIES in the input and add all tokens to the vocabulary
            alphabet = sorted(list(sf.get_alphabet_from_selfies(self.selfies)))
            self.characters = alphabet

        # add padding token
        self.characters.append('<PAD>')
        # add SOS/EOS tokens
        self.characters.append('SOS')
        self.characters.append('EOS')
        # create dictionaries
        self.dictionary = {key: idx for idx, key in enumerate(self.characters)}
        self.reverse_dictionary = {value: key for key, value in \
                                   self.dictionary.items()}
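A hedged usage sketch for the constructor above, assuming the enclosing class is named Vocabulary (as the error message suggests) and that a SELFIES list is passed directly; the inputs are illustrative.

# Hedged usage sketch; inputs are illustrative.
vocab = Vocabulary(selfies=['[C][O][C]', '[F][C][F]'])
ids = [vocab.dictionary[t] for t in sf.split_selfies('[C][O][C]')]
back = ''.join(vocab.reverse_dictionary[i] for i in ids)  # '[C][O][C]'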
Example #6
import os
import argparse
import selfies as sf

dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
alphabet = sf.get_alphabet_from_selfies(dataset)
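# Note: for the toy dataset above, sf.get_alphabet_from_selfies collects every
# symbol that appears, i.e. {'[C]', '[O]', '[F]', '[=O]'}; '[nop]' is not added
# automatically and must be appended by the caller if needed for padding.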


def load_selfies(directory):
    selfies = []
    for filename in os.listdir(directory):
        if filename.endswith(".selfies"):
            print("Found file {}. Loading samples".format(filename))
            with open(os.path.join(directory, filename), 'r') as f:
                for l in f:
                    l = l.strip()
                    selfies.append(l)
    print("All files in directory loaded.\n There are {} molecules.".format(
        len(selfies)))
    return selfies


def main(args):
    # Load SELFIES samples and build the alphabet
    print("Loading SELFIES in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)
    print("Extracting alphabet from SELFIES samples")
    print("The longest sample in dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))
Example #7
def test_len_selfies(test_cases):
    for case in test_cases[0]:
        assert sf.len_selfies(case.selfies) == case.length


def test_split_selfies(test_cases):
    for case in test_cases[0]:
        assert list(sf.split_selfies(case.selfies)) == case.symbols


def test_get_alphabet_from_selfies(test_cases):
    case_list, (vocab_stoi, _, _) = test_cases

    selfies = [case.selfies for case in case_list]
    alphabet = sf.get_alphabet_from_selfies(selfies)
    alphabet.add("[nop]")
    alphabet.add(".")

    assert alphabet == set(vocab_stoi.keys())


def test_selfies_to_encoding(test_cases):
    case_list, (vocab_stoi, vocab_itos, pad_to_len) = test_cases

    for case in case_list:
        label, one_hot = sf.selfies_to_encoding(case.selfies,
                                                vocab_stoi,
                                                pad_to_len=pad_to_len,
                                                enc_type='both')
        assert label == case.label
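For reference, a hedged, self-contained sketch of sf.selfies_to_encoding with a toy vocabulary (not the project's fixtures); the vocabulary and inputs are illustrative.

import selfies as sf

# '[nop]' must be present in the vocabulary because it is used for padding.
vocab_stoi = {'[nop]': 0, '[C]': 1, '[O]': 2}
labels, one_hot = sf.selfies_to_encoding('[C][O]', vocab_stoi,
                                         pad_to_len=3, enc_type='both')
# labels  -> [1, 2, 0]
# one_hot -> [[0, 1, 0], [0, 0, 1], [1, 0, 0]]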
Example #8
def test_get_alphabet_from_selfies(test_cases, test_cases_alphabet):
    alphabet = sf.get_alphabet_from_selfies(test_cases.keys())

    assert alphabet == test_cases_alphabet
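The fixtures used above are project-specific; below is a hedged sketch of minimal pytest fixtures that would satisfy this last test, where the concrete SELFIES strings and expected alphabet are illustrative assumptions.

import pytest
import selfies as sf


@pytest.fixture
def test_cases():
    # keys are SELFIES strings; values could hold per-case expectations
    return {'[C][O][C]': None, '[F][C][F]': None}


@pytest.fixture
def test_cases_alphabet():
    return {'[C]', '[O]', '[F]'}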