def test_untokenise_smiles_with_simplify_rings(self, smiles_str, split, tokens):
    """untokenise_smiles rebuilds the SMILES string when simplify_rings is enabled."""
    token_list = ['?', 'C', '1', '2', '%12', '%']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       splitting_method=split,
                                       simplify_rings=True)
    smiles_result = tokeniser.untokenise_smiles(tokens)
    assert smiles_result == smiles_str

def test_tokenise_smiles_all_tokens(self, smiles_str, tokens):
    """tokenise_smiles splits a SMILES string when every token it needs is in the vocabulary."""
    token_list = [
        '?', 'C', 'N', 'O', 'Cl', 'Br', '[Pt]', '(', ')', '=',
        '1', '2', '3', '%10'
    ]
    tokeniser = smiles.SmilesTokeniser(token_list)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_with_sos_eos(self, smiles_str, token_sos, token_eos, tokens):
    """Configured start- and end-of-sequence tokens are added around the token sequence."""
    token_list = ['C', 'N', 'O', 'Cl', 'Br', '<SOS>', '<EOS>']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       token_sos=token_sos,
                                       token_eos=token_eos)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_with_simplify_rings(self, smiles_str, split, tokens):
    """simplify_rings renumbers ring-closure labels so only single-digit tokens are needed."""
    token_list = ['?', 'C', '1', '2', '3']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       splitting_method=split,
                                       simplify_rings=True)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_with_padding(self, smiles_str, token_pad, length, truncate, tokens):
    """Token sequences are padded to sequence_length and optionally truncated when too long."""
    token_list = ['C', 'N', 'O', 'Cl', 'Br', '_', ' ']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       token_padding=token_pad,
                                       sequence_length=length,
                                       truncate_sequence=truncate)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_with_unknown_placeholder(self, smiles_str, split, placeholder, tokens):
    """Unknown substrings map to the token_unknown placeholder and are recorded in missing_tokens."""
    token_list = ['C', 'Br', 'B', 'r', '?', '[Pt]', '[', 'P', 't', ']']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       splitting_method=split,
                                       token_unknown=placeholder)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens
    assert tokeniser.missing_tokens == {'S', 'N'}

def test_tokenise_smiles_characters(self, smiles_str, tokens):
    """splitting_method='characters' tokenises the string one character at a time."""
    token_list = [
        '?', 'C', 'N', 'O', 'l', 'B', '[', 'P', 't', ']', '(', ')',
        '=', '1', '2', '3', '%', '0', 'r'
    ]
    tokeniser = smiles.SmilesTokeniser(token_list, splitting_method='characters')
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_halogens_only(self, smiles_str, tokens):
    """splitting_method='halogens_only' keeps two-character halogens (Cl, Br) intact but splits everything else into characters."""
    token_list = [
        '?', 'C', 'N', 'O', 'Cl', 'Br', '[', 'P', 't', ']', '(', ')',
        '=', '1', '2', '3', '%', '0'
    ]
    tokeniser = smiles.SmilesTokeniser(token_list, splitting_method='halogens_only')
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_tokenise_smiles_with_pad_sos_eos(self, smiles_str, token_sos, token_eos, tokens):
    """SOS/EOS tokens and padding to a fixed sequence_length can be combined."""
    token_list = ['C', 'N', 'O', 'Cl', 'Br', '<SOS>', '<EOS>', '_']
    tokeniser = smiles.SmilesTokeniser(token_list,
                                       token_sos=token_sos,
                                       token_eos=token_eos,
                                       token_padding='_',
                                       sequence_length=7)
    tokens_result = tokeniser.tokenise_smiles(smiles_str)
    assert tokens_result == tokens

def test_exception_if_seq_length_not_padding(self):
    """Setting sequence_length without token_padding raises a ValueError."""
    token_list = ['?', 'C', '1', '2', '%12', '%']
    with pytest.raises(ValueError):
        _ = smiles.SmilesTokeniser(token_list, sequence_length=50)

def test_exception_if_padding_not_seq_length(self):
    """Setting token_padding without sequence_length raises a ValueError."""
    token_list = ['?', 'C', '1', '2', '%12', '%']
    with pytest.raises(ValueError):
        _ = smiles.SmilesTokeniser(token_list, token_padding='?')
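
# A minimal round-trip sketch of the SmilesTokeniser API exercised above,
# kept as a comment so it is not collected by pytest. The vocabulary and
# SMILES string here are hypothetical examples, not taken from the test data:
#
#     tokeniser = smiles.SmilesTokeniser(['C', 'O', '=', '1'])
#     tokens = tokeniser.tokenise_smiles('C1=CC=CO1')
#     assert tokeniser.untokenise_smiles(tokens) == 'C1=CC=CO1'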