def test_change_constraints_cache_clear():
    """Verify that installing new semantic constraints invalidates cached
    alphabets and decoder results, and that the defaults can be restored."""
    original_alphabet = sf.get_semantic_robust_alphabet()

    # Repeated calls under the same constraints yield the same alphabet.
    assert original_alphabet == sf.get_semantic_robust_alphabet()
    assert sf.decoder("[C][#C]") == "C#C"

    # Tighten the valence constraint on carbon and re-install.
    constraints = sf.get_semantic_constraints()
    constraints["C"] = 1
    sf.set_semantic_constraints(constraints)

    # The cached alphabet and decoding must both reflect the new constraints.
    assert sf.get_semantic_robust_alphabet() != original_alphabet
    assert sf.decoder("[C][#C]") == "CC"

    sf.set_semantic_constraints()  # restore the default constraints
def large_alphabet():
    """Return the semantic-robust alphabet extended with an explicit set of
    extra SELFIES symbols, as a list."""
    extra_symbols = [
        "[#Br]", "[#Branch1]", "[#Branch2]", "[#Branch3]", "[#C@@H1]",
        "[#C@@]", "[#C@H1]", "[#C@]", "[#C]", "[#Cl]", "[#F]", "[#H]",
        "[#I]", "[#NH1]", "[#N]", "[#O]", "[#P]", "[#Ring1]", "[#Ring2]",
        "[#Ring3]", "[#S]", "[/Br]", "[/C@@H1]", "[/C@@]", "[/C@H1]",
        "[/C@]", "[/C]", "[/Cl]", "[/F]", "[/H]", "[/I]", "[/NH1]", "[/N]",
        "[/O]", "[/P]", "[/S]", "[=Br]", "[=Branch1]", "[=Branch2]",
        "[=Branch3]", "[=C@@H1]", "[=C@@]", "[=C@H1]", "[=C@]", "[=C]",
        "[=Cl]", "[=F]", "[=H]", "[=I]", "[=NH1]", "[=N]", "[=O]", "[=P]",
        "[=Ring1]", "[=Ring2]", "[=Ring3]", "[=S]", "[Br]", "[Branch1]",
        "[Branch2]", "[Branch3]", "[C@@H1]", "[C@@]", "[C@H1]", "[C@]",
        "[C]", "[Cl]", "[F]", "[H]", "[I]", "[NH1]", "[N]", "[O]", "[P]",
        "[Ring1]", "[Ring2]", "[Ring3]", "[S]", "[\\Br]", "[\\C@@H1]",
        "[\\C@@]", "[\\C@H1]", "[\\C@]", "[\\C]", "[\\Cl]", "[\\F]",
        "[\\H]", "[\\I]", "[\\NH1]", "[\\N]", "[\\O]", "[\\P]", "[\\S]",
        "[nop]",
    ]
    symbols = sf.get_semantic_robust_alphabet()
    symbols.update(extra_symbols)
    return list(symbols)
def hard_alphabet():
    """Return a challenging alphabet of SELFIES symbols, combining the
    semantic-robust alphabet with legacy/explicit-style symbols."""
    legacy_symbols = [
        '[#Br]', '[#C@@Hexpl]', '[#C@@expl]', '[#C@Hexpl]', '[#C@expl]',
        '[#C]', '[#Cl]', '[#F]', '[#Hexpl]', '[#I]', '[#NHexpl]', '[#N]',
        '[#O]', '[#P]', '[#S]', '[/Br]', '[/C@@Hexpl]', '[/C@@expl]',
        '[/C@Hexpl]', '[/C@expl]', '[/C]', '[/Cl]', '[/F]', '[/Hexpl]',
        '[/I]', '[/NHexpl]', '[/N]', '[/O]', '[/P]', '[/S]', '[=Br]',
        '[=C@@Hexpl]', '[=C@@expl]', '[=C@Hexpl]', '[=C@expl]', '[=C]',
        '[=Cl]', '[=F]', '[=Hexpl]', '[=I]', '[=NHexpl]', '[=N]', '[=O]',
        '[=P]', '[=S]', '[Br]', '[Branch1_1]', '[Branch1_2]', '[Branch1_3]',
        '[Branch2_1]', '[Branch2_2]', '[Branch2_3]', '[Branch3_1]',
        '[Branch3_2]', '[Branch3_3]', '[C@@Hexpl]', '[C@@expl]',
        '[C@Hexpl]', '[C@expl]', '[C]', '[Cl]', '[Expl#Ring1]',
        '[Expl=Ring1]', '[F]', '[Hexpl]', '[I]', '[NHexpl]', '[N]', '[O]',
        '[P]', '[Ring1]', '[Ring2]', '[Ring3]', '[S]', '[\\Br]',
        '[\\C@@Hexpl]', '[\\C@@expl]', '[\\C@Hexpl]', '[\\C@expl]',
        '[\\C]', '[\\Cl]', '[\\F]', '[\\Hexpl]', '[\\I]', '[\\NHexpl]',
        '[\\N]', '[\\O]', '[\\P]', '[\\S]', '[epsilon]', '[nop]',
    ]
    symbols = sf.get_semantic_robust_alphabet()
    symbols.update(legacy_symbols)
    return list(symbols)
from rdkit import Chem
from selfies import encoder, decoder, get_semantic_robust_alphabet
import re
import random
import logging as logger

# Bare element/bond-prefix symbols ("C", "=O", ...) derived from the
# semantic-robust SELFIES alphabet by stripping the surrounding brackets.
ALPHABET = [symbol.strip("[]") for symbol in get_semantic_robust_alphabet()]

# follow these rules for which types of substitution should be allowed
# Maps a (possibly bond-prefixed) atom symbol to the symbols it may be
# substituted with during mutation. NOTE(review): the chemistry rationale for
# each entry is not shown here — presumably chosen to keep valences plausible.
ALLOWED_SUBS = {
    "C": ["N", "O", "H"],
    "=C": ["=N", "N", "O", "=O", "S"],
    "F": ["Cl", "Br", "I", "O", "N", "C", "H"],
    "Cl": ["F", "Br", "I", "O", "N", "C", "H"],
    "Br": ["Cl", "F", "I", "O", "N", "C", "H"],
    "I": ["Cl", "Br", "F", "O", "N", "C", "H"],
    "H": ["C", "O", "N", "S", "=C", "=O", "=S"],
    "O": ["S", "N", "=O", "=N", "C", "=C", "Cl", "F", "Br", "I", "H"],
    "=O": ["=S", "=N", "=C", "O"],
    "N": ["O", "C", "H"],
    "=N": ["=O", "O", "S", "=C", "C"],
    "#N": ["#C"],
    "S": ["O", "N", "C", "=O", "=N", "H"],
    "=S": ["=O", "=N", "=C", "O"],
    "#C": ["#N"]
}

def selfies_substitution(*, parent_smiles: str,
def mutate_selfie(selfie, max_molecules_len, write_fail_cases=False):
    """Return a mutated SELFIES string (only one mutation is performed).

    Mutations are retried until a valid molecule is obtained. With equal
    probability, one of the following is applied:
        1. Insert a random SELFIES symbol at a random position.
        2. Replace a random SELFIES symbol with another.
        3. Delete a random SELFIES symbol.

    Parameters:
        selfie (string): SELFIES string to be mutated.
        max_molecules_len (int): mutations of the SELFIES string are only
            accepted up to this length (in symbols).
        write_fail_cases (bool): if True, attempts after the first failure
            are recorded in "selfie_failure_cases.txt".

    Returns:
        selfie_mutated (string): mutated SELFIES string.
        smiles_canon (string): canonical SMILES of the mutated SELFIES.
    """
    valid = False
    fail_counter = 0
    chars_selfie = get_selfie_chars(selfie)

    while not valid:
        fail_counter += 1
        alphabet = list(selfies.get_semantic_robust_alphabet())  # 34 SELFIE characters
        choice_ls = [1, 2, 3]  # 1=Insert; 2=Replace; 3=Delete
        random_choice = np.random.choice(choice_ls, 1)[0]

        if random_choice == 1:
            # Insert a random symbol at a random location (including the end).
            random_index = np.random.randint(len(chars_selfie) + 1)
            random_character = np.random.choice(alphabet, size=1)[0]
            selfie_mutated_chars = (chars_selfie[:random_index]
                                    + [random_character]
                                    + chars_selfie[random_index:])
        elif random_choice == 2:
            # Replace the symbol at a random index. (Slicing with [:0] yields
            # [], so no special case is needed for index 0.)
            random_index = np.random.randint(len(chars_selfie))
            random_character = np.random.choice(alphabet, size=1)[0]
            selfie_mutated_chars = (chars_selfie[:random_index]
                                    + [random_character]
                                    + chars_selfie[random_index + 1:])
        elif random_choice == 3:
            # Delete the symbol at a random index.
            random_index = np.random.randint(len(chars_selfie))
            selfie_mutated_chars = (chars_selfie[:random_index]
                                    + chars_selfie[random_index + 1:])
        else:
            raise Exception('Invalid Operation trying to be performed')

        selfie_mutated = "".join(selfie_mutated_chars)
        sf = "".join(chars_selfie)

        try:
            smiles = decoder(selfie_mutated)
            mol, smiles_canon, done = sanitize_smiles(smiles)
            # Reject mutations that are too long or failed canonicalization.
            if len(selfie_mutated_chars) > max_molecules_len or smiles_canon == "":
                done = False
            valid = done
        except Exception:
            # Decoding/sanitization failed for this candidate; retry.
            valid = False

        # NOTE(review): this logs every attempt from the second one onward,
        # even an ultimately successful one — preserved from the original.
        if fail_counter > 1 and write_fail_cases:
            with open("selfie_failure_cases.txt", "a+") as f:
                f.write('Tried to mutate SELFIE: ' + str(sf)
                        + ' To Obtain: ' + str(selfie_mutated) + '\n')

    return (selfie_mutated, smiles_canon)
c='black', markersize=7, linewidth=3) # TARGETS clb.set_label('Joint Similarity', fontsize=10) ax.set_xlabel('LogP', fontsize=10) ax.set_ylabel('QED', fontsize=10) plt.xlim([-4, 8]) ax.grid(True) fig.tight_layout() # plt.savefig('./logP_v_QED_scatter.png', dpi=1000) plt.show() alphabet = list(selfies.get_semantic_robust_alphabet()) # 34 SELFIE characters max_len_random_struct = max([ len(get_selfie_chars(encoder(starting_smile))), len(get_selfie_chars(encoder(target_smile))) ]) min_len_random_struct = min([ len(get_selfie_chars(encoder(starting_smile))), len(get_selfie_chars(encoder(target_smile))) ]) num_samples = len(logP_path) random_selfies = [] for _ in range(num_samples): selfie = '' for i in range(random.randint(min_len_random_struct, max_len_random_struct)