def parses(s):
    """Report whether a SMILES string can be turned into the dictionary format

    The string is first passed through a graph round trip
    (``convert_smiles_to_nx`` then ``convert_nx_to_smiles``), and the
    resulting string is then converted to a graph and on to a dictionary
    using the ``atom_types`` and ``bond_types`` from the enclosing scope.

    Args:
        s: SMILES string to check
    Returns:
        ``True`` if the conversion succeeds, ``False`` if it raises ``ValueError``
    """
    # The round trip is deliberately outside the ``try``: errors here propagate
    round_tripped = convert_nx_to_smiles(convert_smiles_to_nx(s))

    try:
        convert_nx_to_dict(convert_smiles_to_nx(round_tripped), atom_types, bond_types)
    except ValueError:
        return False
    return True
def compute_ic50(gen: Iterator[List[dict]], model: tf.keras.Model,
                 atom_types: List[int], bond_types: List[str]) -> Iterator[List[dict]]:
    """Compute the IC50 of each chunk of molecules

    Args:
        gen: Iterator over chunks; each chunk is a list of dictionaries with a ``smiles`` entry
        model: Model used for inference through ``predict_on_batch``
        atom_types: Atom types passed to ``convert_nx_to_dict``
        bond_types: Bond types passed to ``convert_nx_to_dict``
    Yields:
        Each input chunk, modified in place: every molecule that could be converted
        gains a ``pIC50_mpnn`` entry. Molecules whose conversion raises
        ``AssertionError`` are left untouched.
    """
    for chunk in gen:
        # Get the features for each molecule
        batch = []
        tested_mols = []  # Positions within the chunk of the molecules that converted
        for i, entry in enumerate(chunk):
            graph = convert_smiles_to_nx(entry['smiles'])
            try:
                graph_dict = convert_nx_to_dict(graph, atom_types, bond_types)
            except AssertionError:
                continue
            batch.append(graph_dict)
            tested_mols.append(i)

        # Nothing could be featurized (empty chunk, or every molecule failed):
        #  pass the chunk through unchanged rather than failing on ``batch[0]``
        if not batch:
            yield chunk
            continue

        # Prepare in input format
        batch_dict = {
            k: np.concatenate([np.atleast_1d(b[k]) for b in batch], axis=0)
            for k in batch[0].keys()
        }
        inputs = combine_graphs(batch_dict)

        # Compute the IC50
        ic50 = model.predict_on_batch(inputs).numpy()[:, 0]

        # Store it in the chunk data
        for i, v in zip(tested_mols, ic50):
            chunk[i]['pIC50_mpnn'] = v
        yield chunk
def update_actions(self, new_state: nx.Graph, allowed_space: Space):
    """Refresh the list of available actions for a new state

    Each action is described by the molecule it would produce.

    Args:
        new_state (nx.Graph): Molecule used to define the action space
        allowed_space (Space): Space of possible observations; only candidate
            molecules contained in it are kept
    """
    # Record the state before computing anything else
    self._state = new_state

    # Enumerate the molecules reachable from the new state, as SMILES strings
    current_smiles = convert_nx_to_smiles(new_state)
    candidates = get_valid_actions(
        current_smiles,
        atom_types=self.atom_types,
        allow_removal=self.allow_removal,
        allow_no_modification=self.allow_no_modification,
        allowed_ring_sizes=self.allowed_ring_sizes,
        allow_bonds_between_rings=self.allow_bonds_between_rings,
        max_molecule_size=self.max_molecule_size)

    # Keep only the candidates inside the desired space, converted to graphs
    permitted = []
    for candidate in candidates:
        if candidate in allowed_space:
            permitted.append(convert_smiles_to_nx(candidate))
    self._valid_actions = permitted
def test_pickle(model, atom_types, bond_types):
    """The reward function gives the same answer after a pickle round trip"""
    # Build the reward and call it once before serializing
    original = MPNNReward(model, atom_types, bond_types)
    mol = convert_smiles_to_nx('CCC')
    original(mol)

    # Round-trip the reward through pickle
    restored = pkl.loads(pkl.dumps(original))

    # Both copies must agree on the same molecule
    expected = original(mol)
    actual = restored(mol)
    assert isclose(expected, actual, abs_tol=1e-6)
def test_reward(model, oneshot_model, atom_types, bond_types, target_mols):
    """The logistic combination of two rewards yields a float between 0 and 3"""
    combined = LogisticCombination(
        MPNNReward(model, atom_types, bond_types),
        OneShotScore(oneshot_model, atom_types, bond_types, target_mols),
    )

    score = combined(convert_smiles_to_nx('C'))

    assert isinstance(score, float)
    assert 0 < score < 3
def __init__(self, smiles: List[str], atom_types: List[int], bond_types: List[str],
             outputs: List[float], batch_size: int, shuffle: bool = True,
             random_state: int = None):
    """
    Args:
        smiles: List of molecules, as SMILES strings
        atom_types: List of known atom types
        bond_types: List of known bond types
        outputs: List of molecular outputs, paired in order with ``smiles``
        batch_size: Batch size setting; stored as ``self.batch_size``
        shuffle: Whether to shuffle the entries; stored as ``self.shuffle``
            and, if true, applied once here
        random_state: Seed for the random number generator used in shuffling
    """
    super(GraphLoader, self).__init__()

    # Turn each molecule into its MPNN-ready dictionary
    graph_dicts = []
    for smi in smiles:
        graph = convert_smiles_to_nx(smi)
        graph_dicts.append(convert_nx_to_dict(graph, atom_types, bond_types))

    # Pair each molecule with its output value
    paired = list(zip(graph_dicts, outputs))
    self.entries = np.array(paired)

    # Settings used later
    self.batch_size = batch_size
    self.shuffle = shuffle

    # The generator is always created; the initial shuffle is optional
    self.rng = np.random.RandomState(random_state)
    if shuffle:
        self.rng.shuffle(self.entries)
def evaluate_mpnn(model_msg: MPNNMessage, smiles: List[str],
                  atom_types: List[int], bond_types: List[str],
                  batch_size: int = 128) -> np.ndarray:
    """Run inference on a list of molecules

    Args:
        model_msg: Serialized version of the model
        smiles: List of molecules to evaluate
        atom_types: List of known atom types
        bond_types: List of known bond types
        batch_size: Maximum number of molecules placed in each batch
    Returns:
        Outputs of every batch, stacked vertically
    """
    # Clear the Keras session, then rebuild the model from the message
    tf.keras.backend.clear_session()
    model = model_msg.get_model()

    # Convert every SMILES string to its dictionary representation
    # TODO (wardlt): Use multiprocessing. Could benefit from a persistent Pool to avoid loading in TF many times
    graph_dicts = []
    for smi in smiles:
        graph_dicts.append(convert_nx_to_dict(convert_smiles_to_nx(smi), atom_types, bond_types))

    # Merge consecutive runs of at most ``batch_size`` molecules into batches
    n_mols = len(graph_dicts)
    batches = [
        _merge_batch(graph_dicts[lo:lo + batch_size])
        for lo in range(0, n_mols, batch_size)
    ]

    # Feed the batches through the MPNN, only after all batches are built
    outputs = []
    for batch in batches:
        outputs.append(model.predict_on_batch(batch))
    return np.vstack(outputs)
mpnn_dir = os.path.join('notebooks', 'mpnn-training') with open(os.path.join(mpnn_dir, 'atom_types.json')) as fp: atom_types = json.load(fp) with open(os.path.join(mpnn_dir, 'bond_types.json')) as fp: bond_types = json.load(fp) pt = GetPeriodicTable() if len(args.elements) == 0: elements = [pt.GetElementSymbol(i) for i in atom_types] else: elements = args.elements elements = [e for e in elements if MolFromSmiles(e) is not None] logger.info(f'Using {len(elements)} elements: {elements}') # Prepare the one-shot model. We the molecules to compare against and the comparison model with open(os.path.join('seed-molecules', 'top_100_pIC50.json')) as fp: comparison_mols = [convert_smiles_to_nx(s) for s in json.load(fp)] oneshot_dir = 'similarity' oneshot_model = load_model(os.path.join(oneshot_dir, 'oneshot_model.h5'), custom_objects=custom_objects) with open(os.path.join(oneshot_dir, 'atom_types.json')) as fp: os_atom_types = json.load(fp) with open(os.path.join(oneshot_dir, 'bond_types.json')) as fp: os_bond_types = json.load(fp) # Making all of the reward functions model = load_model(os.path.join(mpnn_dir, 'best_model.h5'), custom_objects=custom_objects) rewards = { 'logP': LogP(maximize=True),
'u0_atom': MPNNReward(model, atom_types, bond_types, maximize=False), } # Make the reward function if args.reward == 'u0_atom': reward = rewards['u0_atom'] else: raise ValueError(f'Reward function not defined: {args.reward}') run_params['maximize'] = reward.maximize # Set up environment action_space = MoleculeActions(elements, allow_removal=not args.no_backtrack) init_mol = args.initial_molecule if init_mol is not None: init_mol = convert_smiles_to_nx(init_mol) env = Molecule(action_space, reward=reward, init_mol=init_mol) logger.debug('using environment: %s' % env) # Setup agent agent = DQNFinalState(env, gamma=args.gamma, preprocessor=MorganFingerprints( args.fingerprint_size), batch_size=args.batch_size, epsilon=args.epsilon, q_network_dense=args.hidden_layers, epsilon_decay=args.epsilon_decay) # Make a test directory test_dir = os.path.join(
def test_mpnn_reward(model, atom_types, bond_types):
    """The MPNN reward of a molecule graph is a float"""
    scorer = MPNNReward(model, atom_types, bond_types)
    mol = convert_smiles_to_nx('CCC')

    value = scorer(mol)
    assert isinstance(value, float)
def test_reward(oneshot_model, atom_types, bond_types, target_mols):
    """The one-shot score of a molecule graph is a float"""
    scorer = OneShotScore(oneshot_model, atom_types, bond_types, target_mols)
    mol = convert_smiles_to_nx('CCC')

    value = scorer(mol)
    assert isinstance(value, float)
'ic50': MPNNReward(model, atom_types, bond_types, maximize=True), 'QED': QEDReward(), 'SA': SAScore(), 'cycles': CycleLength() } if __name__ == "__main__": # Parse the inputs parser = ArgumentParser() parser.add_argument("smiles_file") args = parser.parse_args() # Load in the molecules with open(args.smiles_file) as fp: mols = [x.strip() for x in fp] # Get only the molecules that parse with RDKit mols = [x for x in mols if MolFromSmiles(x) is not None] # Compute the reward function statistics for all the rewards stats = {} for name, reward in rewards.items(): data = [ reward(convert_smiles_to_nx(mol)) for mol in tqdm(mols, desc=name) ] stats[name] = {'mean': np.mean(data), 'scale': np.std(data)} # Save as a json file with open('reward_ranges.json', 'w') as fp: json.dump(stats, fp, indent=2)