def parses(s):
    """Check whether a SMILES string can be converted into the graph-dict format

    The string is first round-tripped through the networkx representation
    (SMILES -> graph -> SMILES) and the resulting string is then converted.
    ``atom_types`` and ``bond_types`` are taken from the enclosing scope.

    Args:
        s: SMILES string to test
    Returns:
        ``True`` if the conversion succeeds, ``False`` if it raises ``ValueError``
    """
    # The round trip happens outside the ``try``: errors raised here propagate
    round_tripped = convert_nx_to_smiles(convert_smiles_to_nx(s))
    try:
        convert_nx_to_dict(convert_smiles_to_nx(round_tripped), atom_types, bond_types)
    except ValueError:
        return False
    else:
        return True
Exemplo n.º 2
0
def compute_ic50(gen: Iterator[List[dict]], model: tf.keras.Model, atom_types: List[int],
                 bond_types: List[str]) -> Iterator[List[dict]]:
    """Compute the IC50 of each chunk of molecules

    Every entry whose SMILES string converts to a graph dictionary receives a
    ``pIC50_mpnn`` key holding the first output column of the model. Entries for
    which the conversion raises ``AssertionError`` are left untouched. A chunk
    in which no molecule could be converted (including an empty chunk) is
    yielded unchanged instead of failing.

    Args:
        gen: Iterator over chunks of entries; each entry is a dict with a ``smiles`` key
        model: Model used to make the predictions
        atom_types: List of known atom types
        bond_types: List of known bond types
    Yields:
        The same chunk objects, with the predictions stored in place
    """

    for chunk in gen:
        # Get the features for each molecule
        batch = []
        tested_mols = []  # Positions within the chunk of the molecules that converted
        for i, entry in enumerate(chunk):
            graph = convert_smiles_to_nx(entry['smiles'])
            try:
                graph_dict = convert_nx_to_dict(graph, atom_types, bond_types)
            except AssertionError:
                continue
            batch.append(graph_dict)
            tested_mols.append(i)

        # Nothing converted: there is no batch to build, so pass the chunk through as-is
        if not batch:
            yield chunk
            continue

        # Prepare in input format
        batch_dict = {
            k: np.concatenate([np.atleast_1d(b[k]) for b in batch], axis=0)
            for k in batch[0]
        }
        inputs = combine_graphs(batch_dict)

        # Compute the IC50
        ic50 = model.predict_on_batch(inputs).numpy()[:, 0]

        # Store it in the chunk data
        for i, v in zip(tested_mols, ic50):
            chunk[i]['pIC50_mpnn'] = v

        yield chunk
Exemplo n.º 3
0
    def __init__(self,
                 model: Model,
                 atom_types: List[int],
                 bond_types: List[str],
                 target_molecules: List[nx.Graph],
                 batch_size: int = 256,
                 maximize=True):
        """
        Args:
            model: Keras MPNN model for one-shot learning
            atom_types: List of known atomic types
            bond_types: List of known bond types
            target_molecules: Set of molecules to compare
            batch_size: Batch size used when grouping the target molecules
            maximize: Forwarded to the parent class; also selects the
                ``big_value`` given to the parent (0 if true, 1 otherwise)
        """
        fallback = 0 if maximize else 1
        super().__init__(model, atom_types, bond_types, maximize, big_value=fallback)

        # Featurize every target molecule, then group the results into batches
        featurized = [
            convert_nx_to_dict(mol, self.atom_types, self.bond_types)
            for mol in target_molecules
        ]
        self.batch_size = batch_size
        raw_batches = create_batches_from_objects(featurized, batch_size=batch_size)

        # Keep the number of targets and store each batch with "_l"-suffixed keys
        self.target_molecules_length = len(target_molecules)
        self.target_batches = [
            {f'{name}_l': value for name, value in raw.items()}
            for raw in raw_batches
        ]
Exemplo n.º 4
0
    def _call(self, graph: nx.Graph) -> float:
        """Run the model on a single molecule

        Args:
            graph: Molecule to evaluate
        Returns:
            ``self.big_value`` if the molecule has no bonds, otherwise the
            ``[0, 0]`` element of the model output as a Python float
        """
        features = convert_nx_to_dict(graph, self.atom_types, self.bond_types)

        # Molecules without bonds are not sent to the model
        if features['n_bond'] == 0:
            return self.big_value

        # Convert every value to a tensor and add "node_graph_indices",
        #  an all-zero vector with one entry per atom
        tensors = {name: tf.convert_to_tensor(value) for name, value in features.items()}
        tensors['node_graph_indices'] = tf.zeros((tensors['n_atom'], ))

        # Run the molecule as a batch
        prediction = self.model.predict_on_batch(tensors)
        return float(prediction[0, 0])
Exemplo n.º 5
0
    def _call(self, graph: nx.Graph) -> float:
        """Compare a molecule against the stored target batches

        Args:
            graph: Molecule to evaluate
        Returns:
            ``self.big_value`` if the molecule has no bonds, otherwise the largest
            model output over all batches. The running maximum starts at 0,
            so the result is never below 0.
        """
        features = convert_nx_to_dict(graph, self.atom_types, self.bond_types)
        if features['n_bond'] == 0:
            return self.big_value

        # Repeat the molecule once per target to form the "right side" batches
        right_batches = create_batches_from_objects(
            [features] * self.target_molecules_length, self.batch_size)

        # Pair each right-side batch ("_r" keys) with its left-side counterpart;
        #  left-side entries win if a key ever collides
        paired = [
            {**{f'{name}_r': value for name, value in right.items()}, **left}
            for left, right in zip(self.target_batches, right_batches)
        ]

        # Track the maximum prediction seen across the batches
        best = 0
        for model_inputs in paired:
            batch_max = self.model.predict_on_batch(model_inputs).numpy().max()
            best = max(batch_max, best)
        return float(best)  # Convert from float32 (not JSON-serializable)
Exemplo n.º 6
0
    def __init__(self,
                 smiles: List[str],
                 atom_types: List[int],
                 bond_types: List[str],
                 outputs: List[float],
                 batch_size: int,
                 shuffle: bool = True,
                 random_state: int = None):
        """Build the loader from SMILES strings and their target values

        Args:
            smiles: List of molecules
            atom_types: List of known atom types
            bond_types: List of known bond types
            outputs: List of molecular outputs, in the same order as ``smiles``
            batch_size: Batch size setting, stored as ``self.batch_size``
            shuffle: Whether to shuffle after each epoch
            random_state: Random state for the shuffling
        """

        super(GraphLoader, self).__init__()

        # Featurize each molecule into its MPNN-ready dictionary
        featurized = []
        for smi in smiles:
            featurized.append(convert_nx_to_dict(convert_smiles_to_nx(smi), atom_types, bond_types))

        # Pair each molecule with its output value
        paired = list(zip(featurized, outputs))
        self.entries = np.array(paired)

        # Settings used later
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Create the random number generator and do an initial shuffle if requested
        self.rng = np.random.RandomState(random_state)
        if shuffle:
            self.rng.shuffle(self.entries)
Exemplo n.º 7
0
def evaluate_mpnn(model_msg: MPNNMessage,
                  smiles: List[str],
                  atom_types: List[int],
                  bond_types: List[str],
                  batch_size: int = 128) -> np.ndarray:
    """Run inference on a list of molecules

    Args:
        model_msg: Serialized version of the model
        smiles: List of molecules to evaluate
        atom_types: List of known atom types
        bond_types: List of known bond types
        batch_size: Maximum number of molecules to merge into each batch
    Returns:
        Predicted value for each molecule, with the per-batch outputs stacked vertically
    """

    # Start from a clean Keras session, then rebuild the model from the message
    tf.keras.backend.clear_session()
    model = model_msg.get_model()

    # Featurize every SMILES string
    # TODO (wardlt): Use multiprocessing. Could benefit from a persistent Pool to avoid loading in TF many times
    featurized = [
        convert_nx_to_dict(convert_smiles_to_nx(smi), atom_types, bond_types)
        for smi in smiles
    ]

    # Slice the molecules into consecutive groups and merge each group into one batch
    batches = []
    for offset in range(0, len(featurized), batch_size):
        group = featurized[offset:offset + batch_size]
        batches.append(_merge_batch(group))

    # Feed the batches through the MPNN
    predictions = []
    for merged in batches:
        predictions.append(model.predict_on_batch(merged))
    return np.vstack(predictions)