Пример #1
0
def _run_through(product_path, regressor, mol_chef_wae, seq_to_smi_list_func, cuda_details,
                 batch_size=500):
    """Predict a '.'-joined reactant string for every product SMILES listed in a file.

    Each line of the file is read as one product SMILES string and converted to a graph. The graphs
    are pushed through ``regressor`` in batches to obtain latents, ``mol_chef_wae`` decodes the latents
    into sequences, and ``seq_to_smi_list_func`` turns each sequence into SMILES strings.

    Args:
        product_path: path of a text file; every line is treated as one product SMILES string.
        regressor: callable that takes a batched graph (already moved via ``to_torch``) and returns
            the latents for that batch.
        mol_chef_wae: object exposing ``decode_from_z_no_grad(latents)``; only the first element of
            its return value is used.
        seq_to_smi_list_func: callable mapping one decoded sequence (one row of the transposed numpy
            array) to an iterable of SMILES strings.
        cuda_details: forwarded unchanged to the batched graph's ``to_torch`` method.
        batch_size: number of graphs regressed and decoded together. Defaults to 500.

    Returns:
        A list with one string per product, in file order. Each string is the sorted SMILES strings
        of the predicted reactants joined by '.'.

    Raises:
        ValueError: if ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # == Get the products we want to retrosynthesize ==
    with open(product_path, 'r') as fo:
        products = [line.strip() for line in fo]

    # == Get graph representation of the ones we care about ==
    graphs = []
    for mol_smi in tqdm.tqdm(products, desc="creating graphs"):
        mol = rdkit_general_ops.get_molecule(mol_smi, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        mol_as_adj_list = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mol, am_to_indx_map)
        graph = atom_features_dataset.trfm_mol_as_adj_list_to_graph_as_adj_list_trsfm(mol_as_adj_list)
        graphs.append(graph)

    # == Now regress to latent space & run decoder ==
    # NOTE(review): nothing in this function disables gradient tracking around the regressor call --
    # confirm that the caller (or a decorator not visible here) does so.
    resultant_reactants = []
    for batch_start in tqdm.tqdm(range(0, len(graphs), batch_size), desc="to_z_and_then_bag"):
        graphs_of_batch = graphs[batch_start:batch_start + batch_size]
        # Merge the individual graphs of this batch into one batched graph object.
        graphs_of_batch = graphs_of_batch[0].concatenate(graphs_of_batch)
        graphs_of_batch = graphs_of_batch.to_torch(cuda_details)
        latents_ = regressor(graphs_of_batch)

        seq_, _ = mol_chef_wae.decode_from_z_no_grad(latents_)
        # Transposed so that each row handed to seq_to_smi_list_func is one predicted sequence
        # (presumably the decoder output is sequence-first -- verify against the decoder).
        predicted_seqs_batch_first_np = seq_.cpu().numpy().T
        for seq in predicted_seqs_batch_first_np:
            seq_as_mols = seq_to_smi_list_func(seq)
            # Sorting makes the string independent of the order the reactants were decoded in.
            resultant_reactants.append('.'.join(sorted(seq_as_mols)))

    return resultant_reactants
 def transform_text_to_qed(text_line):
     """Return the highest QED score among the molecules of a '.'-separated SMILES line.

     The line is split on '.', each piece is parsed with ``rdkit_general_ops.get_molecule``
     (``kekulize=False``), and ``QED.qed`` is evaluated on every parsed molecule.

     A line may describe several products, so the maximum is taken (this is what the optimisation
     part optimises for). This is expected to matter little in practice as USPTO mostly details
     single product reactions. It may be interesting to look at using the Molecular Transformer
     prediction on these reactions rather than this ground truth, and at other ways of combining
     multiple products, e.g. the mean.
     """
     smiles_parts = text_line.split('.')

     # Parse every piece first; scoring only starts once all pieces have been parsed.
     parsed_mols = []
     for smiles_part in smiles_parts:
         parsed_mols.append(rdkit_general_ops.get_molecule(smiles_part, kekulize=False))

     scores = list(map(QED.qed, parsed_mols))
     return np.max(scores)
Пример #3
0
    def __call__(self, path_to_smiles_file):
        """Read a file of SMILES strings and convert every line into a graph.

        Args:
            path_to_smiles_file: path of a text file; every line is treated as one SMILES string.

        Returns:
            A list with one 1-tuple ``(graph,)`` per line of the file, in file order.
        """
        with open(path_to_smiles_file, 'r') as fo:
            # Strip the trailing newline (and surrounding whitespace) so the molecule parser is
            # handed the bare SMILES string rather than relying on it tolerating whitespace.
            smiles_lines = [line.strip() for line in fo]

        graphs = []
        for mol_smi in tqdm.tqdm(smiles_lines):
            mol = rdkit_general_ops.get_molecule(mol_smi, kekulize=True)
            mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
            mol_as_adj_list = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mol, am_to_indx_map)
            graph = atom_features_dataset.trfm_mol_as_adj_list_to_graph_as_adj_list_trsfm(mol_as_adj_list)
            # Each entry is wrapped in a 1-tuple.
            graphs.append((graph,))

        return graphs
Пример #4
0
def create_shared_dataset_files(reactants_to_reactant_id_dict):
    """Write the two shared dataset files into the processed-data directory.

    File A, ``reactants_to_reactant_id.json``, is a JSON dump of the given dictionary as-is.
    File B, ``reactants_feats.pick``, is a pickle of a dictionary that maps each reactant id to the
    result of ``rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list`` for that reactant.

    Args:
        reactants_to_reactant_id_dict: dictionary whose keys are reactant SMILES strings and whose
            values are the corresponding reactant ids.
    """
    print("creating shared files")

    # File A: the SMILES -> reactant id mapping, stored as JSON.
    id_map_path = path.join(mchef_config.get_processed_data_dir(), 'reactants_to_reactant_id.json')
    with open(id_map_path, 'w') as json_out:
        json.dump(reactants_to_reactant_id_dict, json_out)

    # File B: per-reactant atom features and adjacency list, keyed by reactant id.
    print("Creating reactant smi to reactant_id map.")
    feats_by_reactant_id = {}
    for reactant_smiles, reactant_id in tqdm.tqdm(reactants_to_reactant_id_dict.items()):
        parsed_mol = rdkit_general_ops.get_molecule(reactant_smiles, kekulize=True)
        mapped_mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(parsed_mol)
        feats = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mapped_mol, am_to_indx_map)
        feats_by_reactant_id[reactant_id] = feats

    feats_path = path.join(mchef_config.get_processed_data_dir(), 'reactants_feats.pick')
    with open(feats_path, 'wb') as pickle_out:
        pickle.dump(feats_by_reactant_id, pickle_out)
Пример #5
0
def _canonicalize(smi_str):
    """Parse ``smi_str`` and return its canonicalised SMILES string.

    The string is parsed with ``rdkit_general_ops.get_molecule`` (``kekulize=False``) and the
    resulting molecule is handed to ``rdkit_general_ops.return_canoncailised_smiles_str`` with
    ``kekuleSmiles=False``; whatever that helper returns is returned unchanged.
    """
    parsed_mol = rdkit_general_ops.get_molecule(smi_str, kekulize=False)
    return rdkit_general_ops.return_canoncailised_smiles_str(parsed_mol, kekuleSmiles=False)