예제 #1
0
    def test_SmartsMolFilter(self):
        """Exercise SmartsFilter on a SupplyNode: with counts, negated, without
        counts, and with invalid constructor arguments."""
        smis = ['C1CCC1', 'C1CCC1C=O', 'CCCC', 'CCC=O', 'CC(=O)C', 'CCN', 'NCCN', 'NCC=O']
        mols = [Chem.MolFromSmiles(x) for x in smis]
        suppl = SupplyNode(contents=mols)
        self.assertEqual(len(list(suppl)), 8)

        # Filter with an explicit count per pattern.
        smas = ['C=O', 'CN']
        counts = [1, 2]
        filt = SmartsMolFilter.SmartsFilter(patterns=smas, counts=counts)
        filt.AddParent(suppl)
        self.assertEqual(len(list(filt)), 5)

        # Same filter, negated: the remaining 3 of the 8 molecules pass.
        suppl.reset()
        filt.SetNegate(True)
        self.assertEqual(len(list(filt)), 3)

        # Same patterns without explicit counts.
        smas = ['C=O', 'CN']
        filt = SmartsMolFilter.SmartsFilter(patterns=smas)
        filt.AddParent(suppl)
        self.assertEqual(len(list(filt)), 6)

        # A counts list shorter than the pattern list is rejected.
        self.assertRaises(ValueError, SmartsMolFilter.SmartsFilter, patterns=smas,
                          counts=['notEnough', ])
        # Silence the error log for the invalid SMARTS; the finally clause
        # restores it even when the assertion itself fails.
        RDLogger.DisableLog('rdApp.error')
        try:
            self.assertRaises(ValueError, SmartsMolFilter.SmartsFilter, patterns=['BadSmarts'])
        finally:
            RDLogger.EnableLog('rdApp.error')
def as_atom(symbol):
    """Return the first atom of the molecule parsed from the SMILES ``[symbol]``.

    RDKit warnings are silenced while parsing to avoid spamming with
    "WARNING: not removing hydrogen atom without neighbors"; the warning log
    is re-enabled afterwards even if parsing raises.
    """
    RDLogger.DisableLog('rdApp.warning')
    try:
        mol = Chem.MolFromSmiles(f'[{symbol}]')
    finally:
        RDLogger.EnableLog('rdApp.warning')
    return mol.GetAtoms()[0]
예제 #3
0
 def test_SmartsRemover(self):
     """SmartsRemover must raise ValueError when a pattern is not valid SMARTS."""
     salts = ['[Cl;H1&X1,-]', '[Na+]', '[O;H2,H1&-,X0&-2]', 'BadSmarts']
     # Silence the expected error log; the finally clause restores it even
     # when the assertion fails.
     RDLogger.DisableLog('rdApp.error')
     try:
         self.assertRaises(ValueError,
                           SmartsRemover.SmartsRemover,
                           patterns=salts)
     finally:
         RDLogger.EnableLog('rdApp.error')
예제 #4
0
  def test_PatternHolder(self):
    """A PatternHolder built with an explicit size (2048) and one built with
    the default size must give the same substructure matches."""
    fname = os.path.join(os.environ["RDBASE"], "Data", "NCI", "first_5K.smi")
    suppl = Chem.SmilesMolSupplier(fname, delimiter="\t", titleLine=False)
    mols1 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps1 = rdSubstructLibrary.PatternHolder(2048)
    ssslib1 = rdSubstructLibrary.SubstructLibrary(mols1, fps1)
    mols2 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps2 = rdSubstructLibrary.PatternHolder()
    ssslib2 = rdSubstructLibrary.SubstructLibrary(mols2, fps2)

    # Silence parse errors while loading; always restore the log afterwards.
    RDLogger.DisableLog('rdApp.error')
    try:
      for i in range(0, 1000, 10):
        try:
          mol = suppl[i]
        except Exception:
          # Best effort: entries that cannot be read are skipped.
          continue
        if not mol:
          continue
        # Library 1 is filled through its holders, library 2 through AddMol.
        mols1.AddSmiles(Chem.MolToSmiles(mol))
        fps1.AddFingerprint(fps1.MakeFingerprint(mol))
        ssslib2.AddMol(mol)
    finally:
      RDLogger.EnableLog('rdApp.error')
    query = Chem.MolFromSmarts("N")
    self.assertIsNotNone(query)
    matches1 = sorted(ssslib1.GetMatches(query))
    matches2 = sorted(ssslib2.GetMatches(query))
    self.assertEqual(len(matches1), len(matches2))
    # Comparing the lists directly reports the differing element on failure.
    self.assertEqual(matches1, matches2)
예제 #5
0
def test_case_study_rrc(uri, user, password):

    with open(DATABASE_CONF, "w") as file:
        file.write("uri=" + str(uri) + "\n")
        file.write("user="******"\n")
        file.write("password="******"/case_studies/redundant_representation_case/iJR904_mapped.xml")
        components = [
            "cpd00214", "cpd03847", "cpd05274", "cpd25615", "cpd05237"
        ]

        solver = RedundantCaseSolver(model, "BiGG")

        solver.swap_from_generic(["cpd22513", "cpd15649"], components, True)
        # solver.generateISAreactions()

        os.remove(DATABASE_CONF)

    except:
        os.remove(DATABASE_CONF)
        raise Exception("Not well run")
예제 #6
0
def data_augm(rx_list):
    """Augment a list of SMILES with one randomly atom-reordered variant each.

    For every entry, up to 10 random atom renumberings are tried until the
    non-canonical, non-isomeric SMILES differs from the input string. A
    variant that differs is appended to the result; otherwise a failure
    message is printed and nothing is appended for that entry.

    :param rx_list: list of SMILES strings
    :return: a copy of ``rx_list`` followed by the generated variants
    """
    RDLogger.DisableLog('rdApp.*')
    rx_list_augm = rx_list.copy()
    for rx in rx_list:
        # Parse once per entry; the parsed molecule does not change between retries.
        rx_mol = Chem.MolFromSmiles(rx)
        rx_rand = rx
        if rx_mol is None:
            # Unparsable input: report it and fall through to the failure
            # message instead of crashing on a None molecule.
            print(rx)
        else:
            num_atoms = rx_mol.GetNumAtoms()
            for _ in range(10):
                new_atom_order = list(range(num_atoms))
                random.shuffle(new_atom_order)
                random_mol = Chem.RenumberAtoms(rx_mol, newOrder=new_atom_order)
                rx_rand = Chem.MolToSmiles(random_mol,
                                           canonical=False,
                                           isomericSmiles=False)
                if rx_rand != rx:
                    break
        if rx_rand == rx:
            print(
                '\nFailed to generate random equivalent SMILES for the reaction:'
            )
            print(rx)
        else:
            rx_list_augm.append(rx_rand)
    return rx_list_augm
예제 #7
0
    def test_SmilesReaderBoundaryConditions(self):
        """Check len() and indexing of a text-based SMILES supplier when they
        are called in different orders on a fresh supplier."""
        # The 'fail' entry is not valid SMILES; suppress the resulting error message.
        RDLogger.DisableLog('rdApp.error')

        smis = ['CC', 'CCOC', 'fail', 'CCO']
        text = '\n'.join(smis)

        def fresh_supplier():
            return Chem.SmilesMolSupplierFromText(text, ',', 0, -1, 0)

        # Length first, then items around the bad entry.
        supp = fresh_supplier()
        self.assertEqual(len(supp), 4)
        self.assertIsNone(supp[2])
        self.assertIsNotNone(supp[3])

        # Items first, then length, then one past the end.
        supp = fresh_supplier()
        self.assertIsNone(supp[2])
        self.assertIsNotNone(supp[3])
        self.assertEqual(len(supp), 4)
        with self.assertRaises(IndexError):
            supp[4]

        # Length, last item, then one past the end.
        supp = fresh_supplier()
        self.assertEqual(len(supp), 4)
        self.assertIsNotNone(supp[3])
        with self.assertRaises(IndexError):
            supp[4]

        # Out-of-range access first; length and last item must still work.
        supp = fresh_supplier()
        with self.assertRaises(IndexError):
            supp[4]

        self.assertEqual(len(supp), 4)
        self.assertIsNotNone(supp[3])
예제 #8
0
def importDataFile(
        file_name: str,
        import_function: Callable[[str], pd.DataFrame] = pd.read_csv,
        fp_size: int = default_fp_size) -> pd.DataFrame:
    """
    Load a data file and add fingerprints to it in parallel.

    A file with the extension ``.pkl`` is returned directly via
    ``pd.read_pickle`` without further processing. Any other file is read with
    ``import_function``, split into one chunk per CPU core, and each chunk is
    passed to ``addFPColumn`` in a worker process.

    :param file_name: Path of the file to load.
    :param import_function: Reader used for non-pickle files (default ``pd.read_csv``).
    :param fp_size: Forwarded to ``addFPColumn`` as ``fp_size``.
    :return: A single pandas dataframe: the unpickled frame, or the
        concatenation of the processed chunks.
    """
    # Pickled frames are returned as stored.
    if os.path.splitext(file_name)[1] == ".pkl":
        return pd.read_pickle(file_name)

    raw_frame = import_function(file_name)

    # Silence the rdkit logger for the fingerprinting step.
    RDLogger.DisableLog("rdApp.*")
    worker_count = multiprocessing.cpu_count()
    chunks = np.array_split(raw_frame, worker_count)
    add_fingerprints = partial(addFPColumn, fp_size=fp_size)
    with multiprocessing.Pool(worker_count) as pool:
        processed_chunks = pool.map(add_fingerprints, chunks)
        result = pd.concat(processed_chunks)
        pool.close()
        pool.join()
    return result
예제 #9
0
    def test_washing_with_dask(self):
        """Run the chem_functions washing steps over a dask bag of SMILES and
        compare the first ten results with a fixed reference list, to notice
        changes in how rdkit or dask handle these molecules.
        """

        expected = ['CC(C)=CCC/C(C)=C\\CC/C(C)=C\\CO', 'CC12CC(O)C(CC1=O)C2(C)C', 'Oc1cc(C2CCNCC2)on1',
                    'Cn1ncc2cc(CN)ccc21', 'O=C(O)c1cc(Cl)cs1', 'Cc1cc(CN)ncc1Br', 'CO[C@@H](C)[C@@H](N)C(=O)O',
                    'Nc1ccc(Br)c(F)c1[N+](=O)[O-]', 'Cc1ccc(F)c(C#N)n1', 'Cc1ccc(F)c(CN)n1']

        from rdkit import Chem
        from MCR import chem_functions
        import dask.bag as db
        from rdkit import RDLogger, rdBase

        rdBase.DisableLog('rdApp.error')
        RDLogger.DisableLog('rdApp.info')

        # Parse every line and drop the ones that do not yield a molecule.
        bag = db.read_text("../tests/test_data/test_db.smi", blocksize=16e6)
        bag = bag.map(lambda x: Chem.MolFromSmiles(x)).filter(lambda x: x is not None)

        # Apply the washing steps in their fixed order.
        washing_steps = (
            chem_functions.remove_salts_mol,
            chem_functions.decharge_mol,
            chem_functions.get_largest_fragment_mol,
            chem_functions.standardize_mol,
        )
        for step in washing_steps:
            bag = bag.map(step)

        washed_smiles = [Chem.MolToSmiles(x) for x in bag.take(10)]
        self.assertEqual(washed_smiles, expected)
예제 #10
0
def check_symmetry(met_filename):
    """ Function that checks if the given metabolite is symmetric.
    Uses pymatgen package for symmetry related operations, and
    RDKit for Molfile conversion to XYZ format. Requires Molfiles
    of the metabolites to be present in the working_dir/metabolites
    folder.
    Current criterion for symmetricity is for every carbon except one
    (central, if molecule consists of odd number of carbons) to have
    at least one equivalent carbon in the structure.

    Parameters
    ----------
    met_filename : str
        Filename of the specific Molfile

    Returns
    -------
    symmetrical : bool
        True if metabolite is symmetric, False if not. False is also
        returned when the Molfile cannot be parsed, when the structure
        has a single atom, or when it contains unrecognized symbols.

    """
    symmetrical = False
    # Disable RDKit warnings
    RDLogger.DisableLog('rdApp.*')
    # Counter for non symmetrical carbon atoms
    non_eq_carbons = 0
    carbons = 0
    # Convert Molfile to XYZ string
    molecule = Chem.MolFromMolFile(f'metabolites/{met_filename}')
    if molecule is None:
        # RDKit could not build a molecule; report it instead of failing
        # inside the XYZ conversion.
        print(f'{met_filename} could not be parsed by RDKit')
        return symmetrical
    molecule_xyz = Chem.rdmolfiles.MolToXYZBlock(molecule)

    # Create IMolecule object to analyze its symmetricity
    try:
        molecule_obj = structure.IMolecule.from_str(molecule_xyz, fmt='xyz')
        if len(molecule_obj) == 1:
            return symmetrical
        # Initialize point group analyzer
        pg_analyzer = analyzer.PointGroupAnalyzer(molecule_obj)

    except (IndexError, ValueError):
        # '*' is unrecognized in particular
        print(f'{met_filename} contains unrecognized symbols')
        return symmetrical
    # Extract equal atom sets
    eq_sets = pg_analyzer.get_equivalent_atoms()['eq_sets']
    for atom_index, equivalent_atoms in eq_sets.items():
        if str(molecule_obj[atom_index].specie) == 'C':
            carbons += 1
            # A set of size one means this carbon has no equivalent partner.
            if len(equivalent_atoms) == 1:
                non_eq_carbons += 1
                if non_eq_carbons > 1:
                    return symmetrical

    # Molecule has more than 1 carbon, and at most 1 non-symmetrical carbon
    if carbons > 1:
        symmetrical = True

    return symmetrical
예제 #11
0
def create_coms_from_mol_list(conformer_list, gau_tpl_content, base_out_name,
                              max_num_coms, print_original):
    """
    From a list of RDKit mol objects, create gaussian input (.com) files in order of increasing
    MMFF energy, optionally for only the specified number of objects. Conformers whose energy ties
    (np.isclose) with the last one written are still written after the limit has been reached.
    :param conformer_list: list of RDKit mol objects; the first entry is treated as the initial conformation
    :param gau_tpl_content: template content passed through to create_com_from_pdb_str
    :param base_out_name: base name passed to create_out_fname to build each output file name
    :param max_num_coms: int or infinity
    :param print_original: Boolean, whether to print the initial conformation
    :return: None; files are written and a summary is printed
    """
    # Skip the first entry (the initial conformation) unless it was requested.
    start_at = 0 if print_original else 1
    selected_mols = conformer_list[start_at:]

    RDLogger.DisableLog('rdApp.*')
    energy_list = []
    for current_mol in selected_mols:
        opt_results = MMFFOptimizeMoleculeConfs(current_mol, maxIters=0)
        energy_list.append(opt_results[0][1])

    # Pair every energy with the conformer it was computed for. Zipping with the
    # unsliced conformer_list would shift the pairing by one when the original
    # conformation is skipped.
    zipped_sorted = sorted(zip(energy_list, selected_mols), key=itemgetter(0))

    mol_num = 0
    last_energy = np.nan
    print_note = False
    com_fname = None
    for energy, current_mol in zipped_sorted:
        if mol_num >= max_num_coms:
            # Past the limit: keep going only while the energy ties with the last one written.
            if np.isclose(energy, last_energy):
                print_note = True
            else:
                break
        mol_num += 1
        last_energy = energy
        com_fname = create_out_fname(base_out_name,
                                     suffix=f"_{mol_num}",
                                     ext=".com",
                                     rel_path=True)
        pdb_str = MolToPDBBlock(current_mol)
        create_com_from_pdb_str(pdb_str, gau_tpl_content, com_fname)
        print(f"{int(energy):12,} {com_fname}")

    if com_fname:
        print(
            f"Wrote {mol_num} files, ending with: {os.path.relpath(com_fname)}"
        )
    else:
        print("No output created from rotating dihedrals.")
    if print_note:
        print(
            f"More than {max_num_coms} conformations were output to ties calculated energies."
        )
def sample_Reaxys(df, s):
    """Randomly sample about ``s`` rows of ``df`` and parse their "smiles"
    column into RDKit molecules, dropping entries that fail to parse.

    Prints the number of molecules obtained and returns them as a list.
    """
    # Remove rdkit warnings
    RDLogger.DisableLog('rdApp.*')

    # The sample size is expressed as a fraction of the frame length.
    fraction = s / len(df.index)
    sampled_column = df.sample(frac=fraction)["smiles"]
    smiles = [str(entry) for entry in sampled_column.tolist()]

    # Convert every sampled string, then keep only successful parses.
    parsed = [Chem.MolFromSmiles(smi.strip()) for smi in smiles]
    mols = [mol for mol in parsed if mol is not None]
    print("Retieved", len(mols), "random molecules")
    return mols
예제 #13
0
def sdf_text_worker(merged_results, vendors, num_mols, start_time, mol_counter,
                    fragment_counter, drug_like_counter, big_counter,
                    parent_fragment_collector, parent_drug_like_collector,
                    parent_big_collector, failures, addhs, embed, verbose):
    """Convert rows of ``merged_results`` to SDF text, binned by molecular weight.

    Each row's ``smiles`` is parsed (optionally with hydrogens added and
    coordinates embedded), named after its non-empty vendor identifiers
    (shortened to 20 characters), and rendered with ``sdf_text``. Results are
    sorted into fragment (< 300), drug-like (300 to < 700) and big
    (700 to < 1200) collections by exact molecular weight; heavier molecules
    are dropped. Rows that fail are recorded in ``failures`` and skipped.

    The counter arguments must provide ``get_lock()`` and ``value``
    (presumably ``multiprocessing.Value`` objects; confirm against the
    caller); the ``parent_*`` collectors must provide ``extend`` and
    ``failures`` must provide ``append``. Returns None.
    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    fragment_collector, drug_like_collector, big_collector = [], [], []
    for _, row in merged_results.iterrows():
        try:
            mol = Chem.MolFromSmiles(row['smiles'])
            if addhs:
                mol = Chem.AddHs(mol)
            if embed:
                AllChem.EmbedMolecule(mol)
            properties = {vendor: row[vendor] for vendor in vendors}
            mol_name = ','.join([
                identifier for identifier in properties.values()
                if len(identifier) > 0
            ])
            if len(mol_name) > 20:
                mol_name = mol_name[:17] + '...'
            mol.SetProp('_Name', mol_name)
            properties['smiles'] = row['smiles']
            molecular_weight = ExactMolWt(mol)
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still stop the worker.
            failures.append(' '.join(['write_error', row['smiles']]))
            # Sentinel weight above every bin, so the row is skipped below.
            molecular_weight = 10000
        if molecular_weight < 300:
            with fragment_counter.get_lock():
                fragment_counter.value += 1
            fragment_collector.append(sdf_text(mol, properties))
        elif molecular_weight < 700:
            with drug_like_counter.get_lock():
                drug_like_counter.value += 1
            drug_like_collector.append(sdf_text(mol, properties))
        elif molecular_weight < 1200:
            with big_counter.get_lock():
                big_counter.value += 1
            big_collector.append(sdf_text(mol, properties))
        with mol_counter.get_lock():
            mol_counter.value += 1
            # Remaining time estimate: average time per molecule so far
            # multiplied by the number of molecules left.
            update_progress(mol_counter.value / num_mols,
                            'Progress of writing',
                            ((time.time() - start_time) / mol_counter.value) *
                            (num_mols - mol_counter.value))
    parent_fragment_collector.extend(fragment_collector)
    parent_drug_like_collector.extend(drug_like_collector)
    parent_big_collector.extend(big_collector)
    return
예제 #14
0
 def setUp(self):
   """Load the 'problematic' PubChem molecules and their reference data,
   then silence RDKit warnings."""
   test_data_dir = os.path.join(RDConfig.RDCodeDir, 'Chem/test_data')
   self.dataset = {}
   self.dataset_inchi = {}

   # The supplier reads from the gzip stream, so the stream is left open here.
   sdf_stream = gzip.open(os.path.join(test_data_dir, 'pubchem-hard-set.sdf.gz'), 'r')
   self.dataset['problematic'] = ForwardSDMolSupplier(sdf_stream, sanitize=False,
                                                      removeHs=False)

   # Normalise line endings, then unpickle the reference data from the bytes.
   with open(os.path.join(test_data_dir, 'pubchem-hard-set.inchi'), 'r') as inchi_file:
     pkl = inchi_file.read().replace('\r\n', '\n').encode('latin1')
   self.dataset_inchi['problematic'] = pickle.loads(pkl, encoding='latin1')

   # disable logging
   RDLogger.DisableLog('rdApp.warning')
예제 #15
0
    def _gen_compound(mol):
        """Build (or look up) the compound dictionary for a predicted molecule.

        Returns None when the molecule cannot be sanitized, when its SMILES
        has more than one fragment (contains "."), or when no compound id
        can be derived. Otherwise returns the existing entry from
        ``local_cpds`` or a freshly built dictionary.
        """
        rkl.DisableLog("rdApp.*")
        try:
            if explicit_h:
                mol = RemoveHs(mol)

            # resolve potential tautomers and choose first one
            mol_smiles = MolToSmiles(mol, True)
            if "n" in mol_smiles:
                mol_smiles = utils.postsanitize_smiles([mol_smiles])[0][0]
                mol = MolFromSmiles(mol_smiles)

            SanitizeMol(mol)

        # TODO: logger
        # Get lots of "Explicit valence greater than permitted" errors here
        # This is for predicted compounds that are infeasible, so we throw them out
        except Exception:
            # Exception (not BaseException) so KeyboardInterrupt/SystemExit
            # are not silently swallowed.
            return None
        finally:
            # Restore logging on the failure path as well as on success.
            rkl.EnableLog("rdApp.*")

        mol_smiles = MolToSmiles(mol, True)
        if "." in mol_smiles:
            return None

        cpd_id, inchi_key = utils.get_compound_hash(mol_smiles, "Predicted")
        if not cpd_id:
            return None

        if cpd_id in local_cpds:
            return local_cpds[cpd_id]

        return {
            "ID": None,
            "_id": cpd_id,
            "SMILES": mol_smiles,
            "InChI_key": inchi_key,
            "Type": "Predicted",
            "Generation": generation,
            "atom_count": utils.get_atom_count(mol),
            "Reactant_in": [],
            "Product_of": [],
            "Expand": True,
            "Formula": CalcMolFormula(mol),
            "last_tani": 0,
        }
예제 #16
0
 def sample(self, num=1, start='G'):
     """Generate ``num`` sequences starting from ``start``.

     In the 'generate' session the raw generated sequences are returned.
     In any other session, generation repeats until ``num`` sequences parse
     as SMILES, and the ``Chem.MolToSmiles`` form of each is returned.
     """
     if self.session == 'generate':
         return [self._generate(start) for _ in tqdm(range(num))]

     from rdkit import Chem, RDLogger
     RDLogger.DisableLog('rdApp.*')
     valid_smiles = []
     while len(valid_smiles) < num:
         candidate = Chem.MolFromSmiles(self._generate(start))
         if candidate is None:
             # Not parsable as SMILES; draw another sequence.
             continue
         valid_smiles.append(Chem.MolToSmiles(candidate))
     return valid_smiles
예제 #17
0
    def _preprocess(self):
        """Read the train/valid/test reaction files and write three tables.

        Each line of ``data/<split>.txt`` under ``self.feat_dir`` is split on
        a single space into a reaction (``substrates>>product``) and a meta
        field. The substrates/product table goes to ``self.x_path``, the
        split index and meta information to ``self.metadata_path``, and a
        one-column-per-split indicator table to ``default_split.csv`` under
        ``self.dir``.
        """
        x = {
            'product': [],
            'substrates': [],
        }

        split = []
        meta = []
        split_keys = ['train', 'valid', 'test']

        # there is a warning about hydrogen atoms that do not have neighbors that could not be deleted (that is OK)
        RDLogger.DisableLog('rdApp.*')

        for split_i, split_key in enumerate(split_keys):
            split_path = os.path.join(self.feat_dir, f'data/{split_key}.txt')

            # First pass only counts lines so the progress bar has a total.
            with open(split_path, 'r') as handle:
                file_len = sum(1 for _ in handle)

            # Second pass parses; the with-block closes the file on exit.
            with open(split_path, 'r') as handle:
                for line in tqdm(handle,
                                 desc=f'reading {split_key} reactions',
                                 total=file_len):
                    split_line = line.split(' ')
                    reaction = split_line[0]
                    meta_info = split_line[1].strip()
                    subs, prod = tuple(reaction.split('>>'))
                    subs = subs.strip()
                    prod = prod.strip()
                    x['substrates'].append(subs)
                    x['product'].append(prod)
                    split.append(split_i)
                    meta.append(meta_info)
            logger.info(f'Saved {file_len} {split_key} reactions')

        split = np.asarray(split, dtype=int)
        # One 0/1 column per split name, marking the rows that belong to it.
        split_df = dict(
            (k, (split == i).astype(int)) for i, k in enumerate(split_keys))

        meta = {'uspto_mit_split': split, 'meta_info': meta}

        logger.info(f"Saving 'x' to {self.x_path}")
        pd.DataFrame(x).to_csv(self.x_path, sep='\t')

        logger.info(f"Saving {self.metadata_path}")
        pd.DataFrame(meta).to_csv(self.metadata_path, sep='\t')

        split_path = os.path.join(self.dir, 'default_split.csv')
        logger.info(f"Saving default split to {split_path}")
        pd.DataFrame(split_df).to_csv(split_path)
예제 #18
0
def formalCharge(molecule):
    """Compute the formal charge on a molecule. This function requires that
       the molecule has explicit hydrogen atoms.

       The molecule is written to a PDB file in a temporary directory, read
       back with RDKit, and the RDKit formal charge is returned multiplied by
       the electron charge unit.

       Parameters
       ----------

       molecule : :class:`Molecule <BioSimSpace._SireWrappers.Molecule>`
           A molecule object.

       Returns
       -------

       formal_charge : :class:`Charge <BioSimSpace.Types.Charge>`
           The total formal charge on the molecule.

       Raises
       ------

       TypeError
           If 'molecule' is not exactly of type
           'BioSimSpace._SireWrappers.Molecule'.
    """

    if type(molecule) is not _Molecule:
        raise TypeError("'molecule' must be of type 'BioSimSpace._SireWrappers.Molecule'")

    from rdkit import Chem as _Chem
    from rdkit import RDLogger as _RDLogger

    # Disable RDKit warnings.
    _RDLogger.DisableLog('rdApp.*')

    # Zero the total formal charge.
    formal_charge = 0

    # Create a temporary working directory. Using it as a context manager
    # removes it as soon as the work is done, rather than relying on the
    # object's finalizer.
    with _tempfile.TemporaryDirectory() as work_dir:

        # Run in the working directory.
        with _Utils.cd(work_dir):

            # Save the molecule to a PDB file.
            _IO.saveMolecules("tmp", molecule, "PDB")

            # Read the ligand PDB into an RDKit molecule.
            mol = _Chem.MolFromPDBFile("tmp.pdb")

            # Compute the formal charge.
            formal_charge = _Chem.rdmolops.GetFormalCharge(mol)

    return formal_charge * _electron_charge
def create_fingerprints(df_Without_Double_or_Triple, similarity_value=0.95):
    """
    Load a pickled data frame, compute Morgan fingerprints for its aglycons and
    hand the results to ``create_tanimoto_index``.

    ``df_Without_Double_or_Triple`` is the path of the pickle file. Every entry
    of its ``deglycosilated_smiles`` column is parsed with RDKit; entries that
    fail to parse are left out of the fingerprints and of the SMILES pairs.

    ``create_tanimoto_index`` receives the similarity value, all pairwise
    combinations of the parsable SMILES, the fingerprints and the loaded data
    frame.
    NOTE(review): the frame passed on is the full loaded frame, not the one
    reduced to parsable rows; confirm this is intended.
    """
    with open(df_Without_Double_or_Triple, "rb") as infile:
        df_Without_Double_or_Triple = pickle.load(infile, encoding="utf-8")

    RDLogger.DisableLog('rdApp.*')
    parsed_mols = []
    parsable_rows = []
    for position, smiles in enumerate(
            df_Without_Double_or_Triple.deglycosilated_smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        parsed_mols.append(mol)
        parsable_rows.append(position)

    # Keep only the rows whose SMILES could be parsed.
    parsable_frame = df_Without_Double_or_Triple.iloc[parsable_rows].reset_index()

    fps = [
        AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
        for mol in parsed_mols
    ]

    # create combinations of deglycosilated_smiles for indexing
    parsable_smiles = list(parsable_frame.deglycosilated_smiles)
    aglycon_formulas = list(itertools.combinations(parsable_smiles, 2))

    print("MORGAN FINGERPRINTS DONE")
    create_tanimoto_index(similarity_value, aglycon_formulas, fps,
                          df_Without_Double_or_Triple)
def main():
    """Find the unique fragments of one pre-computed Reaxys fragment file using
    a pool of 8 worker processes."""
    # ### AGAVE TEST ###
    # agave_test()

    # #Test: Time Fragments for Earth atmosphere SMILES strings
    # cpd_smiles = open("Other/Earth_atmosphere_SMILES.txt", "rb").readlines()

    # ### Get KEGG Mol Objects ###
    # kegg_mols = read_KEGG_mols()
    #
    # ### Get Reaxys Mol Objects ###
    # #Read in full reaction database
    # df = pd.DataFrame()
    # for i in range(1,11):
    #     df = df.append(read_cpds(str(i)), ignore_index=True)
    #     print("Done with subset", i, "...")
    #     print("Df size", len(df.index))

    ### ADENINE TEST (FOR ERNEST) ###
    #adenine_fragments("C1=NC2=NC=NC(=C2N1)N", cpd_mols)

    ### PARALLEL FRAGMENT GENERATION ###
    # Silence RDKit before the workers are created so that, with the fork
    # start method, they start with logging already disabled.
    RDLogger.DisableLog('rdApp.*')
    pool = Pool(processes=8)
    try:
        # #kegg_size = len(kegg_mols)
        # for i in range(10):
        #     print("Analyzing sample", i)
        #     fp = "Technology/Data/Reaxys_1000_Samples/"
        #     reaxys_mols = sample_Reaxys(df, 1000)
        #
        #     #Save mols for future occurrence testing
        #     pickle.dump(reaxys_mols, open(fp + "sample_" + str(i) + "_ReaxysMols.p", "wb"))
        #
        #     generate_fragments(pool, reaxys_mols, fp + "sample_" + str(i) + "frags.p")
        #
        #     ### FIND UNIQUE FRAGMENTS ###
        #     find_unique_frags(pool, fp + "sample_" + str(i) + "frags.p", fp + "sample_" + str(i) + "frags_unique.p")
        #     print()

        # #for fp in os.listdir("Technology/Data/"):
        #Test on one file
        find_unique_frags(pool, "Technology/Data/Reaxys_fragments_keggSize_0.p",
                          "Technology/Data/Reaxys_fragments_keggSize_0unique.p")
    finally:
        # Let outstanding work finish, then release the worker processes.
        pool.close()
        pool.join()
예제 #21
0
  def test3SmilesSupplier(self):
    txt = """C1CC1,1
CC(=O)O,3
fail,4
CCOC,5
"""
    RDLogger.DisableLog('rdApp.error')

    fileN = tempfile.mktemp('.csv')
    try:
      with open(fileN, 'w+') as f:
        f.write(txt)
      suppl = Chem.SmilesMolSupplier(fileN, delimiter=',', smilesColumn=0, nameColumn=1,
                                     titleLine=0)
      ms = [x for x in suppl]
      while ms.count(None):
        ms.remove(None)
      self.assertEqual(len(ms), 3)
    finally:
      os.unlink(fileN)
예제 #22
0
    def test3SmilesSupplier(self):
        txt = """C1CC1,1
CC(=O)O,3
fail,4
CCOC,5
"""
        RDLogger.DisableLog('rdApp.error')

        try:
            with tempfile.NamedTemporaryFile('w+', suffix='.csv', delete=False) as tmp:
                tmp.write(txt)
            suppl = Chem.SmilesMolSupplier(tmp.name, delimiter=',', smilesColumn=0, nameColumn=1,
                                           titleLine=0)
            ms = [x for x in suppl]
            suppl = None
            while ms.count(None):
                ms.remove(None)
            self.assertEqual(len(ms), 3)
        finally:
            os.unlink(tmp.name)
예제 #23
0
def search(query: str, min_mw: float, max_mw: float,
           layout: widgets.Box) -> None:
    """Search synonyms for ``query`` and show the matches in ``layout``.

    Each match gets a normalized molecule, its InChI and its exact molecular
    weight. The matches are then filtered (normalized InChI present in the
    database, weight between ``min_mw`` and ``max_mw``), sorted by weight and
    rendered as a sheet with a "use" button, name, weight and structure image
    per row. Nothing is rendered when ``is_valid_num_results`` rejects the
    number of matches.
    """
    with get_new_log_box(layout):
        clear_search_output(layout)
        results = get_synonym_matches(query)
        for cur in results:
            RDLogger.DisableLog("rdApp.*")  # hide rdkit warnings
            try:
                cur["mol"] = cheminfo.normalize_molecule(
                    Chem.inchi.MolFromInchi(cur["inchi"]))
                cur["norm_inchi"] = Chem.inchi.MolToInchi(cur["mol"])
            finally:
                # Restore logging even when normalization raises.
                RDLogger.EnableLog("rdApp.*")
            cur["MW"] = ExactMolWt(cur["mol"])
        filtered = filter_by_mw(filter_to_norm_inchi_in_db(results), min_mw,
                                max_mw)
        logger.debug("Found %d matches to %s.", len(filtered), query)
        if not is_valid_num_results(len(filtered), query, layout):
            return
        final = sorted(filtered, key=lambda x: x["MW"])
        logger.debug("Num mols: %d", len(final))
        column_names = ["", "Name", "MW", "Structure"]
        sheet = ipysheet.sheet(
            rows=len(final),
            columns=len(column_names),
            column_headers=column_names,
            column_resizing=False,
            column_width=[1, 4, 2, 10],
        )
        # One "use" button per result row.
        buttons = [
            widgets.Button(description="use",
                           layout=widgets.Layout(width="100%")) for _ in final
        ]
        for button in buttons:
            button.on_click(
                lambda current: on_use_button_clicked(current, final, layout))
        ipysheet.column(0, buttons)
        ipysheet.column(1, [x["name"] for x in final])
        ipysheet.column(2, [ExactMolWt(x["mol"]) for x in final])
        ipysheet.column(3, [cheminfo.mol_to_image(x["mol"]) for x in final])
        layout.children = swap_layout(layout.children,
                                      LayoutPosition.SEARCH_OUTPUT.value,
                                      sheet)
import pandas as pd
from IPython import display
from keras.layers import Input, Dense, Conv1D, MaxPooling2D, UpSampling2D, UpSampling1D, MaxPooling1D, Lambda
from keras.layers.recurrent import GRU
from keras.layers.core import Dense, Flatten, RepeatVector, Dropout
from keras.losses import mse, binary_crossentropy, categorical_crossentropy
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import backend as K
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint
from rdkit import RDLogger  
from sklearn.model_selection import train_test_split
import time
RDLogger.DisableLog('rdApp.*') 




def add_space(raw_data, input_dim = 34):
    """Right-pad every entry shorter than ``input_dim`` with spaces.

    Entries that are already ``input_dim`` long or longer are returned
    unchanged. The result is a new list in the original order.
    """
    return [
        entry + ' ' * (input_dim - len(entry)) if len(entry) < input_dim else entry
        for entry in raw_data
    ]

def plot_auto(out, predict_st):
    size = (50, 50)
예제 #25
0
def from_smiles(smiles: str, with_hydrogen: bool = False,
                kekulize: bool = False):
    r"""Converts a SMILES string to a :class:`torch_geometric.data.Data`
    instance.

    A SMILES string that RDKit cannot parse is replaced by the empty
    molecule. Node features are the indices of nine atom properties in
    :obj:`x_map`; edge features are the indices of three bond properties in
    :obj:`e_map`. Every bond is stored in both directions.

    Args:
        smiles (str): The SMILES string.
        with_hydrogen (bool, optional): If set to :obj:`True`, will store
            hydrogens in the molecule graph. (default: :obj:`False`)
        kekulize (bool, optional): If set to :obj:`True`, converts aromatic
            bonds to single/double bonds. (default: :obj:`False`)
    """
    from rdkit import Chem, RDLogger

    from torch_geometric.data import Data

    RDLogger.DisableLog('rdApp.*')

    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        mol = Chem.MolFromSmiles('')
    if with_hydrogen:
        mol = Chem.AddHs(mol)
    if kekulize:
        # Kekulize modifies the molecule in place and returns None, so its
        # result must not be assigned back to `mol`.
        Chem.Kekulize(mol)

    xs = []
    for atom in mol.GetAtoms():
        x = []
        x.append(x_map['atomic_num'].index(atom.GetAtomicNum()))
        x.append(x_map['chirality'].index(str(atom.GetChiralTag())))
        x.append(x_map['degree'].index(atom.GetTotalDegree()))
        x.append(x_map['formal_charge'].index(atom.GetFormalCharge()))
        x.append(x_map['num_hs'].index(atom.GetTotalNumHs()))
        x.append(x_map['num_radical_electrons'].index(
            atom.GetNumRadicalElectrons()))
        x.append(x_map['hybridization'].index(str(atom.GetHybridization())))
        x.append(x_map['is_aromatic'].index(atom.GetIsAromatic()))
        x.append(x_map['is_in_ring'].index(atom.IsInRing()))
        xs.append(x)

    # view(-1, 9) keeps the 9-column layout even when there are no atoms.
    x = torch.tensor(xs, dtype=torch.long).view(-1, 9)

    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        e = []
        e.append(e_map['bond_type'].index(str(bond.GetBondType())))
        e.append(e_map['stereo'].index(str(bond.GetStereo())))
        e.append(e_map['is_conjugated'].index(bond.GetIsConjugated()))

        # Store the bond in both directions with identical attributes.
        edge_indices += [[i, j], [j, i]]
        edge_attrs += [e, e]

    edge_index = torch.tensor(edge_indices)
    edge_index = edge_index.t().to(torch.long).view(2, -1)
    edge_attr = torch.tensor(edge_attrs, dtype=torch.long).view(-1, 3)

    if edge_index.numel() > 0:  # Sort indices.
        perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
        edge_index, edge_attr = edge_index[:, perm], edge_attr[perm]

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, smiles=smiles)
예제 #26
0
import os
import pickle
from glob import glob

import numpy as np
import pandas as pd
import requests
from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles, MolFromSmarts
from rdkit.Chem.inchi import MolFromInchi, MolToInchi
from tqdm import tqdm

from molgrad.utils import DATA_PATH, PROCESSED_DATA_PATH

RDLogger.DisableLog("rdApp.*")
IUPAC_REST = "http://cactus.nci.nih.gov/chemical/structure/{}/inchi"


def smi_to_inchi_with_val(smiles, ovalues):
    inchis = []
    values = []

    for smi, val in zip(smiles, ovalues):
        mol = MolFromSmiles(smi)
        if mol is not None:
            try:
                inchi = MolToInchi(mol)
                m = MolFromInchi(inchi)
                if m is not None:  # ensure rdkit can read an inchi it just wrote...
                    inchis.append(inchi)
                    values.append(val)
예제 #27
0
    def test1InchiReadPubChem(self):
        # Round-trip every molecule of every dataset file through InChI
        # (mol -> InChI -> mol -> SMILES -> mol -> InChI) and sort the outcome
        # into three buckets:
        #   same       - the regenerated InChI equals the original one
        #   reasonable - it differs (or could not be read back) for a reason
        #                this test tolerates
        #   diff       - an unexplained difference; reported on stdout
        # The per-file totals are then compared with fixed expected counts.
        for f in self.dataset.values():
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                x = MolToInchi(m)
                y = None
                # Silence the error channel only for the read-back; the
                # finally clause guarantees it is switched on again even if
                # the parser raises.
                RDLogger.DisableLog('rdApp.error')
                try:
                    mol = MolFromInchi(x)
                finally:
                    RDLogger.EnableLog('rdApp.error')
                if mol is not None:
                    y = MolToInchi(
                        MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
                if y is None:
                    # metal involved?
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        _, error = inst.args
                        if 'Metal' in error or \
                                'Charges were rearranged' in error:
                            reasonable += 1
                            continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # RDKit does not like the SMILES? use MolBlock instead
                    inchiMol = MolFromInchi(x)
                    if inchiMol:
                        rdDepictor.Compute2DCoords(inchiMol)
                        z = MolToInchi(MolFromMolBlock(
                            MolToMolBlock(inchiMol)))
                        if x == z:
                            reasonable += 1
                            continue
                    # InChI messed up the radical?
                    # Compare an atomic-number-weighted radical-electron sum
                    # between the input and the unsanitized read-back.
                    unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                    if sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in m.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]) != sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in unsanitizedInchiMol.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]):
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                          cid + '\n' + COLOR_RESET)
                    continue
                if x != y:
                    # if there was warning in the first place, then this is
                    # tolerable
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                        MolFromInchi(x, treatWarningAsError=True)
                    except InchiReadWriteError:
                        reasonable += 1
                        continue
                    # or if there are big rings
                    SanitizeMol(m)
                    # NOTE(review): under Python 3 `filter` returns an
                    # iterator object, which is truthy even when it would
                    # yield nothing, so every mismatch that reaches this point
                    # is counted as "reasonable". The expected totals asserted
                    # below may depend on that, so the check is left as-is;
                    # switching to any(...) needs the totals re-validated.
                    if filter(lambda i: i >= 8,
                              [len(r) for r in m.GetRingInfo().AtomRings()]):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # or if RDKit loses bond stereo
                    s = MolToSmiles(m, True)
                    if MolToSmiles(MolFromSmiles(s), True) != s:
                        reasonable += 1
                        continue
                    # or if it is RDKit SMILES writer unhappy about the mol
                    inchiMol = MolFromInchi(x)
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue

                    diff += 1
                    # Look the id up here: it was previously only assigned in
                    # the "empty mol" branch above, so this report either
                    # raised NameError or printed a stale id.
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN +
                          'Molecule mismatch for PubChem Compound ' + cid +
                          COLOR_RESET)
                    print(inchiDiff(x, y))
                    print()
                else:
                    same += 1
            fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            self.assertEqual(same, 621)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 560)
예제 #28
0
    def process(self):
        """Build the collated dataset file at ``self.processed_paths[0]``.

        Without ``rdkit`` installed, ``self.raw_paths[0]`` is loaded with
        ``torch.load`` as a list of dicts, each turned into a ``Data`` object,
        optionally filtered/transformed, collated and saved.

        With ``rdkit``, the raw files are parsed instead:

        * ``self.raw_paths[1]``: comma-separated targets. Fields 1..19 of
          every data row are read, the first three of them are moved to the
          end, and the result is scaled by the ``conversion`` tensor.
        * ``self.raw_paths[2]``: list of molecules to skip; the first token of
          each listed row is a 1-based id, stored here 0-based.
        * ``self.raw_paths[0]``: SD file read with ``Chem.SDMolSupplier``
          (hydrogens kept, no sanitization).

        Each kept molecule becomes a ``Data`` with node features ``x``
        (one-hot atom type followed by atomic number, aromatic flag, sp/sp2/sp3
        flags and bonded-hydrogen count), atomic numbers ``z``, coordinates
        ``pos``, a sorted bidirectional ``edge_index`` with one-hot
        ``edge_attr``, target row ``y``, ``name`` and ``idx``.
        """
        try:
            import rdkit
            from rdkit import Chem, RDLogger
            from rdkit.Chem.rdchem import BondType as BT
            from rdkit.Chem.rdchem import HybridizationType
            RDLogger.DisableLog('rdApp.*')

        except ImportError:
            rdkit = None

        if rdkit is None:
            print(("Using a pre-processed version of the dataset. Please "
                   "install 'rdkit' to alternatively process the raw data."),
                  file=sys.stderr)

            data_list = torch.load(self.raw_paths[0])
            data_list = [Data(**data_dict) for data_dict in data_list]

            if self.pre_filter is not None:
                data_list = [d for d in data_list if self.pre_filter(d)]

            if self.pre_transform is not None:
                data_list = [self.pre_transform(d) for d in data_list]

            torch.save(self.collate(data_list), self.processed_paths[0])
            return

        types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
        bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

        with open(self.raw_paths[1], 'r') as f:
            # Drop the header row and the empty string after the final newline.
            target = f.read().split('\n')[1:-1]
            target = [[float(x) for x in line.split(',')[1:20]]
                      for line in target]
            target = torch.tensor(target, dtype=torch.float)
            # Rotate the first three target columns to the end.
            target = torch.cat([target[:, 3:], target[:, :3]], dim=-1)
            target = target * conversion.view(1, -1)

        with open(self.raw_paths[2], 'r') as f:
            # A set gives O(1) membership tests in the per-molecule loop
            # below; the file's ids are 1-based, the loop index is 0-based.
            skip = {int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]}

        suppl = Chem.SDMolSupplier(self.raw_paths[0],
                                   removeHs=False,
                                   sanitize=False)

        data_list = []
        for i, mol in enumerate(tqdm(suppl)):
            if i in skip:
                continue

            N = mol.GetNumAtoms()

            # Coordinates are re-read from the record's raw text: the N lines
            # starting at line index 4, first three fields of each.
            pos = suppl.GetItemText(i).split('\n')[4:4 + N]
            pos = [[float(x) for x in line.split()[:3]] for line in pos]
            pos = torch.tensor(pos, dtype=torch.float)

            type_idx = []
            atomic_number = []
            aromatic = []
            sp = []
            sp2 = []
            sp3 = []
            for atom in mol.GetAtoms():
                type_idx.append(types[atom.GetSymbol()])
                atomic_number.append(atom.GetAtomicNum())
                aromatic.append(1 if atom.GetIsAromatic() else 0)
                hybridization = atom.GetHybridization()
                sp.append(1 if hybridization == HybridizationType.SP else 0)
                sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
                sp3.append(1 if hybridization == HybridizationType.SP3 else 0)

            z = torch.tensor(atomic_number, dtype=torch.long)

            # Every bond is stored in both directions.
            row, col, edge_type = [], [], []
            for bond in mol.GetBonds():
                start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                row += [start, end]
                col += [end, start]
                edge_type += 2 * [bonds[bond.GetBondType()]]

            edge_index = torch.tensor([row, col], dtype=torch.long)
            edge_type = torch.tensor(edge_type, dtype=torch.long)
            edge_attr = F.one_hot(edge_type,
                                  num_classes=len(bonds)).to(torch.float)

            # Sort edges by (source, target) so the layout is deterministic.
            perm = (edge_index[0] * N + edge_index[1]).argsort()
            edge_index = edge_index[:, perm]
            edge_type = edge_type[perm]
            edge_attr = edge_attr[perm]

            # Count, per atom, how many of its bonded neighbours are hydrogen.
            row, col = edge_index
            hs = (z == 1).to(torch.float)
            num_hs = scatter(hs[row], col, dim_size=N).tolist()

            x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(types))
            x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                              dtype=torch.float).t().contiguous()
            x = torch.cat([x1.to(torch.float), x2], dim=-1)

            y = target[i].unsqueeze(0)
            name = mol.GetProp('_Name')

            data = Data(x=x,
                        z=z,
                        pos=pos,
                        edge_index=edge_index,
                        edge_attr=edge_attr,
                        y=y,
                        name=name,
                        idx=i)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)

            data_list.append(data)

        torch.save(self.collate(data_list), self.processed_paths[0])
예제 #29
0
    def _get_molecule_database(self, molecule_database_src,
                               molecule_database_src_type):
        """Load molecular database and return it.
        Optionally return features if found in excel / csv file.

        Args:
            molecule_database_src (str):
                Source of molecular information. Can be a folder or a filepath.
                In case a folder is specified, all .pdb files in the folder
                are sequentially read.
                If a text file, each non-blank line is whitespace-separated:
                SMILES string (column 1), numeric property (column 2,
                optional).
                If an excel / csv file, columns prefixed with ``feature_``
                are features (``feature_smiles`` holds the SMILES strings and
                ``feature_name`` optional display names) and a column
                prefixed with ``response_`` holds the property value.
            molecule_database_src_type (str):
                Type of source, compared case-insensitively. Can be
                ['folder', 'directory', 'text', 'excel', 'csv'].

        Returns:
            (list(Molecule), np.ndarray or None)
                Returns a tuple. First element of tuple is the molecule_database.
                Second element is array of features of shape
                (len(molecule_database), n_features) or None if None found.

        Raises:
            FileNotFoundError: If ``molecule_database_src_type`` is not one of
                the recognised source types.
            UserWarning: If no molecule could be loaded from the source.

        """
        if not self.is_verbose:
            RDLogger.DisableLog('rdApp.*')

        # Normalise once instead of re-lowering in every comparison.
        src_type = molecule_database_src_type.lower()

        molecule_database = []
        features = None
        if src_type in ["folder", "directory"]:
            if self.is_verbose:
                print(f"Searching for *.pdb files in {molecule_database_src}")
            for molfile in glob(os.path.join(molecule_database_src, "*.pdb")):
                if self.is_verbose:
                    print(f"Loading {molfile}")
                try:
                    molecule_database.append(Molecule(mol_src=molfile))
                except LoadingError:
                    if self.is_verbose:
                        print(f"{molfile} could not be imported. Skipping")

        elif src_type == "text":
            if self.is_verbose:
                print(f"Reading SMILES strings from {molecule_database_src}")
            with open(molecule_database_src, "r") as fp:
                smiles_data = fp.readlines()
            for count, line in enumerate(smiles_data):
                # Assumes that the first column contains the smiles string
                line_fields = line.split()
                if not line_fields:
                    # Blank line (e.g. a trailing newline at the end of the
                    # file): nothing to load, and indexing it would raise
                    # IndexError.
                    continue
                smile = line_fields[0]
                mol_property_val = None
                if len(line_fields) > 1:
                    mol_property_val = float(line_fields[1])
                if self.is_verbose:
                    print(f"Processing {smile} "
                          f"({count + 1}/"
                          f"{len(smiles_data)})")
                mol_text = smile
                try:
                    molecule_database.append(
                        Molecule(
                            mol_smiles=smile,
                            mol_text=mol_text,
                            mol_property_val=mol_property_val,
                        ))
                except LoadingError:
                    if self.is_verbose:
                        print(f"{smile} could not be imported. Skipping")

        elif src_type in ["excel", "csv"]:
            if self.is_verbose:
                print(f"Reading molecules from {molecule_database_src}")
            database_df = (pd.read_excel(molecule_database_src,
                                         engine="openpyxl")
                           if src_type == "excel"
                           else pd.read_csv(molecule_database_src))
            # expects feature columns to be prefixed with feature_
            # e.g. feature_smiles
            feature_cols = [
                column for column in database_df.columns
                if column.split("_")[0] == "feature"
            ]
            database_feature_df = database_df[feature_cols]
            mol_names, mol_smiles, responses = None, None, None
            # Names and SMILES identify the molecule; they are removed from
            # the numeric feature table returned to the caller.
            if "feature_name" in feature_cols:
                mol_names = database_feature_df["feature_name"].values.flatten(
                )
                database_feature_df = database_feature_df.drop(
                    ["feature_name"], axis=1)
            if "feature_smiles" in feature_cols:
                mol_smiles = database_df["feature_smiles"].values.flatten()
                database_feature_df = database_feature_df.drop(
                    ["feature_smiles"], axis=1)

            response_col = [
                column for column in database_df.columns
                if column.split("_")[0] == "response"
            ]
            if len(response_col) > 0:
                # currently handles one response
                responses = database_df[response_col].values.flatten()
            # NOTE(review): when the file has no "feature_smiles" column,
            # mol_smiles is still None here and the loop below fails with a
            # TypeError; confirm whether a dedicated error is wanted.
            for mol_id, smile in enumerate(mol_smiles):
                if self.is_verbose:
                    print(f"Processing {smile} "
                          f"({mol_id + 1}/"
                          f"{database_df['feature_smiles'].values.size})")
                mol_text = mol_names[mol_id] if mol_names is not None else smile

                mol_property_val = responses[
                    mol_id] if responses is not None else None

                try:
                    molecule_database.append(
                        Molecule(
                            mol_smiles=smile,
                            mol_text=mol_text,
                            mol_property_val=mol_property_val,
                        ))
                except LoadingError:
                    if self.is_verbose:
                        print(f"{smile} could not be imported. Skipping")

            if len(database_feature_df.columns) > 0:
                features = database_feature_df.values
        else:
            raise FileNotFoundError(
                f"{molecule_database_src} could not be found. "
                f"Please enter valid folder name or path of a "
                f"text/excel/csv")
        if len(molecule_database) == 0:
            raise UserWarning("No molecular files found in the location!")
        return molecule_database, features
예제 #30
0
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import unittest
import os, sys, copy

import pickle

from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose, RGroupDecomposition, RGroupDecompositionParameters
from collections import OrderedDict

# the RGD code can generate a lot of warnings. disable them
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.warning")


class TestCase(unittest.TestCase):

  def test_multicores(self):
    cores_smi_easy = OrderedDict()
    cores_smi_hard = OrderedDict()

    #cores_smi_easy['cephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2')
    cores_smi_easy['cephem'] = Chem.MolFromSmarts('O=C1C([*:1])C2N1C(C(O)=O)=C([*:3])CS2')
    cores_smi_hard['cephem'] = Chem.MolFromSmarts('O=C1C([2*])([1*])[C@@H]2N1C(C(O)=O)=C([3*])CS2')

    #cores_smi_easy['carbacephem'] = Chem.MolFromSmiles('O=C1C([1*])[C@@H]2N1C(C(O)=O)=C([3*])CC2')
    cores_smi_easy['carbacephem'] = Chem.MolFromSmarts('O=C1C([1*])C2N1C(C(O)=O)=C([3*])CC2')
    cores_smi_hard['carbacephem'] = Chem.MolFromSmarts(