示例#1
0
def main():
    """Canonicalize the SMILES keys of a JSON dataset and save the result.

    Command line options:
        --scratch DIR   output prefix directory (default ``_tmp_``)
        --json FILE     input JSON mapping SMILES -> data
        -j/--procs int  parsed but not used by this function

    Every key is parsed with ``cheminfo.smiles_to_molobj`` and rewritten
    with ``cheminfo.molobj_to_smiles(..., remove_hs=True)``.  Entries are
    dropped (with a printed message) when the SMILES cannot be parsed, when
    the rewritten SMILES contains ``.`` (multiple fragments), or when
    ``is_mol_allowed`` rejects the atoms.  The surviving mapping is written
    to ``<scratch>molecule_data`` with both ``misc.save_json`` and
    ``misc.save_obj``.

    NOTE: if two input keys produce the same SMILES, the later entry
    overwrites the earlier one.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="DIR",
                        default="_tmp_")
    parser.add_argument('--json', action='store', help='', metavar="FILE")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='parallelize',
                        metavar="int",
                        default=0,
                        type=int)

    args = parser.parse_args()

    # The prefix is joined by plain concatenation below, so make sure it
    # ends with a separator.  An empty prefix is kept empty (current
    # directory) instead of crashing on ``""[-1]``.
    if args.scratch and not args.scratch.endswith("/"):
        args.scratch += "/"

    data = misc.load_json(args.json)

    canonical_data = {}

    for key in list(data.keys()):

        molobj, _ = cheminfo.smiles_to_molobj(key)

        if molobj is None:
            print("error none mol:", key)
            continue

        smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # A dot separates disconnected fragments in SMILES
        if "." in smiles:
            print("error multi mol:", smiles)
            continue

        atoms = cheminfo.molobj_to_atoms(molobj)

        if not is_mol_allowed(atoms):
            print("error heavy mol:", smiles)
            continue

        canonical_data[smiles] = data[key]

    misc.save_json(args.scratch + "molecule_data", canonical_data)
    misc.save_obj(args.scratch + "molecule_data", canonical_data)

    return
示例#2
0
def clean_data(listdata):
    """Group measured values by SMILES as rewritten by ``cheminfo``.

    Each row is indexed positionally: column 1 is the SMILES string and
    column 3 the value (converted with ``float``).  Column 0 is read but
    not used.  Rows whose SMILES cannot be parsed are reported and skipped;
    rows rejected by ``is_mol_allowed`` are skipped silently.

    Prints how often each atom type occurs among the kept rows, followed by
    the number of distinct molecules.

    Returns:
        dict mapping SMILES -> list of float values.
    """

    values_by_smiles = {}
    seen_atoms = []

    for row in listdata:

        # columns: 0 = identifier (unused), 1 = SMILES, 3 = value
        _row_id, raw_smiles, raw_value = [row[i] for i in (0, 1, 3)]
        value = float(raw_value)

        molobj, _ = cheminfo.smiles_to_molobj(raw_smiles)

        if molobj is None:
            print("error:", raw_smiles)
            continue

        canonical = cheminfo.molobj_to_smiles(molobj, remove_hs=True)
        atoms = cheminfo.molobj_to_atoms(molobj)

        # filter for organic chemistry
        if not is_mol_allowed(atoms):
            continue

        seen_atoms.extend(atoms)
        values_by_smiles.setdefault(canonical, []).append(value)

    unique_atoms, occurrences = np.unique(seen_atoms, return_counts=True)

    for pair in zip(unique_atoms, occurrences):
        print(*pair)

    print("Total molecules", len(values_by_smiles))

    return values_by_smiles
示例#3
0
def test_dot():
    """Print the sum of a fingerprint bitmap next to its self dot product.

    Builds the fingerprint of ``Oc1ccccc1`` with ``get_rdkitfp``, converts
    it with ``fp_to_bitmap`` and prints the bitmap, then prints
    ``np.sum(bits)`` and ``np.dot(bits, bits)`` side by side.
    """

    molobj, _ = cheminfo.smiles_to_molobj("Oc1ccccc1")
    bitmap = fp_to_bitmap(get_rdkitfp(molobj))

    print(list(bitmap))

    bits = np.array(bitmap, dtype=int)

    # presumably the bitmap holds only 0/1, in which case both numbers
    # agree -- verify against fp_to_bitmap
    total = np.sum(bits)
    self_dot = np.dot(bits, bits)

    print(total, self_dot)
示例#4
0
def filter_dict(molecules):
    """Remove entries that fail the molecule or value filter, in place.

    Keys are SMILES strings.  An entry is deleted (and reported on stdout)
    when ``filter_molobj`` rejects the parsed molecule or ``filter_value``
    rejects the stored value.  Keys that cannot be parsed at all are left
    in the dictionary untouched.  The largest atom count among the kept,
    parsable molecules is printed at the end.

    Returns:
        The same ``molecules`` object that was passed in.
    """

    largest = 0

    # Snapshot the keys so entries can be deleted while looping
    for smiles in list(molecules.keys()):

        molobj, _ = cheminfo.smiles_to_molobj(smiles)

        if molobj is None:
            continue

        accepted = filter_molobj(molobj)

        if not accepted:
            del molecules[smiles]
            print(smiles, accepted)
            continue

        accepted = filter_value(molecules[smiles])

        if not accepted:
            print(accepted, smiles, molecules[smiles])
            del molecules[smiles]
            continue

        # Report
        largest = max(largest, len(cheminfo.molobj_to_atoms(molobj)))

    print("max atoms: ", largest)

    return molecules
def clean_data(df, scratch):
    """Group values from a dataframe by SMILES and save them to disk.

    Args:
        df: pandas DataFrame with a ``smiles`` column and an ``mpC`` column.
            273.15 is added to ``mpC`` (presumably a Celsius -> Kelvin
            conversion of a melting point; confirm against the dataset).
        scratch: path prefix for the output.  It is joined by plain string
            concatenation, so it must already end with a path separator.

    Rows whose SMILES cannot be parsed are reported and skipped.  The atom
    type counts over all parsed rows are printed, and the resulting
    ``{smiles: [values, ...]}`` mapping is written to
    ``<scratch>molecule_data`` with ``misc.save_obj`` and
    ``misc.save_json``.

    Returns:
        None
    """

    # NOTE: the original started with an unused ``df.iloc[1]`` lookup,
    # which raised IndexError for frames with fewer than two rows.

    data = {}

    atom_types = []

    for _, row in df.iterrows():

        smi = row.smiles
        value = row.mpC + 273.15

        molobj, _status = cheminfo.smiles_to_molobj(smi)

        if molobj is None:
            print("error:", smi)
            continue

        smi = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

        # Atoms
        atoms = cheminfo.molobj_to_atoms(molobj)
        atom_types += list(atoms)

        data.setdefault(smi, []).append(value)

    atom_types, counts = np.unique(atom_types, return_counts=True)

    for atom, count in zip(atom_types, counts):
        print(atom, count)

    misc.save_obj(scratch + "molecule_data", data)
    misc.save_json(scratch + "molecule_data", data)

    return
示例#6
0
def test_kernel():
    """Time two Jaccard kernel implementations on structures from an SDF.

    Reads up to 5000 structures from ``_tmp_bing_bp_/structures.sdf.gz``,
    converts them with ``molobjs_to_fps`` and prints the wall-clock time and
    result of ``bitmap_jaccard_kernel`` and of
    ``bitmap_kernels.symmetric_jaccard_kernel``.
    """

    from itertools import islice

    max_structures = 5000

    # islice stops at the end of the file, whereas calling next() a fixed
    # number of times raised StopIteration for smaller files.  (A hard-coded
    # SMILES list used to be built here but was overwritten right away.)
    structures = cheminfo.read_sdffile("_tmp_bing_bp_/structures.sdf.gz")
    molobjs = list(islice(structures, max_structures))

    init = time.time()
    vectors = molobjs_to_fps(molobjs)

    print("init", time.time() - init)

    time_pykernel = time.time()
    kernel = bitmap_jaccard_kernel(vectors)
    print("pykernel", time.time() - time_pykernel)
    print(kernel)

    # Drop the first kernel before the second one is built
    del kernel

    n_items = vectors.shape[0]

    # The second implementation receives the transposed vectors as ints
    vectors = vectors.T
    vectors = np.array(vectors, dtype=int)

    time_fkernel = time.time()
    kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, vectors)
    print("fokernel", time.time() - time_fkernel)
    print(kernel)

    return
示例#7
0
def prepare_training_data_protonafinity():
    """Build representations for neutral/protonated molecule pairs.

    Reads ``pm3_properties.csv`` from ``data/dataset-proton-affinity/data/``
    and, for every row, loads the neutral (``xyz<N>_n.xyz``) and protonated
    (``xyz<N>_<P>.xyz``) geometries from the ``pm3.cosmo.mop/`` subdirectory.
    Both are passed to ``generate_fchl_acsf`` with the parameters defined
    below.  In the protonated structure the single positively charged atom
    (located via the formal charges of the ``ProtonatedSmiles`` molecule) is
    relabelled as element 12 before the representation is generated.

    Returns:
        tuple of
            n_representations (np.ndarray): neutral representations
            p_representations (np.ndarray): protonated representations
            n_coord_list (list): neutral coordinates, one entry per row
            p_coord_list (list): protonated coordinates, one entry per row
            n_atoms_list (list): neutral atom lists
            p_atoms_list (list): protonated atom lists (charged atom = 12)
            energies (np.ndarray): shape (2, n_rows); row 0 is
                ``NeutralEnergy``, row 1 is ``ProtonatedEnergy``

    Raises:
        AssertionError: if the protonated SMILES cannot be parsed or does
            not contain exactly one positively charged atom.
    """

    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        # 12 is the label given to the protonated atom further down
        "elements": [1, 6, 7, 8, 9, 12]
    }

    dirprefix = "data/dataset-proton-affinity/data/"
    xyzprefix = dirprefix + "pm3.cosmo.mop/"
    filename = dirprefix + "pm3_properties.csv"
    df = pd.read_csv(filename, sep=",")

    n_rows = df.shape[0]

    # column names
    col_neuidx = "MoleculeIdx"
    col_proidx = "ProtonatedIdx"
    col_prosmi = "ProtonatedSmiles"
    col_neueng = "NeutralEnergy"
    col_proeng = "ProtonatedEnergy"

    # Collect energies
    energies = np.array([df[col_neueng], df[col_proeng]])

    # Protonated representation
    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    # Neutral representation
    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    for _, row in tqdm.tqdm(df.iterrows(),
                            desc="Preparing FCHL19",
                            total=n_rows,
                            **TQDM_OPTIONS):

        nidx = row[col_neuidx]
        pidx = row[col_proidx]

        nname = f"xyz{nidx}_n.xyz"
        pname = f"xyz{nidx}_{pidx}.xyz"

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(xyzprefix + nname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(xyzprefix + pname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Find protonated atom
        smiles = row[col_prosmi]

        # Every other caller in this file unpacks a (molobj, status) pair
        # from smiles_to_molobj; a bare molobj is accepted as well.
        parsed = cheminfo.smiles_to_molobj(smiles)
        molobj = parsed[0] if isinstance(parsed, tuple) else parsed

        assert molobj is not None, f"Molobj failed for {smiles}"

        atom_charges = np.array(
            [atom.GetFormalCharge() for atom in molobj.GetAtoms()])
        charged_idxs, = np.where(atom_charges > 0)

        assert len(charged_idxs) == 1, \
            f"Should only be one charged atom in {pname}"

        charged_idx = charged_idxs[0]

        # Relabel the charged atom as element 12.
        # NOTE(review): assumes the SMILES atom order matches the atom order
        # of the xyz file -- confirm against the dataset.
        atoms[charged_idx] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # The original appended the neutral representation here by mistake
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    return (n_representations, p_representations, n_coord_list,
            p_coord_list, n_atoms_list, p_atoms_list, energies)
示例#8
0
def main():
    """Print several fingerprint similarity measures for one molecule pair.

    Compares ``Oc1ccccc1`` with ``Nc1ccccc1`` using, in order:
    the ``get_rdkitfp`` fingerprints (RDKit ``FingerprintSimilarity``,
    ``jaccard_index`` and ``dice_coefficient`` on the bitmaps), the Morgan
    bit vectors (``jaccard_index`` and ``FingerprintSimilarity``) and the
    ``get_morgan`` fingerprints (``DiceSimilarity``).
    """

    # Other pairs tried here: 'c1ccccn1' / 'c1ccco1' and 'CCO' / 'CCN'
    smiles1, smiles2 = 'Oc1ccccc1', 'Nc1ccccc1'

    molobj1, _ = cheminfo.smiles_to_molobj(smiles1)
    molobj2, _ = cheminfo.smiles_to_molobj(smiles2)

    # --- get_rdkitfp fingerprints ---------------------------------------
    fp1 = get_rdkitfp(molobj1)
    fp2 = get_rdkitfp(molobj2)
    bm1 = fp_to_bitmap(fp1)
    bm2 = fp_to_bitmap(fp2)

    print(bm1)
    print()

    print(rdkit.DataStructs.FingerprintSimilarity(fp1, fp2))
    print(jaccard_index(bm1, bm2))
    print(dice_coefficient(bm1, bm2))
    print()

    # --- Morgan bit vectors ---------------------------------------------
    morgan_options = {"nBits": 1024 * 5, "useFeatures": True}
    fp1 = AllChem.GetMorganFingerprintAsBitVect(molobj1, 2, **morgan_options)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(molobj2, 2, **morgan_options)
    bm1 = fp_to_bitmap(fp1)
    bm2 = fp_to_bitmap(fp2)

    print(jaccard_index(bm1, bm2))
    print(rdkit.DataStructs.FingerprintSimilarity(fp1, fp2))

    # --- get_morgan fingerprints ----------------------------------------
    fp1 = get_morgan(molobj1)
    fp2 = get_morgan(molobj2)
    print(AllChem.DataStructs.DiceSimilarity(fp1, fp2))

    # molobjs = cheminfo.read_sdffile("_tmp_bing_bp_/structures.sdf.gz")
    # molobjs = [next(molobjs) for _ in range(20)]
    #
    # fingerprints = molobjs_to_fps(molobjs, procs=2)
    # kernel = fingerprints_to_kernel(fingerprints, fingerprints, procs=2, similarity=dice_similarity)
    #
    # print(kernel)
示例#9
0
def _first_key_with(keys, *fragments):
    """Return the first key containing every fragment, or None."""
    return next(
        (key for key in keys if all(part in key for part in fragments)),
        None)


def parse_molandprop(*args, debug=False, **kwargs):
    """Turn a (molobj, props) record into an optimized molobj and a value.

    Args:
        *args: either ``molobj, props`` as two positional arguments or a
            single ``(molobj, props)`` pair.
        debug: print a message when a multi-fragment SMILES is ignored.
        **kwargs: accepted and ignored.

    The record is rejected when the molobj is None, when ``props`` has no
    ``"SMILES"`` entry, when that SMILES contains ``.``, when optimization
    returns status 5 twice (second attempt rebuilt from the SMILES), when
    the ``{measured}`` property contains ``<`` or ``>``, when one of the
    expected property keys is missing, or when the unit is neither
    ``"Celsius"`` nor ``"K"``.  Celsius values are shifted by 273.15.

    Returns:
        ``(molobj, prop_value)`` on success, ``(None, None)`` otherwise.
    """

    if len(args) > 1:
        molobj, props = args[0], args[1]
    else:
        molobj, props = args[0]

    if molobj is None:
        return None, None

    keys = props.keys()

    if "SMILES" not in keys:
        return None, None

    prop_smiles = props["SMILES"]

    # Ignore multi molecules
    if "." in prop_smiles:
        if debug:
            print(f"ignore: {prop_smiles}")
        return None, None

    # NOTE: size and carbon-count filters used to live here but were
    # disabled; the atom list they needed is therefore no longer computed.

    # Add hydrogens and optimize structure
    molobj = cheminfo.molobj_add_hydrogens(molobj)
    status = cheminfo.molobj_optimize(molobj)

    # if unconverged
    if status == 5:

        # try the smiles
        molobj, status = cheminfo.smiles_to_molobj(prop_smiles)
        if molobj is None:
            print("error", props)
            return None, None

        molobj = cheminfo.molobj_add_hydrogens(molobj)
        status = cheminfo.molobj_optimize(molobj)

        if status == 5:
            print("error", props)
            return None, None

    key_ref = _first_key_with(keys, "{measured}")
    if key_ref is None:
        # Reject the record instead of raising IndexError
        print("error missing measured value", props)
        return None, None

    # Open-ended measurements ("<x" / ">x") are not usable
    raw_value = str(props[key_ref])
    if "<" in raw_value or ">" in raw_value:
        return None, None

    key_value = _first_key_with(keys, "measured, converted")
    key_unit = _first_key_with(keys, "UNIT", "Point")

    if key_value is None or key_unit is None:
        print("error missing converted value or unit", props)
        return None, None

    prop_unit = props[key_unit]
    prop_value = props[key_value]

    if prop_unit == "Celsius":
        prop_value += 273.15
    elif prop_unit == "K":
        pass
    else:
        print("error unknown unit", prop_unit, props)
        return None, None

    return molobj, prop_value