示例#1
0
def test_rmsd_xyz():
    """Compare the two ethane fixtures with ``rmsd.rmsd`` on the raw coordinates."""
    path_p = pathlib.PurePath(RESOURCE_PATH, "ethane.xyz")
    path_q = pathlib.PurePath(RESOURCE_PATH, "ethane_mini.xyz")

    # Only the coordinates matter here; the atom labels are discarded.
    _, coord_p = rmsd.get_coordinates_xyz(path_p)
    _, coord_q = rmsd.get_coordinates_xyz(path_q)

    expected = 0.33512
    observed = rmsd.rmsd(coord_p, coord_q)

    np.testing.assert_almost_equal(expected, observed, decimal=3)
示例#2
0
def load_data(
    data_dir="data/xyz/", ref_file="data/qm9-reference.csv", offset=0, query_size=100,
):
    """
    Load a window of molecules listed in a reference CSV file.

    Inputs:
        data_dir (str): Directory holding one ``<name>.xyz`` file per molecule
        ref_file (str): CSV file with a header row and a ``name`` column
        offset (int): Number of data rows to skip before the query window
        query_size (int): The number of rows to return

    Returns:
        atoms_list: List of chemical species for each molecule in query
        coord_list: List of species coordinates for each molecule in query
        charges: List with one charge per molecule in query (always 0 here)
        filenames: The ``name`` column of the selected reference rows
        reference: The selected reference rows as a DataFrame
    """
    # Row 0 is the CSV header and must be kept, so the skipped rows are
    # 1..offset inclusive. ``range(1, offset)`` skipped one row too few, which
    # made offset=0 and offset=1 return the same window.
    reference = pd.read_csv(
        ref_file, skiprows=range(1, offset + 1), nrows=query_size
    )

    atoms_list, coord_list, charges = [], [], []
    filenames = reference["name"]

    for name in filenames:
        xyz_path = os.path.join(data_dir, f"{name}.xyz")
        atoms, coords = rmsd.get_coordinates_xyz(xyz_path)

        charges.append(0)
        atoms_list.append(atoms)
        coord_list.append(coords)

    return atoms_list, coord_list, charges, filenames, reference
示例#3
0
def parse_xyz(filename):
    """Read the xyz file at ``filename`` and return ``get_inertia`` of its atoms and coordinates."""
    symbols, positions = rmsd.get_coordinates_xyz(filename)
    return get_inertia(symbols, positions)
示例#4
0
def load_data():
    """
    Read every molecule named in the qm9 reference table and return the
    fixed window of rows 110..209.

    Returns:
        atoms_list: atom labels per molecule in the window
        coord_list: coordinates per molecule in the window
        charges: one 0 per molecule in the window
        titles: molecule names in the window
        reference: the matching slice of the reference DataFrame
    """
    reference = pd.read_csv("../dataset-qm9/reference.csv")
    names = reference["name"]

    # Every xyz file is parsed first; the window is applied afterwards.
    parsed = []
    for name in names:
        atoms, coord = rmsd.get_coordinates_xyz(
            "../dataset-qm9/xyz/" + name + ".xyz"
        )
        parsed.append((name, atoms, coord))

    window = slice(10 + 100, 110 + 100)
    selected = parsed[window]

    titles = [name for name, _, _ in selected]
    atoms_list = [atoms for _, atoms, _ in selected]
    coord_list = [coord for _, _, coord in selected]
    charges = [0] * len(selected)
    reference = reference[window]

    return atoms_list, coord_list, charges, titles, reference
示例#5
0
def test_get_coordinates_xyz():
    """Check the first atom label and first coordinate row parsed from ethane.xyz."""
    ethane_path = pathlib.PurePath(RESOURCE_PATH, "ethane.xyz")
    atoms, coords = rmsd.get_coordinates_xyz(ethane_path)

    expected_symbol = "C"
    expected_position = [-0.98353, 1.81095, -0.0314]

    assert expected_symbol == atoms[0]
    assert expected_position == coords[0].tolist()
示例#6
0
def main():
    """
    Command-line entry point: pick a molecule and a model, then hand both to
    ``constant_energy``.

    ``--mol FILE`` loads the geometry from an xyz file; without it a built-in
    nine-atom geometry is used. ``--model`` selects the calculator
    (default ``"ethanol"``).
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--mol',
        action='store',
        default=None,
        help='Load molecule for live simulation',
        metavar="FILE",
    )
    cli.add_argument('--model', action='store', default="ethanol", help='')
    options = cli.parse_args()

    if options.mol is not None:
        # Atom labels from the file are mapped through cheminfo.convert.
        symbols, coordinates = rmsd.get_coordinates_xyz(options.mol)
        nuclear_charges = [cheminfo.convert(symbol) for symbol in symbols]
    else:
        # Built-in fallback geometry, one coordinate row per entry in
        # nuclear_charges.
        nuclear_charges = np.array([6, 6, 8, 1, 1, 1, 1, 1, 1])
        coordinates = np.array([
            [0.07230959, 0.61441211, -0.03115568],
            [-1.26644639, -0.27012846, -0.00720771],
            [1.11516977, -0.30732869, 0.06414394],
            [0.10673943, 1.44346835, -0.79573006],
            [-0.02687486, 1.19350887, 0.98075343],
            [-2.06614011, 0.38757505, 0.39276693],
            [-1.68213881, -0.60620688, -0.97804526],
            [-1.18668224, -1.07395366, 0.67075071],
            [1.37492532, -0.56618891, -0.83172035],
        ])

    model_calculator = get_calculator(options.model)
    constant_energy(nuclear_charges, coordinates, calculator=model_calculator)
示例#7
0
def prepare_xyz(filename, charge, header):
    """
    Build an input text from an xyz file and a header template.

    Args:
        filename: Path of the xyz file to read.
        charge: Value substituted into ``header`` via ``header.format(charge)``.
        header: Format string placed in front of the atom lines.

    Returns:
        The formatted header followed by the output of ``prepare_atoms``.
    """
    # Read the file the caller asked for; this used to be hard-coded to
    # "test.xyz", which silently ignored the ``filename`` argument.
    atoms, coordinates = rmsd.get_coordinates_xyz(filename)

    lines = prepare_atoms(atoms, coordinates)
    header = header.format(charge)

    return header + lines
示例#8
0
def prepare_xyz(filename, charge, header):
    """
    Build an input text from an xyz file and a header template.

    Args:
        filename: Path of the xyz file to read.
        charge: Value substituted into ``header`` via ``header.format(charge)``.
        header: Format string placed in front of the atom lines.

    Returns:
        The formatted header followed by the output of ``prepare_atoms``.
    """
    # Read the file the caller asked for; this used to be hard-coded to
    # "test.xyz", which silently ignored the ``filename`` argument.
    atoms, coordinates = rmsd.get_coordinates_xyz(filename)

    lines = prepare_atoms(atoms, coordinates)
    header = header.format(charge)

    return header + lines
示例#9
0
def test_reorder_qml():
    """
    Shuffle and rotate a molecule, then check that ``reorder_similarity``
    restores the atom order where ``reorder_hungarian`` does not.
    """
    filename_1 = pathlib.PurePath(RESOURCE_PATH, "CHEMBL3039407.xyz")

    p_atoms, p_coord = rmsd.get_coordinates_xyz(filename_1)

    # Reorder atoms with a fixed seed so the permutation is reproducible
    n_atoms = len(p_atoms)
    random_reorder = np.arange(n_atoms, dtype=int)
    np.random.seed(5)
    np.random.shuffle(random_reorder)

    q_atoms = copy.deepcopy(p_atoms)
    q_coord = copy.deepcopy(p_coord)
    q_atoms = q_atoms[random_reorder]
    q_coord = q_coord[random_reorder]

    # Mess up the distance matrix by rotating the molecule.
    # NOTE: np.cos/np.sin take radians, so this is a rotation by 180 rad, not
    # 180 degrees. Any non-trivial rotation serves the purpose; the value is
    # kept so the thresholds below stay valid.
    theta = 180.0
    rotation_y = np.array(
        [
            [np.cos(theta), 0, np.sin(theta)],
            [0, 1, 0],
            [-np.sin(theta), 0, np.cos(theta)],
        ]
    )

    q_coord = np.dot(q_coord, rotation_y)

    # Reorder with standard hungarian, this will fail reorder and give large
    # RMSD
    view_dist = rmsd.reorder_hungarian(p_atoms, q_atoms, p_coord, q_coord)
    q_atoms_dist = q_atoms[view_dist]
    q_coord_dist = q_coord[view_dist]
    _rmsd_dist = rmsd.kabsch_rmsd(p_coord, q_coord_dist)
    assert q_atoms_dist.tolist() == p_atoms.tolist()
    assert _rmsd_dist > 3.0

    # Reorder based on chemical similarity
    view = rmsd.reorder_similarity(p_atoms, q_atoms, p_coord, q_coord)
    q_atoms = q_atoms[view]
    q_coord = q_coord[view]

    # Calculate new RMSD with correct atom order
    _rmsd = rmsd.kabsch_rmsd(p_coord, q_coord)

    # Assert correct atom order
    assert q_atoms.tolist() == p_atoms.tolist()

    # Assert this is the same molecule. The original line was a bare
    # comparison without ``assert`` and therefore never checked anything.
    # A small absolute tolerance absorbs floating-point noise from the fit.
    assert _rmsd == pytest.approx(0.0, abs=1e-6)
示例#10
0
def main():
    """
    Optimize the geometry in ``examples/ethanol.xyz`` with BFGS (fmax=0.3)
    using the ``_deploy_`` calculator and dump the result to
    ``_tmp_molecule_optimize.xyz``.
    """
    calc = calculators.get_calculator("_deploy_", debug=False)
    symbols, positions = rmsd.get_coordinates_xyz("examples/ethanol.xyz")

    system = ase.Atoms(symbols, positions)
    system.set_calculator(calc)

    optimizer = BFGS(system)
    optimizer.run(fmax=0.3)

    dump_xyz(system, "_tmp_molecule_optimize.xyz")
示例#11
0
def main_md():
    """
    Print the potential energy of ``examples/ethanol.xyz`` under the
    ``_deploy_`` calculator, then optimize it with BFGS (fmax=0.5) and dump
    the result to ``_tmp_molecule_optimize.xyz``.
    """
    calc = calculators.get_calculator("_deploy_", debug=False)
    symbols, positions = rmsd.get_coordinates_xyz("examples/ethanol.xyz")

    system = ase.Atoms(symbols, positions)
    system.set_calculator(calc)

    # Energy of the starting geometry, before any optimization step.
    initial_energy = system.get_potential_energy()
    print(initial_energy)

    optimizer = BFGS(system)
    optimizer.run(fmax=0.5)

    dump_xyz(system, "_tmp_molecule_optimize.xyz")
示例#12
0
def main():
    """
    Debug entry point: evaluate one hard-coded butane geometry and exit.

    As written, the function loads the model under ``data/butane``, computes
    energy and force for ``data/butane/butane-1.xyz``, prints both and then
    calls ``quit()``. Everything after that call (the argparse-driven
    prediction from saved ``.npy`` model files) is currently unreachable.
    """


    read_model("data/butane")
    atoms, coordinates = rmsd.get_coordinates_xyz("data/butane/butane-1.xyz")

    energy, force = calculate(atoms, coordinates)

    print(energy)
    print(force)
	
    # NOTE(review): this exits the interpreter, so none of the code below
    # runs. It looks like a debugging shortcut -- confirm before removing.
    quit()


    # Unused: never passed to the argument parser.
    description = """
"""

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', action='store', help='List of molecules', metavar='listfile')
    parser.add_argument('-m', '--model', action='store', help='Output model in npy format', metavar='file')
    args = parser.parse_args()


    # Load model
    # PARAMETERS is later read through ``.item().get(...)``, i.e. it is
    # treated as a 0-d array wrapping a dict.
    # NOTE(review): loading such an object array may require
    # ``allow_pickle=True`` on recent NumPy versions -- confirm.
    PARAMETERS = np.load(args.model + ".parameters.npy")
    train_representations = np.load(args.model + ".representations.npy")
    train_displaced_representations = np.load(args.model + ".displaced_representations.npy")
    train_alphas = np.load(args.model + ".alphas.npy")


    # Get molecule filenames
    f = open(args.filename, 'r')
    molecules = f.readlines()
    molecules = [mol.strip() for mol in molecules]
    f.close()

    # Directory part of the list file, with a trailing "/". A filename that
    # contains no "/" yields "/" here (the filesystem root).
    DIRECTORY = args.filename.split("/")
    DIRECTORY = "/".join(DIRECTORY[:-1]) + "/"

    # Init all the rep lists
    # Only list_rep and list_disp_rep are filled below; the others are unused.
    list_atoms = []
    list_charges = []
    list_coordinates = []
    list_energies = []
    list_forces = []
    list_rep = []
    list_disp_rep = []
    list_disp_rep5 = []


    # HYPER PARAMETERS
    CUT_DISTANCE = PARAMETERS.item().get('cut_distance')
    KERNEL_ARGS = PARAMETERS.item().get('kernel_args')
    DX = PARAMETERS.item().get('dx')
    NMAX = PARAMETERS.item().get('max_atoms')


    # read coordinates
    for filename in molecules:

        atoms, coordinates = rmsd.get_coordinates_xyz(DIRECTORY + filename + ".xyz")
        charges = [NUCLEAR_CHARGE[atom] for atom in atoms]

        rep = generate_representation(coordinates, charges, max_size=NMAX, cut_distance=CUT_DISTANCE)
        disp_rep = generate_displaced_representations(coordinates, charges, max_size=NMAX, cut_distance=CUT_DISTANCE, dx=DX)

        list_rep.append(rep)
        list_disp_rep.append(disp_rep)

        # Stops after the first molecule in the list.
        break


    list_rep = np.array(list_rep)
    list_disp_rep = np.array(list_disp_rep)
	
    # generate kernel
    kernel_energies, kernel_forces = get_kernel(
        train_representations,
        list_rep,
        train_displaced_representations,
        list_disp_rep,
		kernel_args=KERNEL_ARGS,
		dx=DX)

    # Keep only the first entry of each returned kernel collection.
    kernel_energies = kernel_energies[0]
    kernel_forces = kernel_forces[0]


    # predict
    energies = np.dot(kernel_energies.T, train_alphas)
    forces = np.dot(kernel_forces.T, train_alphas)


    print(energies)
    print(forces)
示例#13
0
def prepare_training_data_protonafinity():
    """
    Build representations for the neutral/protonated molecule pairs listed in
    ``data/dataset-proton-affinity/data/pm3_properties.csv``.

    For every row the neutral structure (``xyz<MoleculeIdx>_n.xyz``) and the
    protonated structure (``xyz<MoleculeIdx>_<ProtonatedIdx>.xyz``) are read
    and passed to ``generate_fchl_acsf``. In the protonated structure the
    atom carrying the positive formal charge (located via the protonated
    SMILES) is relabelled with the placeholder element 12.

    Returns:
        n_representations: array of neutral-state representations
        p_representations: array of protonated-state representations
        n_coord_list: coordinates of each neutral structure
        p_coord_list: coordinates of each protonated structure
        n_atoms_list: converted atom labels of each neutral structure
        p_atoms_list: converted atom labels of each protonated structure
        energies: array built from the ``NeutralEnergy`` and
            ``ProtonatedEnergy`` columns, in that order
    """

    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        # 12 is the placeholder label assigned to the charged atom below
        "elements": [1, 6, 7, 8, 9, 12]
    }

    dirprefix = "data/dataset-proton-affinity/data/"
    xyz_dir = dirprefix + "pm3.cosmo.mop/"
    df = pd.read_csv(dirprefix + "pm3_properties.csv", sep=",")

    n_rows = df.shape[0]

    # column names
    col_neuidx = "MoleculeIdx"
    col_proidx = "ProtonatedIdx"
    col_prosmi = "ProtonatedSmiles"
    col_neueng = "NeutralEnergy"
    col_proeng = "ProtonatedEnergy"

    # Collect energies
    energies = np.array([df[col_neueng], df[col_proeng]])

    # Protonated representation
    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    # Neutral representation
    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    for _, row in tqdm.tqdm(df.iterrows(),
                            desc="Preparing FCHL19",
                            total=n_rows,
                            **TQDM_OPTIONS):

        nidx = row[col_neuidx]
        pidx = row[col_proidx]

        nname = f"xyz{nidx}_n.xyz"
        pname = f"xyz{nidx}_{pidx}.xyz"

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(xyz_dir + nname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(xyz_dir + pname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Find protonated atom
        smiles = row[col_prosmi]
        molobj = cheminfo.smiles_to_molobj(smiles)

        # The message was missing its f-prefix and printed "{smiles}" verbatim.
        assert molobj is not None, f"Molobj failed for {smiles}"

        atom_charges = np.array(
            [atom.GetFormalCharge() for atom in molobj.GetAtoms()]
        )
        charged_idxs, = np.where(atom_charges > 0)

        assert len(charged_idxs) == 1, \
            f"Should only be one charged atom in {pname}"

        charged_idx = charged_idxs[0]

        # Relabel the charged atom with the placeholder element.
        # NOTE(review): the index comes from the SMILES atom order and is
        # applied to the xyz atom order -- assumes both agree; confirm.
        atoms[charged_idx] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation; the neutral one was appended
        # here by mistake, making both arrays identical.
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, energies
示例#14
0
def prepare_training_data_qmepa890():
    """
    Build representations for the neutral/protonated molecule pairs of the
    qmepa890 data set (``data/qmepa890/data.csv``).

    For every row the neutral structure ``<id>.xyz`` and the protonated
    structure ``<id>_+.xyz`` are read and passed to ``generate_fchl_acsf``.
    In the protonated structure the atom at the 1-based proton index from
    the CSV is relabelled with the placeholder element 12.

    Returns:
        n_representations: array of neutral-state representations
        p_representations: array of protonated-state representations
        n_coord_list: coordinates of each neutral structure
        p_coord_list: coordinates of each protonated structure
        n_atoms_list: converted atom labels of each neutral structure
        p_atoms_list: converted atom labels of each protonated structure
        proton_idxs: array with CSV column 2 (index of the proton)
        energies: DataFrame with the remaining CSV columns (3 onwards)
        atomization_list: array with the summed free-atom energies of each
            neutral structure, converted with ``au2kcal``
    """

    # Table 5. Free atom energies from DFT/PBE0/def2TZVP.
    # H   C   N   O   S
    # Multiplicity    2   3   4   3   3
    # Energy / Eh     −0.501036   −37.8054    −54.5438    −75.0186    −397.974

    au2kcal = 627.518135759111

    atom_energies = {
        "H": -0.501036 * au2kcal,
        "C": -37.8054 * au2kcal,
        "N": -54.5438 * au2kcal,
        "O": -75.0186 * au2kcal,
        "S": -397.974 * au2kcal,
    }

    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        # 12 is the placeholder label assigned to the proton below
        "elements": [1, 6, 7, 8, 12]
    }

    dirprefix = "data/qmepa890/"
    structure_dir = dirprefix + "structures/"
    filename = dirprefix + "data.csv"

    # 1. File ID (e.g. 0415 means the information pertains to the files `0415.xyz` and `0415_+.xyz`)
    # 2. Index of the proton (in the `XXXX_+.xyz` file listed in the same row)
    # 3. Gas-phase energy of neutral molecule plus thermal corrections from vibrational analysis
    # 4. Gas-phase energy of protonated molecule plus thermal corrections from vibrational analysis
    # 5. Gas-phase energy of neutral molecule
    # 6. Gas-phase energy of protonated molecule
    # 7. Energy of neutral molecule using SMD implicit solvent model
    # 8. Energy of protonated molecule using SMD implicit solvent model
    # 9. PM6 heat-of-formation of neutral molecule using COSMO implicit solvent model
    # 10. PM6 heat-of-formation of protonated molecule using COSMO implicit solvent model

    df = pd.read_csv(filename, sep=",", header=None)

    molecule_names = df.iloc[:, 0]
    proton_idxs = df.iloc[:, 1]
    energies = df.iloc[:, 2:]

    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    atomization_list = []

    for h_idx, name in zip(proton_idxs, molecule_names):

        # File IDs are zero-padded to four characters on disk
        name = str(name).zfill(4)
        print(f"representing {name}")

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(structure_dir + name + ".xyz")

        # Sum of free-atom energies, looked up by element symbol before the
        # labels are converted.
        atomization_list.append(sum(atom_energies[atom] for atom in atoms))

        atoms = [cheminfo.convert_atom(atom) for atom in atoms]
        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(structure_dir + name +
                                                "_+.xyz")
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]
        # h_idx is 1-based; relabel the proton with the placeholder element
        atoms[h_idx - 1] = 12
        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation; the neutral one was appended
        # here by mistake, making both arrays identical.
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    proton_idxs = np.array(proton_idxs)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    atomization_list = np.array(atomization_list)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, proton_idxs, energies, atomization_list
示例#15
0
文件: train.py 项目: charnley/qml-md
def main():
    """
    Train a kernel model from a list of molecules and dump it as npy files.

    Command line:
        -f/--filename  text file with one molecule name per line; for each
                       name ``<name>.xyz`` and ``<name>.energy`` are read
                       from the directory of the list file
        -d/--dump      prefix for the output files ``.alphas``,
                       ``.representations``, ``.displaced_representations``
                       and ``.parameters``
        --test         after training, print whether the training-set
                       energy and force errors are below fixed thresholds
        --optimize     accepted but currently unused

    Each ``.energy`` file holds the energy on its first line followed by one
    comma-separated force row per line.
    """

    description = """
Based on a list of molecules, train a representation-set and alpha set.
Output the npy files
"""

    import argparse
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-f',
                        '--filename',
                        action='store',
                        help='List of molecules',
                        metavar='listfile')
    parser.add_argument('-d',
                        '--dump',
                        action='store',
                        help='Output model in npy format',
                        metavar='file')

    parser.add_argument('--test', action='store_true')
    parser.add_argument('--optimize', action='store_true')

    args = parser.parse_args()

    # Get molecule filenames
    with open(args.filename, 'r') as listfile:
        molecules = [mol.strip() for mol in listfile]

    # Directory of the list file, including the trailing "/". A bare filename
    # now resolves to the current directory; splitting on "/" and re-joining
    # turned it into "/" (the filesystem root).
    directory, separator, _ = args.filename.rpartition("/")
    DIRECTORY = directory + separator

    # Init all the rep lists
    list_atoms = []
    list_charges = []
    list_coordinates = []
    list_energies = []
    list_forces = []
    list_rep = []
    list_disp_rep = []

    # HYPER PARAMETERS
    CUT_DISTANCE = 1e6
    KERNEL_ARGS = {
        "verbose": False,
        "cut_distance": CUT_DISTANCE,
        "kernel": "gaussian",
        "kernel_args": {
            "sigma": [0.64],
        },
    }
    DX = 0.005

    # read coordinates
    for filename in molecules:

        atoms, coordinates = rmsd.get_coordinates_xyz(DIRECTORY + filename +
                                                      ".xyz")
        nuclear_charges = [NUCLEAR_CHARGE[atom] for atom in atoms]

        # First line is the energy, every following line one force row.
        # The context manager closes the handle, which used to leak once per
        # molecule.
        with open(DIRECTORY + filename + ".energy", 'r') as energy_file:
            energy = float(next(energy_file))
            force = [line.split(",") for line in energy_file]
        force = np.array(force, dtype=float)

        list_atoms.append(atoms)
        list_charges.append(nuclear_charges)
        list_coordinates.append(coordinates)
        list_energies.append(energy)
        list_forces.append(force)

    # Calculate NMAX hyperparameter (largest atom count in the set)
    NMAX = np.max([len(x) for x in list_atoms])

    # Save model parameters
    PARAMETERS = {
        "kernel_args": KERNEL_ARGS,
        "cut_distance": CUT_DISTANCE,
        "max_atoms": NMAX,
        "dx": DX
    }

    # Calculate representations
    for charges, coordinates in zip(list_charges, list_coordinates):

        rep = generate_representation(coordinates,
                                      charges,
                                      max_size=NMAX,
                                      cut_distance=CUT_DISTANCE)
        disp_rep = generate_displaced_representations(
            coordinates,
            charges,
            max_size=NMAX,
            cut_distance=CUT_DISTANCE,
            dx=DX)

        list_rep.append(rep)
        list_disp_rep.append(disp_rep)

    # Only the arrays used below are converted; the atom and coordinate lists
    # are not needed as arrays.
    list_energies = np.array(list_energies)
    list_forces = np.array(list_forces)
    list_rep = np.array(list_rep)
    list_disp_rep = np.array(list_disp_rep)

    # Hack, easy way to normalize energies (same molecule)
    avg = np.sum(list_energies) / len(list_energies)
    list_energies -= avg

    # hartree / bohr to hartree / aangstroem
    list_forces *= 1.0 / 0.529177249

    # generate train / test views
    # Every molecule is used for training; there is no validation split yet.
    view_train = np.arange(len(molecules))

    # TODO cross-validation of hyper-parameter optimization

    # generate kernel
    kernel_train_energies, kernel_train_deriv = wrapper.get_kernel(
        list_rep[view_train],
        list_rep[view_train],
        list_disp_rep[view_train],
        list_disp_rep[view_train],
        dx=DX,
        kernel_args=KERNEL_ARGS)

    # One sigma was requested, so take the first kernel of each result
    kernel_train_energies = kernel_train_energies[0]
    kernel_train_deriv = kernel_train_deriv[0]

    # generate alphas
    alphas = wrapper.get_alphas(kernel_train_energies, kernel_train_deriv,
                                list_energies[view_train],
                                list_forces[view_train])

    # dump the model
    np.save(args.dump + ".alphas", alphas)
    np.save(args.dump + ".representations", list_rep)
    np.save(args.dump + ".displaced_representations", list_disp_rep)
    np.save(args.dump + ".parameters", PARAMETERS)

    # self test
    # The flag is declared as ``--test``; ``args.selftest`` does not exist and
    # raised AttributeError after the model had been trained and dumped.
    if args.test:
        energy_valid = np.dot(kernel_train_energies.T, alphas)
        force_valid = np.dot(kernel_train_deriv.T, alphas)
        print(
            mae(list_energies[view_train], energy_valid) < 0.08,
            "Error in operator test energy")
        print(
            mae(list_forces[view_train].flatten(), force_valid) < 0.1,
            "Error in  operator test force")

    return