Example #1
def fdiff_grad(X, trainer, dx=0.001):
    """
    finite diff gradient
    """
    natoms = len(X)
    g = np.zeros((natoms, 3))
    X0 = X.copy()
    X1 = X0.copy()

    Xplus, Xminus = [], []
    y = []  # dummy labels; only the predicted energies are used
    for i in range(natoms):
        for j in range(3):
            X1[:, :] = X0[:, :]
            X1[i, j + 1] += dx
            Xplus.append(X1.copy())
            X1[:, :] = X0[:, :]
            X1[i, j + 1] -= dx
            Xminus.append(X1.copy())
            y.append(0.0)

    plus = RawDataset(Xplus, y)
    minus = RawDataset(Xminus, y)

    eplus = np.array(trainer.predict(plus))
    eminus = np.array(trainer.predict(minus))

    g = (eplus - eminus) / (2.0 * dx)

    return g
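A minimal, self-contained check of the same central-difference idea, using a toy quadratic energy in place of trainer.predict (toy_energy and toy_fdiff_grad are illustrative names, not part of khan):

import numpy as np

def toy_energy(X):
    # stand-in for trainer.predict on a single geometry: sum of squared coordinates
    return float(np.sum(X[:, 1:] ** 2))

def toy_fdiff_grad(X, dx=0.001):
    # central difference dE/dx_ij = (E(x + dx) - E(x - dx)) / (2 * dx)
    natoms = len(X)
    g = np.zeros(natoms * 3)
    for i in range(natoms):
        for j in range(3):
            Xp, Xm = X.copy(), X.copy()
            Xp[i, j + 1] += dx
            Xm[i, j + 1] -= dx
            g[i * 3 + j] = (toy_energy(Xp) - toy_energy(Xm)) / (2.0 * dx)
    return g

# column 0 is the atom type, columns 1-3 are x, y, z
X = np.array([[1.0, 0.1, 0.2, 0.3],
              [6.0, 1.0, 1.1, 1.2]])
analytic = (2.0 * X[:, 1:]).reshape(-1)
assert np.allclose(toy_fdiff_grad(X), analytic, atol=1e-6)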
Example #2
def write_reaction_data(name, X, E, trainer):
    """
    Write some results about this reaction to a file
    """

    rd = RawDataset(X, E)
    grad = list(trainer.coordinate_gradients(rd))
    predictions = trainer.predict(rd)

    # write a table of
    # distance along path, dft energy, ani1 energy, gradient norm, g.dx

    ngeoms = len(X)

    out_str = ["# dist, dft energy, ani1 energy, grms, g.dx"]
    dist = 0.0
    for i in range(ngeoms):
        carts = X[i][:, 1:]
        natoms, ncarts = grad[i].shape
        gxyz = grad[i].reshape(natoms * ncarts)

        #       fdiff gradient tests
        #        gfd = fdiff_grad(X[i], trainer)
        #        gerr = gfd - gxyz
        #        grms = np.sqrt(sum(gerr[:]**2.0)/(natoms*ncarts))
        #        gdot = np.dot(gfd, gxyz) / (np.sqrt(np.dot(gxyz, gxyz)) * np.sqrt(np.dot(gfd, gfd)))
        #        print("rms gradient error %.8f" % grms)
        #        print("gradient dot prod %.4f" % gdot)

        if i == 0:
            dX = X[i + 1][:, 1:] - X[i][:, 1:]
        elif i == ngeoms - 1:
            dX = X[i][:, 1:] - X[i - 1][:, 1:]
        else:
            dX = X[i + 1][:, 1:] - X[i - 1][:, 1:]
        dX = dX.reshape(natoms * ncarts)

        gdotx = np.dot(gxyz, dX)
        gdotg = np.dot(gxyz, gxyz)
        xdotx = np.dot(dX, dX)

        #print("%.4f %.4f %.4f" % (gdotx, gdotg, xdotx))

        grms = np.sqrt(gdotg / (natoms * ncarts))
        gdot = -gdotx / (np.sqrt(gdotg) * np.sqrt(xdotx))

        dE = (E[i] - E[0]) * KCAL
        dP = (predictions[i] - predictions[0]) * KCAL
        out_str.append("%.2f %.2f %.2f %.8f %.4f" % (dist, dE, dP, grms, gdot))

        if i < ngeoms - 1:
            dX = X[i + 1][:, 1:] - X[i][:, 1:]
            dX = dX.reshape(natoms * ncarts)
            dist += np.sqrt(sum(dX[:]**2.0))

    with open(name + "_compare.dat", "w") as fout:
        fout.write("\n".join(out_str))
Example #3
    def test_raw_dataset_iterate(self):

        dummy_elem = [1.0, 2.0]

        all_Xs = [
            np.array([dummy_elem] * 3),
            np.array([dummy_elem] * 4),
            np.array([dummy_elem] * 1),
            np.array([dummy_elem] * 2),
            np.array([dummy_elem] * 8),
            np.array([dummy_elem] * 9),
            np.array([dummy_elem] * 3),
            np.array([dummy_elem] * 5),
        ]

        rd = RawDataset(all_Xs)

        for idx, (x_batch, x_offsets,
                  _) in enumerate(rd.iterate(batch_size=3)):
            if idx == 0:
                assert x_batch.shape[0] == 3 + 4 + 1
                np.testing.assert_array_equal(
                    x_offsets,
                    np.array([(0, 3), (3, 7), (7, 8)], dtype=np.int32))
            elif idx == 1:
                assert x_batch.shape[0] == 2 + 8 + 9
                np.testing.assert_array_equal(
                    x_offsets,
                    np.array([(0, 2), (2, 10), (10, 19)], dtype=np.int32))
            elif idx == 2:
                assert x_batch.shape[0] == 3 + 5
                np.testing.assert_array_equal(
                    x_offsets, np.array([(0, 3), (3, 8)], dtype=np.int32))
            else:
                assert 0

        all_ys = np.arange(len(all_Xs), dtype=np.float32)

        rdy = RawDataset(all_Xs, all_ys)

        for idx, (_, _, ys) in enumerate(rdy.iterate(batch_size=3)):
            if idx == 0:
                np.testing.assert_array_equal(
                    ys, np.arange(0, 3, dtype=np.float32))
            elif idx == 1:
                np.testing.assert_array_equal(
                    ys, np.arange(3, 6, dtype=np.float32))
            elif idx == 2:
                np.testing.assert_array_equal(
                    ys, np.arange(6, 8, dtype=np.float32))
            else:
                assert 0
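The x_offsets asserted above appear to be (start, end) row ranges of each molecule in the concatenated batch; a standalone sketch of that bookkeeping (batch_offsets is an illustrative helper, not khan API):

import numpy as np

def batch_offsets(mols):
    # (start, end) row ranges of each molecule after concatenating one batch,
    # matching the x_offsets arrays checked in the test above
    offsets, start = [], 0
    for mol in mols:
        offsets.append((start, start + len(mol)))
        start += len(mol)
    return np.array(offsets, dtype=np.int32)

mols = [np.zeros((3, 2)), np.zeros((4, 2)), np.zeros((1, 2))]
print(batch_offsets(mols))  # [[0 3] [3 7] [7 8]]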
Example #4
File: main_2.py Project: proteneer/khan
def main():
    #avail_gpus = get_available_gpus()
    #print("Available GPUs:", avail_gpus)

    print('os.environ:', os.environ)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:  # must be at start to reserve GPUs

        parser = argparse.ArgumentParser(
            description="Run ANI1 neural net training.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)

        parser.add_argument(
            '--ani_lib',
            required=True,
            help="Location of the shared object for GPU featurization")
        parser.add_argument('--fitted',
                            default=False,
                            action='store_true',
                            help="Whether or not to use fitted energy corrections")
        parser.add_argument('--add_ffdata',
                            default=True,
                            action='store_true',
                            help="Whether or not to add the forcefield data")
        parser.add_argument('--gpus',
                            default='4',
                            help="Number of GPUs to use")
        parser.add_argument(
            '--cpus',
            default='1',
            help="Number of CPUs to use (GPUs override this if > 0)")
        parser.add_argument(
            '--start_batch_size',
            default='64',
            help=
            "How many training points to consider before calculating each gradient"
        )
        parser.add_argument(
            '--max_local_epoch_count',
            default='50',
            help="How many epochs to try each learning rate before reducing it"
        )
        parser.add_argument('--dataset_index',
                            default='0',
                            help="Index of training set to use")
        parser.add_argument('--testset_index',
                            default='0',
                            help="Index of test set to use")
        parser.add_argument(
            '--fit_charges',
            default=False,
            action='store_true',
            help="Whether or not to add fitted charge energies")

        parser.add_argument('--work-dir',
                            default='~/work',
                            help="location where work data is dumped")
        parser.add_argument('--train-dir',
                            default='/home/yzhao/ANI-1_release',
                            help="location of the ANI-1 training data")
        parser.add_argument('--restart',
                            default=False,
                            action='store_true',
                            help="Whether to restart from the save dir")
        parser.add_argument(
            '--train_size',
            default='0.5',
            help="how much of the dataset to use for gradient evaluations")
        parser.add_argument(
            '--test_size',
            default='0.5',
            help="how much of the dataset to use for testing the energies")

        args = parser.parse_args()

        print("Arguments", args)

        lib_path = os.path.abspath(args.ani_lib)
        print("Loading custom kernel from", lib_path)
        initialize_module(lib_path)

        ANI_TRAIN_DIR = args.train_dir
        ANI_WORK_DIR = args.work_dir
        GRAPH_DB_TRAIN_DIR = '/nfs/working/scidev/stevenso/learning/khan/graphdb_xyz/xyz/train'
        GRAPH_DB_TEST_DIR = '/nfs/working/scidev/stevenso/learning/khan/graphdb_xyz/xyz/test/'
        train_size = float(args.train_size)
        test_size = float(args.test_size)

        CALIBRATION_FILE_TRAIN = os.path.join(ANI_TRAIN_DIR,
                                              "results_QM_M06-2X.txt")
        CALIBRATION_FILE_TEST = os.path.join(ANI_TRAIN_DIR, "gdb_11_cal.txt")
        ROTAMER_TRAIN_DIR = [
            os.path.join(ANI_TRAIN_DIR, "rotamers/train"),
            os.path.join(ANI_TRAIN_DIR, "rotamers/test")
        ]
        ROTAMER_TEST_DIR = os.path.join(ANI_TRAIN_DIR, "rotamers/test")
        CHARGED_ROTAMER_TEST_DIR = os.path.join(ANI_TRAIN_DIR,
                                                "charged_rotamers_2")
        CCSDT_ROTAMER_TEST_DIR = os.path.join(ANI_TRAIN_DIR, "ccsdt_dataset")

        save_dir = os.path.join(ANI_WORK_DIR, "save")
        if os.path.isdir(save_dir) and not args.restart:
            print('save_dir', save_dir, 'exists and this is not a restart job')
            exit()
        batch_size = int(args.start_batch_size)
        use_fitted = args.fitted
        add_ffdata = args.add_ffdata
        data_loader = DataLoader(use_fitted)

        print("------------Load evaluation data--------------")

        pickle_files = [
            'eval_new_graphdb.pickle', 'eval_data_old_fftest.pickle',
            'eval_data_graphdb.pickle', 'rotamer_gdb_opt.pickle'
        ]
        pickle_file = pickle_files[int(args.testset_index)]
        if os.path.isfile(pickle_file):
            print('Loading pickle from', pickle_file)
            (rd_gdb11, rd_ffneutral_mo62x, ffneutral_groups_mo62x,
             rd_ffneutral_ccsdt, ffneutral_groups_ccsdt,
             rd_ffcharged_mo62x, ffcharged_groups_mo62x) = pickle.load(
                 open(pickle_file, "rb"))
            # backwards compatibility for pickle files: add all_grads = None
            rd_gdb11.all_grads = None
            rd_ffneutral_mo62x.all_grads = None
            rd_ffneutral_ccsdt.all_grads = None
            rd_ffcharged_mo62x.all_grads = None
            #rd_gdb11, rd_ffneutral_mo62x, ffneutral_groups_mo62x, rd_ffneutral_ccsdt, ffneutral_groups_ccsdt, rd_ffcharged_mo62x, ffcharged_groups_mo62x, rd_gdb_opt, gdb_opt_groups = pickle.load( open(pickle_file, "rb") )
            pickle.dump((rd_gdb11, rd_ffneutral_mo62x, ffneutral_groups_mo62x,
                         rd_ffneutral_ccsdt, ffneutral_groups_ccsdt,
                         rd_ffcharged_mo62x, ffcharged_groups_mo62x),
                        open(pickle_file, "wb"))
        else:
            print('gdb11')
            xs, ys = data_loader.load_gdb11(ANI_TRAIN_DIR,
                                            CALIBRATION_FILE_TEST)
            rd_gdb11 = RawDataset(xs, ys)
            xs, ys, ffneutral_groups_mo62x = data_loader.load_ff(
                GRAPH_DB_TEST_DIR)
            rd_ffneutral_mo62x = RawDataset(xs, ys)
            xs, ys, ffneutral_groups_ccsdt = data_loader.load_ff(
                CCSDT_ROTAMER_TEST_DIR)
            rd_ffneutral_ccsdt = RawDataset(xs, ys)
            xs, ys, ffcharged_groups_mo62x = data_loader.load_ff(
                CHARGED_ROTAMER_TEST_DIR)
            rd_ffcharged_mo62x = RawDataset(xs, ys)
            xs, ys, gdb_opt_groups = data_loader.load_ff('haoyu_opt/xyz/')
            rd_gdb_opt = RawDataset(xs, ys)
            print('Pickling data...')
            pickle.dump((rd_gdb11, rd_ffneutral_mo62x, ffneutral_groups_mo62x,
                         rd_ffneutral_ccsdt, ffneutral_groups_ccsdt,
                         rd_ffcharged_mo62x, ffcharged_groups_mo62x,
                         rd_gdb_opt, gdb_opt_groups), open(pickle_file, "wb"))

        # the "GDB Opt" set is currently excluded from evaluation:
        # eval_names    = ["Neutral Rotamers", "Neutral Rotamers CCSDT", "Charged Rotamers", "GDB Opt"]
        # eval_groups   = [ffneutral_groups_mo62x, ffneutral_groups_ccsdt, ffcharged_groups_mo62x, gdb_opt_groups]
        # eval_datasets = [rd_ffneutral_mo62x, rd_ffneutral_ccsdt, rd_ffcharged_mo62x, rd_gdb_opt]
        eval_names = [
            "Neutral Rotamers", "Neutral Rotamers CCSDT", "Charged Rotamers"
        ]
        eval_groups = [
            ffneutral_groups_mo62x, ffneutral_groups_ccsdt,
            ffcharged_groups_mo62x
        ]
        eval_datasets = [
            rd_ffneutral_mo62x, rd_ffneutral_ccsdt, rd_ffcharged_mo62x
        ]

        # This training code implements cross-validation based training, whereby we determine convergence on a given
        # epoch depending on the cross-validation error for a given validation set. When a better cross-validation
        # score is detected, we save the model's parameters as the putative best found parameters. If after more than
        # max_local_epoch_count number of epochs have been run and no progress has been made, we decrease the learning
        # rate and restore the best found parameters.

        max_local_epoch_count = int(args.max_local_epoch_count)
        n_gpus = int(args.gpus)  # min( int(args.gpus), len(avail_gpus) )
        n_cpus = min(int(args.cpus), os.cpu_count())
        if n_gpus > 0:
            towers = ["/gpu:" + str(i) for i in range(n_gpus)]
        else:
            towers = ["/cpu:" + str(i) for i in range(n_cpus)]

        print("towers:", towers)

        #layer_sizes=(128, 128, 64, 1) # original
        layer_sizes = (512, 256, 128, 1)
        #layer_sizes=(256, 256, 256, 256, 256, 256, 256, 128, 64, 8, 1) # bigNN
        #layer_sizes=tuple( 20*[128] + [1] )
        #layer_sizes=(1,) # linear
        print('layer_sizes:', layer_sizes)
        n_weights = sum([
            layer_sizes[i] * layer_sizes[i + 1]
            for i in range(len(layer_sizes) - 1)
        ])
        print('n_weights:', n_weights)

        print("------------Load training data--------------")

        pickle_files = [
            "gdb8_fftrain_fftest_xy.pickle", "gdb8_graphdb_xy.pickle",
            "gdb8_xy.pickle", "gdb7_xy.pickle", "gdb6_ffdata_xy.pickle",
            "gdb3_xy.pickle", "gdb8_graphdb_xy_differ3.pickle"
        ]
        pickle_file = pickle_files[int(args.dataset_index)]
        if os.path.isfile(pickle_file):
            print('Loading pickle from', pickle_file)
            Xs, ys = pickle.load(open(pickle_file, "rb"))
        else:
            ff_train_dirs = ROTAMER_TRAIN_DIR + [GRAPH_DB_TRAIN_DIR]
            Xs, ys = data_loader.load_gdb8(ANI_TRAIN_DIR,
                                           CALIBRATION_FILE_TRAIN,
                                           ff_train_dirs)
            print('Pickling data...')
            pickle.dump((Xs, ys), open(pickle_file, "wb"))

        print("------------Initializing model--------------")

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            Xs, ys, train_size=train_size,
            test_size=test_size)  # stratify by UTT would be good to try here
        rd_train, rd_test = RawDataset(X_train,
                                       y_train), RawDataset(X_test, y_test)
        print('n_train =', len(y_train), 'n_test =', len(y_test))

        trainer = TrainerMultiTower(
            sess,
            towers=towers,
            layer_sizes=layer_sizes,
            fit_charges=args.fit_charges,
            precision=tf.float32  # train in single precision; switch to tf.float64 if needed
        )

        if os.path.exists(save_dir):
            print("Restoring existing model from", save_dir)
            trainer.load_numpy(save_dir + '/best.npz')
        else:  # initialize new random parameters
            trainer.initialize()

        for name, ff_data, ff_groups in zip(eval_names, eval_datasets,
                                            eval_groups):
            print(
                name, "abs/rel rmses: {0:.6f} kcal/mol | ".format(
                    trainer.eval_abs_rmse(ff_data)) +
                "{0:.6f} kcal/mol".format(
                    trainer.eval_eh_rmse(ff_data, ff_groups)))

        print("------------Starting Training--------------")
        trainer.train(save_dir, rd_train, rd_test, rd_gdb11, eval_names,
                      eval_datasets, eval_groups, batch_size,
                      max_local_epoch_count)
Example #5
def main():

    args = parse_args(sys.argv)
    lib_path = os.path.abspath(args.ani_lib)
    initialize_module(lib_path)

    save_dir = os.path.join(args.work_dir, "save")

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:

        layer_sizes = (128, 128, 64, 1)
        if args.deep_network:
            layer_sizes = (256, 256, 256, 256, 256, 256, 256, 128, 64, 8, 1)
        towers = ["/cpu:0"]
        print("start with layers", layer_sizes)
        trainer = TrainerMultiTower(
            sess,
            towers,
            layer_sizes=layer_sizes,
            fit_charges=args.fit_charges,
            gaussian_activation=args.gaussian_activation)

        trainer.load(save_dir)

        s = client_server.connect_socket(args.host, args.port, server=True)

        if args.debug:
            print("Server listening on port %d" % args.port)

        while True:

            if args.debug:
                print("awaiting connection...")

            conn, addr = s.accept()

            if args.debug:
                print("Connection established...")

            while True:

                rcv_data = client_server.recieve(conn)

                print("received data", rcv_data)

                if rcv_data:

                    X = json.loads(rcv_data).get('X')
                    X_np = np.array(X, dtype=np.float32)
                    rd = RawDataset([X_np], [0.0])

                    # should I go back to total energy?
                    energy = float(trainer.predict(rd)[0])
                    self_interaction = sum(
                        data_utils.selfIxnNrgWB97X_631gdp[example[0]]
                        for example in X)
                    energy += self_interaction

                    gradient = list(trainer.coordinate_gradients(rd))[0]
                    natoms, ndim = gradient.shape
                    gradient = gradient.reshape(natoms * ndim)

                    if args.fdiff_grad:
                        fd_gradient = fdiff_grad(X_np, trainer)
                        dg = gradient - fd_gradient
                        grms = np.sqrt(sum(dg[:]**2.0) / (natoms * ndim))
                        norm_g = np.sqrt(np.dot(gradient, gradient))
                        norm_fd = np.sqrt(np.dot(fd_gradient, fd_gradient))
                        dot = np.dot(gradient, fd_gradient) / (norm_fd * norm_g)
                        gradient[:] = fd_gradient[:]
                        print("RMS gradient fdiff/analytic %.4e" % grms)
                        print("Gradient dot product %.4f" % dot)

                    # convert gradient from hartree/angstrom to hartree/bohr
                    # and to jsonable format
                    gradient = [float(g) * BOHR for g in gradient]

                    print("sending gradient")
                    print(gradient)

                    send_data = json.dumps({
                        "energy": energy,
                        "gradient": gradient
                    })

                    print("sending response...")

                    client_server.send(conn, send_data)

                else:
                    break
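A sketch of the request/response format this server appears to exchange, with placeholder values standing in for trainer.predict and the coordinate gradients:

import json
import numpy as np

# one request/response round trip, inferred from the server loop above
request = json.dumps({"X": [[1.0, 0.00, 0.00, 0.00],
                            [1.0, 0.74, 0.00, 0.00]]})

X = np.array(json.loads(request)["X"], dtype=np.float32)
energy = -1.17                        # placeholder total energy (hartree)
gradient = np.zeros(X.shape[0] * 3)   # placeholder flat gradient

response = json.dumps({"energy": float(energy),
                       "gradient": [float(g) for g in gradient]})
print(response)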
Example #6
def main():

    args = parse_args(sys.argv)
    lib_path = os.path.abspath(args.ani_lib)
    initialize_module(lib_path)

    save_file = os.path.join(args.save_dir, "save_file.npz")
    if not os.path.exists(save_file):
        raise IOError("Saved NN numpy file does not exist")

    _, _, X_test, y_test, X_big, y_big = load_reactivity_data(
        args.reactivity_dir, 1.0)
    small_reactions, big_reactions = read_all_reactions(args.reactivity_dir)

    rd_test = RawDataset(X_test, y_test)
    rd_big = RawDataset(X_big, y_big)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        towers = ["/cpu:0"]
        layers = (128, 128, 64, 1)
        if args.deep_network:
            layers = (256, 256, 256, 256, 256, 256, 256, 128, 64, 8, 1)
        activation_fn = activations.get_fn_by_name(args.activation_function)

        trainer = TrainerMultiTower(
            sess,
            towers=towers,
            precision=tf.float64,
            layer_sizes=layers,
            activation_fn=activation_fn,
            fit_charges=args.fit_charges,
        )

        trainer.load_numpy(save_file)

        if args.analyze_reaction_errors:

            if not os.path.exists("small_reactions_comparison"):
                os.mkdir("small_reactions_comparison")
            if not os.path.exists("big_reactions_comparison"):
                os.mkdir("big_reactions_comparison")

            for dataname, data in (("small_reactions", small_reactions),
                                   ("big_reactions", big_reactions)):

                # get reactant, TS product
                Xr, Er = [], []
                Xts, Ets = [], []
                Xp, Ep = [], []

                for name in data:
                    Xs, Es = data[name]

                    if args.write_comparison_data:
                        # make a directory HERE
                        directory = dataname + "_comparison"
                        write_reaction_data(os.path.join(directory, name), Xs,
                                            Es, trainer)

                    Xr.append(Xs[0])
                    Er.append(Es[0])
                    Xp.append(Xs[-1])
                    Ep.append(Es[-1])

                    # ts is highest energy point along path
                    emax = max(Es)
                    idx = Es.index(emax)
                    Xts.append(Xs[idx])
                    Ets.append(Es[idx])

                # make datasets
                rd_r = RawDataset(Xr, Er)
                rd_p = RawDataset(Xp, Ep)
                rd_ts = RawDataset(Xts, Ets)

                Er = np.array(Er)
                Ep = np.array(Ep)
                Ets = np.array(Ets)

                # predict energies
                r_predictions = np.array(trainer.predict(rd_r))
                p_predictions = np.array(trainer.predict(rd_p))
                ts_predictions = np.array(trainer.predict(rd_ts))

                barriers = (Ets - Er) * KCAL
                reverse_barriers = (Ets - Ep) * KCAL
                predicted_barriers = (ts_predictions - r_predictions) * KCAL
                predicted_reverse_barriers = (ts_predictions -
                                              p_predictions) * KCAL
                rxn_e = (Ep - Er) * KCAL
                predicted_rxn_e = (p_predictions - r_predictions) * KCAL

                barrier_errors = barriers - predicted_barriers
                barrier_rmse = np.sqrt(
                    sum(barrier_errors[:]**2.0) / len(barrier_errors))
                reverse_barrier_errors = reverse_barriers - predicted_reverse_barriers
                reverse_barrier_rmse = np.sqrt(
                    sum(reverse_barrier_errors[:]**2.0) /
                    len(reverse_barrier_errors))
                rxn_errors = rxn_e - predicted_rxn_e
                rxn_rmse = np.sqrt(sum(rxn_errors[:]**2.0) / len(rxn_errors))

                # barrier height plot
                bmu, bsigma = histogram(barrier_errors,
                                        "Barrier height errors")
                rbmu, rbsigma = histogram(reverse_barrier_errors,
                                          "Reverse Barrier height errors")
                rmu, rsigma = histogram(rxn_errors, "Reaction energy errors")
                plt.xlabel("Error (kcal/mol)")
                plt.title("Reaction energetic errors for %s" % dataname)
                plt.legend()

                #plt.scatter(barriers, predicted_barriers)
                #plt.scatter(rxn_e, predicted_rxn_e)
                plt.savefig("%s_barrier_height_errors.pdf" % dataname)
                plt.clf()

                print("errors for %s" % dataname)
                print("Barrier RMSE %.2f rxn RMSE %.2f" %
                      (barrier_rmse, rxn_rmse))
                print("Reverse Barrier RMSE %.2f" % reverse_barrier_rmse)
                print("rxn mu %f sigma %f" % (rmu, rsigma))
                print("barrier mu %f sigma %f" % (bmu, bsigma))
                print("reverse barrier mu %f sigma %f" % (rbmu, rbsigma))

        # plot distribution of raw errors
        if args.analyze_raw_errors:
            #evaluate errors in predictions
            rxn_predictions = trainer.predict(rd_test)
            big_predictions = trainer.predict(rd_big)
            rxn_errors = np.array(rxn_predictions) - np.array(y_test)
            big_errors = np.array(big_predictions) - np.array(y_big)
            rxn_rmse = np.sqrt(sum(rxn_errors[:]**2.0) / len(rxn_errors))
            big_rmse = np.sqrt(sum(big_errors[:]**2.0) / len(big_errors))
            rxn_errors = rxn_errors * KCAL
            big_errors = big_errors * KCAL

            print("small rmse %.4f big rmse %.4f" %
                  (rxn_rmse * KCAL, big_rmse * KCAL))

            smu, ssigma = histogram(
                rxn_errors, "Atomization energy errors for small systems")
            bmu, bsigma = histogram(
                big_errors, "Atomization energy errors for large systems")
            plt.xlabel("Error (kcal/mol)")
            plt.title("Atomization energy errors")
            plt.legend()
            plt.savefig("atomization_errors.pdf")
            plt.clf()

            print("small atomization mu %f sigma %f" % (smu, ssigma))
            print("big atomization mu %f sigma %f" % (bmu, bsigma))
Example #7
def main():

    parser = argparse.ArgumentParser(
        description="Run ANI1 neural net training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--ani-lib',
        required=True,
        help="Location of the shared object for GPU featurization")
    parser.add_argument('--fitted',
                        default=False,
                        action='store_true',
                        help="Whether to use fitted or self-interaction energies")
    parser.add_argument('--add_ffdata',
                        default=False,
                        action='store_true',
                        help="Whether or not to add the forcefield data")
    parser.add_argument('--gpus', default=1, help="Number of gpus we use")
    parser.add_argument('--train_forces',
                        default=True,
                        help="If we train to the forces")

    parser.add_argument('--save-dir',
                        default='~/work',
                        help="location where save data is dumped")
    parser.add_argument('--train-dir',
                        default='~/ANI-1_release',
                        help="location of the ANI-1 training data")

    args = parser.parse_args()

    print("Arguments", args)

    lib_path = os.path.abspath(args.ani_lib)
    print("Loading custom kernel from", lib_path)
    initialize_module(lib_path)

    ANI_TRAIN_DIR = args.train_dir
    ANI_SAVE_DIR = args.save_dir

    save_dir = os.path.join(ANI_SAVE_DIR, "save")

    use_fitted = args.fitted
    add_ffdata = args.add_ffdata

    data_loader = DataLoader(False)

    all_Xs, all_Ys = data_loader.load_gdb8(ANI_TRAIN_DIR)

    # todo: ensure disjunction in train_test_valid
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        all_Xs, all_Ys,
        test_size=0.25)  # stratify by UTT would be good to try here
    rd_train, rd_test = RawDataset(X_train,
                                   y_train), RawDataset(X_test, y_test)

    X_gdb11, y_gdb11 = data_loader.load_gdb11(ANI_TRAIN_DIR)
    rd_gdb11 = RawDataset(X_gdb11, y_gdb11)

    batch_size = 1024

    config = tf.ConfigProto(allow_soft_placement=True)

    all_Xs_f, all_Ys_f, all_Fs_f = data_loader.load_gdb8_forces(
        ANI_TRAIN_DIR)  # todo: figure out how to split this consistently later

    rd_train_forces = RawDataset(all_Xs_f, all_Ys_f, all_Fs_f)

    with tf.Session(config=config) as sess:

        # This training code implements cross-validation based training, whereby we determine convergence on a given
        # epoch depending on the cross-validation error for a given validation set. When a better cross-validation
        # score is detected, we save the model's parameters as the putative best found parameters. If after more than
        # max_local_epoch_count number of epochs have been run and no progress has been made, we decrease the learning
        # rate and restore the best found parameters.

        n_gpus = int(args.gpus)
        if n_gpus > 0:
            towers = ["/gpu:" + str(i) for i in range(n_gpus)]
        else:
            towers = [
                "/cpu:" + str(i) for i in range(multiprocessing.cpu_count())
            ]

        print("towers:", towers)

        trainer = TrainerMultiTower(
            sess,
            towers=towers,
            precision=tf.float32,
            layer_sizes=(128, 128, 64, 1),
            # fit_charges=True,
        )

        # if os.path.exists(save_dir):
        # print("Restoring existing model from", save_dir)
        # trainer.load(save_dir)
        # else:
        trainer.initialize()  # initialize to random variables

        max_local_epoch_count = 10

        train_ops = [
            trainer.global_epoch_count,
            trainer.learning_rate,
            trainer.local_epoch_count,
            trainer.unordered_l2s,
            trainer.train_op,
        ]

        print("------------Starting Training--------------")

        start_time = time.time()

        train_forces = bool(int(args.train_forces))  # --train_forces arrives as a string from the command line, so convert explicitly

        # training with forces
        # stop just above 1e-9 to sidestep a floating point comparison issue; we technically train to 1e-9
        while sess.run(trainer.learning_rate) > 5e-10:

            while sess.run(trainer.local_epoch_count) < max_local_epoch_count:

                start_time = time.time()
                # train to forces
                if train_forces:
                    train_results_forces = list(
                        trainer.feed_dataset(
                            rd_train_forces,
                            shuffle=True,
                            target_ops=[
                                trainer.train_op_forces,
                                trainer.tower_force_rmses
                            ],
                            batch_size=batch_size,
                            before_hooks=trainer.max_norm_ops))
                    print(train_results_forces, end=" | ")

                #train to energies
                train_results_energies = list(
                    trainer.feed_dataset(rd_train,
                                         shuffle=True,
                                         target_ops=train_ops,
                                         batch_size=batch_size,
                                         before_hooks=trainer.max_norm_ops))

                train_abs_rmse = np.sqrt(
                    np.mean(flatten_results(train_results_energies,
                                            pos=3))) * HARTREE_TO_KCAL_PER_MOL
                test_abs_rmse = trainer.eval_abs_rmse(rd_test)
                gdb11_abs_rmse = trainer.eval_abs_rmse(rd_gdb11)

                print(time.time() - start_time, train_abs_rmse, test_abs_rmse,
                      gdb11_abs_rmse)

            print("==========Decreasing learning rate==========")
            sess.run(trainer.decr_learning_rate)
            sess.run(trainer.reset_local_epoch_count)
            trainer.load_best_params()

    return
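The training RMSE is recovered from the per-batch results by collecting the squared errors (assumed here to be what unordered_l2s, at position 3 of each result tuple, holds) and taking the root of their mean; a toy stand-in for flatten_results under that assumption:

import numpy as np

HARTREE_TO_KCAL_PER_MOL = 627.509  # assumed value of the conversion constant

# each epoch yields per-batch result tuples; position 3 is assumed to hold the
# per-example squared errors in hartree^2 (toy values below)
train_results = [
    (0, 1e-3, 0, np.array([1.0e-6, 4.0e-6]), None),
    (0, 1e-3, 0, np.array([9.0e-6]), None),
]
l2s = np.concatenate([r[3] for r in train_results])   # toy flatten_results(results, pos=3)
train_abs_rmse = np.sqrt(np.mean(l2s)) * HARTREE_TO_KCAL_PER_MOL
print("train abs rmse %.3f kcal/mol" % train_abs_rmse)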
Example #8
def main():

    parser = argparse.ArgumentParser(
        description="Run ANI1 neural net training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--ani_lib',
        required=True,
        help="Location of the shared object for GPU featurization")
    parser.add_argument('--fitted',
                        default=False,
                        action='store_true',
                        help="Whether to use fitted or self-interaction energies")
    parser.add_argument('--add_ffdata',
                        default=False,
                        action='store_true',
                        help="Whether or not to add the forcefield data")
    parser.add_argument('--gpus', default=1, help="Number of gpus we use")

    parser.add_argument('--work-dir',
                        default='~/work',
                        help="location where work data is dumped")
    parser.add_argument('--train-dir',
                        default='~/ANI-1_release',
                        help="location of the ANI-1 training data")

    args = parser.parse_args()

    print("Arguments", args)

    lib_path = os.path.abspath(args.ani_lib)
    print("Loading custom kernel from", lib_path)
    initialize_module(lib_path)

    ANI_TRAIN_DIR = args.train_dir
    ANI_WORK_DIR = args.work_dir

    save_dir = os.path.join(ANI_WORK_DIR, "save")

    use_fitted = args.fitted
    add_ffdata = args.add_ffdata

    data_loader = DataLoader(False)

    all_Xs, all_Ys = data_loader.load_gdb8(ANI_TRAIN_DIR)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        all_Xs, all_Ys,
        test_size=0.25)  # stratify by UTT would be good to try here
    rd_train, rd_test = RawDataset(X_train,
                                   y_train), RawDataset(X_test, y_test)

    X_gdb11, y_gdb11 = data_loader.load_gdb11(ANI_TRAIN_DIR)
    rd_gdb11 = RawDataset(X_gdb11, y_gdb11)

    batch_size = 1024

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=config) as sess:

        # This training code implements cross-validation based training, whereby we determine convergence on a given
        # epoch depending on the cross-validation error for a given validation set. When a better cross-validation
        # score is detected, we save the model's parameters as the putative best found parameters. If after more than
        # max_local_epoch_count number of epochs have been run and no progress has been made, we decrease the learning
        # rate and restore the best found parameters.

        # n_gpus = int(args.gpus)
        # if n_gpus > 0:
        #     towers = ["/gpu:"+str(i) for i in range(n_gpus)]
        # else:
        #     towers = ["/cpu:"+str(i) for i in range(multiprocessing.cpu_count())]

        # print("towers:", towers)

        with tf.variable_scope("james"):

            trainer_james = TrainerMultiTower(
                sess,
                towers=["/gpu:0"],
                layer_sizes=(128, 128, 64, 1),
                fit_charges=True,
            )

        with tf.variable_scope("yutong"):

            trainer_yutong = TrainerMultiTower(
                sess,
                towers=["/gpu:1"],
                layer_sizes=(128, 128, 64, 1),
                fit_charges=True,
            )

        saver = tf.train.Saver()

        sess.run(tf.global_variables_initializer())

        pool = ThreadPool(2)

        data = ((trainer_james, rd_train, rd_test), (trainer_yutong, rd_train,
                                                     rd_test))

        for e in range(10):
            pool.map(run_one_epoch, data)

        # need to use saver across all
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        save_path = os.path.join(save_dir, "model.ckpt")
        saver.save(sess, save_path)
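The two trainers are advanced in lock step by mapping a worker over a two-element tuple each epoch; a toy version of that thread-pool pattern, with a stand-in for the run_one_epoch worker used above:

from multiprocessing.pool import ThreadPool

# run_one_epoch here is a placeholder for the real worker, which would run one
# training epoch for its trainer on its own GPU tower
def run_one_epoch(task):
    name, rd_train, rd_test = task
    return "%s: trained on %d items" % (name, len(rd_train))

data = (("james", list(range(100)), []),
        ("yutong", list(range(100)), []))

with ThreadPool(2) as pool:
    for epoch in range(3):
        print(pool.map(run_one_epoch, data))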
Example #9
def main():

    parser = argparse.ArgumentParser(description="Run ANI1 neural net training.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--ani-lib', required=True, help="Location of the shared object for GPU featurization")
    parser.add_argument('--fitted', default=False, action='store_true', help="Whether to use fitted or self-interaction energies")
    parser.add_argument('--add-ffdata', default=False, action='store_true', help="Whether or not to add the forcefield data")
    parser.add_argument('--gpus', default=1, help="Number of gpus we use")

    parser.add_argument('--save-dir', default='~/work', help="Location where save data is dumped. If the folder does not exist then it will be created.")
    parser.add_argument('--train-dir', default='~/ANI-1_release', help="Location where training data is located")

    parser.add_argument(
        '--reactivity-dir',
        default=None,
        help='location of reactivity data'
    )

    parser.add_argument(
        '--reactivity-test-percent',
        default=0.25,
        type=float,
        help='percent of reactions to put in test set'
    )

    parser.add_argument(
        '--deep-network',
        action='store_true',
        help='Use James super deep network (256, 256, 256, 256, 256, 256, 256, 128, 64, 8, 1)'
    )

    parser.add_argument(
        '--fit-charges',
        action='store_true',
        help='fit charges'
    )

    parser.add_argument(
        '--activation-function',
        type=str,
        choices=activations.get_all_fn_names(),
        help='choice of activation function',
        default="celu"
    )

    parser.add_argument(
        '--convert-checkpoint',
        default=False,
        action='store_true',
        help='Convert a checkpoint file to a numpy file and exit'
    )

    parser.add_argument(
        '--precision',
        default='single',
        type=str,
        choices=PRECISION.keys(),
        help="Floating point precision of NN"
    )

    args = parser.parse_args()

    print("Arguments", args)

    lib_path = os.path.abspath(args.ani_lib)
    print("Loading custom kernel from", lib_path)
    initialize_module(lib_path)

    ANI_TRAIN_DIR = args.train_dir
    ANI_SAVE_DIR = args.save_dir

    # save_dir = os.path.join(ANI_SAVE_DIR, "save")
    save_file = os.path.join(ANI_SAVE_DIR, "save_file.npz")

    use_fitted = args.fitted
    add_ffdata = args.add_ffdata

    data_loader = DataLoader(False)

    all_Xs, all_Ys = data_loader.load_gdb8(ANI_TRAIN_DIR)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(all_Xs, all_Ys, test_size=0.25) # stratify by UTT would be good to try here
    rd_train, rd_test = RawDataset(X_train, y_train), RawDataset(X_test,  y_test)

    X_gdb11, y_gdb11 = data_loader.load_gdb11(ANI_TRAIN_DIR)
    rd_gdb11 = RawDataset(X_gdb11, y_gdb11)

    rd_rxn_test, rd_rxn_train, rd_rxn_all, rd_rxn_big = \
        (None, None, None, None)
    if args.reactivity_dir is not None:
        # add training data
        X_rxn_train, Y_rxn_train, X_rxn_test, Y_rxn_test, X_rxn_big, Y_rxn_big = \
            load_reactivity_data(args.reactivity_dir, args.reactivity_test_percent)

        X_train.extend(X_rxn_train)
        y_train.extend(Y_rxn_train)

        print("Number of reactivity points in training set {0:d}".format(len(Y_rxn_train)))
        print("Number of reactivity points in test set {0:d}".format(len(Y_rxn_test)))

        # keep reaction test set separate
        rd_rxn_test = RawDataset(X_rxn_test, Y_rxn_test) if X_rxn_test else None
        rd_rxn_train = RawDataset(X_rxn_train, Y_rxn_train) if X_rxn_train else None

        # redundant, can be eliminated
        rd_rxn_all = RawDataset(X_rxn_test + X_rxn_train, Y_rxn_test + Y_rxn_train)
        
        # cannot currently handle this in test either
        # everything over 32 atoms
        rd_rxn_big = RawDataset(X_rxn_big, Y_rxn_big)

    batch_size = 1024

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=config) as sess:

        # This training code implements cross-validation based training, whereby we determine convergence on a given
        # epoch depending on the cross-validation error for a given validation set. When a better cross-validation
        # score is detected, we save the model's parameters as the putative best found parameters. If after more than
        # max_local_epoch_count number of epochs have been run and no progress has been made, we decrease the learning
        # rate and restore the best found parameters.

        n_gpus = int(args.gpus)
        if n_gpus > 0:
            towers = ["/gpu:"+str(i) for i in range(n_gpus)]
        else:
            towers = ["/cpu:"+str(i) for i in range(multiprocessing.cpu_count())]

        layers = (128, 128, 64, 1)
        if args.deep_network:
            layers = (256, 256, 256, 256, 256, 256, 256, 128, 64, 8, 1)

        print("Soft placing operations onto towers:", towers)

        activation_fn = activations.get_fn_by_name(args.activation_function)
        precision = PRECISION[args.precision]

        trainer = TrainerMultiTower(
            sess,
            towers=towers,
            precision=precision,
            layer_sizes=layers,
            activation_fn=activation_fn,
            fit_charges=args.fit_charges,
        )

        if args.convert_checkpoint:
            print("Converting saved network to numpy")
            save_dir = os.path.join(args.save_dir, "save")
            trainer.load(save_dir)
            trainer.save_numpy(save_file)
            print("Complete, exiting")
            return

        if os.path.exists(save_file):
            print("Restoring existing model from", save_file)
            trainer.load_numpy(save_file)
        else:
            if not os.path.exists(ANI_SAVE_DIR):
                print("Save directory",ANI_SAVE_DIR,"does not existing... creating")
                os.makedirs(ANI_SAVE_DIR)
            trainer.initialize() # initialize to random variables

        max_local_epoch_count = 10

        train_ops = [
            trainer.global_epoch_count,
            trainer.learning_rate,
            trainer.local_epoch_count,
            trainer.unordered_l2s,
            trainer.train_op,
        ]

        best_test_score = trainer.eval_abs_rmse(rd_test)

        # Uncomment if you'd like to inspect the gradients
        # all_grads = []
        # for grad in trainer.coordinate_gradients(rd_test):
        #     all_grads.append(grad)
        # assert len(all_grads) == rd_test.num_mols()

        print("------------Starting Training--------------")

        start_time = time.time()

        while sess.run(trainer.learning_rate) > 5e-10: # this is to deal with a numerical error, we technically train to 1e-9

            while sess.run(trainer.local_epoch_count) < max_local_epoch_count:

                # sess.run(trainer.max_norm_ops) # should this run after every batch instead?

                start_time = time.time()
                train_results = list(trainer.feed_dataset(
                    rd_train,
                    shuffle=True,
                    target_ops=train_ops,
                    batch_size=batch_size,
                    before_hooks=trainer.max_norm_ops))

                global_epoch = train_results[0][0]
                time_per_epoch = time.time() - start_time
                train_abs_rmse = np.sqrt(np.mean(flatten_results(train_results, pos=3))) * HARTREE_TO_KCAL_PER_MOL
                learning_rate = train_results[0][1]
                local_epoch_count = train_results[0][2]

                test_abs_rmse = trainer.eval_abs_rmse(rd_test)
                print(time.strftime("%Y-%m-%d %H:%M:%S"), 'tpe:', "{0:.2f}s,".format(time_per_epoch), 'g-epoch', global_epoch, 'l-epoch', local_epoch_count, 'lr', "{0:.0e}".format(learning_rate), \
                    'train/test abs rmse:', "{0:.2f} kcal/mol,".format(train_abs_rmse), "{0:.2f} kcal/mol".format(test_abs_rmse), end='')

                if test_abs_rmse < best_test_score:
                    gdb11_abs_rmse = trainer.eval_abs_rmse(rd_gdb11)
                    print(' | gdb11 abs rmse', "{0:.2f} kcal/mol | ".format(gdb11_abs_rmse), end='')

                    best_test_score = test_abs_rmse
                    sess.run([trainer.incr_global_epoch_count, trainer.reset_local_epoch_count])

                    # info about reactivity training
                    rxn_pairs = [
                        (rd_rxn_train, "train"),
                        (rd_rxn_test, "test"),
                        (rd_rxn_all, "all"),
                        (rd_rxn_big, "big")
                    ]
                    for rd, name in rxn_pairs: 
                        if rd is not None:
                            rxn_abs_rmse = trainer.eval_abs_rmse(rd)
                            print(
                                ' | reactivity abs rmse ({0:s})'.format(name),
                                "{0:.2f} kcal/mol | ".format(rxn_abs_rmse),
                                end=''
                            )
                            # should really be a weighted ave
                            if name == "test":
                                best_test_score += rxn_abs_rmse

                else:
                    sess.run([trainer.incr_global_epoch_count, trainer.incr_local_epoch_count])

                trainer.save_numpy(save_file)

                print('', end='\n')

            print("==========Decreasing learning rate==========")
            sess.run(trainer.decr_learning_rate)
            sess.run(trainer.reset_local_epoch_count)
            # trainer.load_best_params()

    return
Example #10
File: gdb8.py Project: schrodinger/khan
def main():

    parser = argparse.ArgumentParser(description="Run ANI1 neural net training.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--ani-lib', required=True, help="Location of the shared object for GPU featurization")
    parser.add_argument('--fitted', default=False, action='store_true', help="Whether to use fitted or self-interaction energies")
    parser.add_argument('--add-ffdata', default=False, action='store_true', help="Whether or not to add the forcefield data")
    parser.add_argument('--gpus', default=1, help="Number of gpus we use")

    parser.add_argument('--save-dir', default='~/work', help="Location where save data is dumped. If the folder does not exist then it will be created.")
    parser.add_argument('--train-dir', default='~/ANI-1_release', help="Location where training data is located")

    args = parser.parse_args()

    print("Arguments", args)

    lib_path = os.path.abspath(args.ani_lib)
    print("Loading custom kernel from", lib_path)
    initialize_module(lib_path)

    print("Available activation functions:", activations.get_all_fn_names())

    ANI_TRAIN_DIR = args.train_dir
    ANI_SAVE_DIR = args.save_dir

    # save_dir = os.path.join(ANI_SAVE_DIR, "save")
    save_file = os.path.join(ANI_SAVE_DIR, "save_file.npz")

    use_fitted = args.fitted
    add_ffdata = args.add_ffdata

    data_loader = DataLoader(False)

    all_Xs, all_Ys = data_loader.load_gdb8(ANI_TRAIN_DIR)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(all_Xs, all_Ys, test_size=0.25) # stratify by UTT would be good to try here
    rd_train, rd_test = RawDataset(X_train, y_train), RawDataset(X_test,  y_test)

    X_gdb11, y_gdb11 = data_loader.load_gdb11(ANI_TRAIN_DIR)
    rd_gdb11 = RawDataset(X_gdb11, y_gdb11)

    batch_size = 1024

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=config) as sess:

        # This training code implements cross-validation based training, whereby we determine convergence on a given
        # epoch depending on the cross-validation error for a given validation set. When a better cross-validation
        # score is detected, we save the model's parameters as the putative best found parameters. If after more than
        # max_local_epoch_count number of epochs have been run and no progress has been made, we decrease the learning
        # rate and restore the best found parameters.

        n_gpus = int(args.gpus)
        if n_gpus > 0:
            towers = ["/gpu:"+str(i) for i in range(n_gpus)]
        else:
            towers = ["/cpu:"+str(i) for i in range(multiprocessing.cpu_count())]

        print("Soft placing operations onto towers:", towers)

        # activation_fn = activations.get_fn_by_name("celu") # if you want to use the command line.
        activation_fn = activations.celu # preferred
        # activation_fn = tf.nn.selu
        # activation_fn = functools.partial(tf.nn.leaky_relu, alpha=0.2)
        # activation_fn = activations.get_fn_by_name("normal", 0.5, 0.2)


        trainer = TrainerMultiTower(
            sess,
            towers=towers,
            precision=tf.float32,
            layer_sizes=(128, 128, 64, 1),
            activation_fn=activation_fn,
            fit_charges=False,
        )

        if os.path.exists(save_file):
            print("Restoring existing model from", save_file)
            trainer.load_numpy(save_file)
        else:
            if not os.path.exists(ANI_SAVE_DIR):
                print("Save directory",ANI_SAVE_DIR,"does not existing... creating")
                os.makedirs(ANI_SAVE_DIR)
            trainer.initialize() # initialize to random variables

        max_local_epoch_count = 10

        train_ops = [
            trainer.global_epoch_count,
            trainer.learning_rate,
            trainer.local_epoch_count,
            trainer.unordered_l2s,
            trainer.train_op,
        ]

        best_test_score = trainer.eval_abs_rmse(rd_test)

        # Uncomment if you'd like to inspect the gradients
        # all_grads = []
        # for grad in trainer.coordinate_gradients(rd_test):
        #     all_grads.append(grad)
        # assert len(all_grads) == rd_test.num_mols()

        print("------------Starting Training--------------")

        start_time = time.time()

        while sess.run(trainer.learning_rate) > 5e-10: # this is to deal with a numerical error, we technically train to 1e-9

            while sess.run(trainer.local_epoch_count) < max_local_epoch_count:

                # sess.run(trainer.max_norm_ops) # should this run after every batch instead?

                start_time = time.time()
                train_results = list(trainer.feed_dataset(
                    rd_train,
                    shuffle=True,
                    target_ops=train_ops,
                    batch_size=batch_size,
                    before_hooks=trainer.max_norm_ops))

                global_epoch = train_results[0][0]
                time_per_epoch = time.time() - start_time
                train_abs_rmse = np.sqrt(np.mean(flatten_results(train_results, pos=3))) * HARTREE_TO_KCAL_PER_MOL
                learning_rate = train_results[0][1]
                local_epoch_count = train_results[0][2]

                test_abs_rmse = trainer.eval_abs_rmse(rd_test)
                print(time.strftime("%Y-%m-%d %H:%M:%S"), 'tpe:', "{0:.2f}s,".format(time_per_epoch), 'g-epoch', global_epoch, 'l-epoch', local_epoch_count, 'lr', "{0:.0e}".format(learning_rate), \
                    'train/test abs rmse:', "{0:.2f} kcal/mol,".format(train_abs_rmse), "{0:.2f} kcal/mol".format(test_abs_rmse), end='')

                if test_abs_rmse < best_test_score:
                    gdb11_abs_rmse = trainer.eval_abs_rmse(rd_gdb11)
                    print(' | gdb11 abs rmse', "{0:.2f} kcal/mol | ".format(gdb11_abs_rmse), end='')

                    best_test_score = test_abs_rmse
                    sess.run([trainer.incr_global_epoch_count, trainer.reset_local_epoch_count])
                else:
                    sess.run([trainer.incr_global_epoch_count, trainer.incr_local_epoch_count])

                trainer.save_numpy(save_file)

                print('', end='\n')

            print("==========Decreasing learning rate==========")
            sess.run(trainer.decr_learning_rate)
            sess.run(trainer.reset_local_epoch_count)
            # trainer.load_best_params()

    return
Example #11
    def test_featurize(self):

        dummy_elem = [1.0, 2.0, 3.0, 4.0]
        all_Xs = []
        all_ys = []

        for mol_idx in range(125):
            num_atoms = np.random.randint(12, 64)
            mol_coords = []
            for i in range(num_atoms):
                atom_type = np.random.randint(0, 4)
                x = np.random.rand()
                y = np.random.rand()
                z = np.random.rand()
                mol_coords.append((atom_type, x, y, z))
            all_Xs.append(np.array(mol_coords, dtype=np.float32))
            all_ys.append(np.random.rand())

        rd = RawDataset(all_Xs, all_ys)

        sym = Symmetrizer()

        bam = tf.placeholder(tf.float32)  # fed with the concatenated batch coordinates (batch_Xs)
        bao = tf.placeholder(tf.int32)  # fed with the per-molecule row offsets (batch_offsets)

        feat_op = sym.featurize_batch(bam, bao)

        with tf.Session() as sess:
            with tempfile.TemporaryDirectory() as tmpd:
                batch_size = 16
                fd = rd.featurize(batch_size=batch_size,
                                  data_dir=tmpd,
                                  symmetrizer=sym)
                for batch_idx, (af, ao, gi, mi,
                                my) in enumerate(fd.iterate(shuffle=False)):

                    assert len(ao) > 0
                    assert len(ao) <= 4

                    results = af[gi]

                    s_m_idx = batch_idx * batch_size
                    e_m_idx = min((batch_idx + 1) * batch_size, len(all_Xs))

                    pre_concat_X = all_Xs[s_m_idx:e_m_idx]
                    pre_concat_y = all_ys[s_m_idx:e_m_idx]

                    batch_Xs = np.concatenate(pre_concat_X, axis=0)
                    batch_offsets = np.array(compute_offsets(pre_concat_X),
                                             dtype=np.int32)

                    expected = sess.run(feat_op,
                                        feed_dict={
                                            bam: batch_Xs,
                                            bao: batch_offsets
                                        })

                    np.testing.assert_array_equal(results, expected)

                    expected_mol_idxs = []
                    for m_idx, mm in enumerate(pre_concat_X):
                        expected_mol_idxs.extend([m_idx] * len(mm))

                    np.testing.assert_array_equal(mi, expected_mol_idxs)
                    np.testing.assert_array_equal(my, pre_concat_y)
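The expected_mol_idxs constructed at the end of the test simply repeat each molecule's batch-local index once per atom; a compact standalone equivalent:

import numpy as np

# per-atom molecule indices for a batch: each molecule contributes
# len(mol) copies of its batch-local index
pre_concat_X = [np.zeros((3, 4)), np.zeros((2, 4)), np.zeros((4, 4))]
mol_idxs = np.concatenate([[idx] * len(mol) for idx, mol in enumerate(pre_concat_X)])
print(mol_idxs)  # [0 0 0 1 1 2 2 2 2]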