Example #1
    def assign_samples(self):
        self.sampled_X = [v[:-1] for v in self.historical]
        self.sampled_names = [
            pal_strings.parseNum(v,
                                 self.solvents,
                                 mixed_halides=self.mixed_halides,
                                 sort=True,
                                 num_has_IS=True) for v in self.sampled_X
        ]
        self.sampled_indices = [
            self.combinations.index(v) for v in self.sampled_names
        ]

        assert len(self.sampled_indices) == len(set(self.sampled_indices)), \
            "Error - Sampled indices contain duplicates!"

        self.sampled_solvent_properties = np.array(
            [np.array(v[-4:-2]) for v in self.historical])
        self.sampled_objectives = [v[-1] for v in self.historical]
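
For context, `assign_samples` assumes each row of `self.historical` is a flat numeric descriptor ending in the objective value. Below is a minimal sketch of that assumed layout, inferred from the slicing above; the concrete numbers are illustrative only, not from the source.

# One hypothetical row of self.historical, matching the slices used above:
#   v[:-1]   -> descriptor passed to parseNum (num_has_IS=True)
#   v[-4:-2] -> solvent density and dielectric
#   v[-1]    -> measured objective
row = (
    [0]                    # information-source (IS) label
    + [1, 0, 0] * 3        # three one-hot halide trios (mixed-halide case)
    + [0, 1, 0]            # one-hot cation trio
    + [1.10, 42.0, 7.0]    # solvent density, dielectric, index
    + [-12.3]              # objective value
)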
Example #2
    def sample(self,
               specify=None,
               debug=False,
               MAX_LOOP=10,
               allow_reduced=False):
        '''
        Run, in parallel, N_samples evaluations of the objective functions to
        generate historical data.  Note that these are run for EVERY
        information source.
        '''

        if debug:
            print("Collecting LHS samples...")

        if specify is None:
            counter, samples = 0, []
            while len(samples) != self.historical_nsample and counter < MAX_LOOP:

                # Grab a latin hypercube sample: one column per halide site
                # (3 if mixed, else 1), plus one for the cation and one for
                # the solvent
                samples = doe_lhs.lhs(
                    int(self.mixed_halides) * 2 + 3, self.historical_nsample)
                # Round the LHS and figure out the samples
                # Partition [0, 1] into equal bins, one per solvent in self.S
                solvent_ranges = [
                    i * 1.0 / len(self.S) for i in range(1, len(self.S) + 1)
                ]
                # Map a uniform draw in [0, 1] to a solvent name
                solv = lambda v: self.S[[v <= s
                                         for s in solvent_ranges].index(True)]
                # Map a uniform draw in [0, 1] to a one-hot vector of length 3
                trio = lambda v: [
                    int(v > (chk - 1.0 / 3.0) and v <= chk)
                    for chk in [1. / 3., 2. / 3., 1.0]
                ]

                # Grab our samples
                if self.mixed_halides:
                    halides = [sorted([s[0], s[1], s[2]]) for s in samples]
                    halides = [[trio(h) for h in hh] for hh in halides]
                    samples = [
                        h[0] + h[1] + h[2] + trio(s[3]) + [
                            self.solvents[solv(s[-1])]["density"],
                            self.solvents[solv(s[-1])]["dielectric"],
                            self.solvents[solv(s[-1])]["index"]
                        ] for h, s in zip(halides, samples)
                    ]
                else:
                    samples = [
                        trio(s[0]) + trio(s[1]) + [
                            self.solvents[solv(s[-1])]["density"],
                            self.solvents[solv(s[-1])]["dielectric"],
                            self.solvents[solv(s[-1])]["index"]
                        ] for s in samples
                    ]

                # Ensure no duplicates
                samples = sorted(samples, key=lambda x: x[-1])
                samples = [tuple(s) for s in samples]
                samples = [list(s) for s in set(samples)]

                counter += 1
        else:
            if isinstance(specify, int):
                specify = [specify]
            self.historical_nsample = len(specify)
            samples = [self.combinations[i] for i in specify]
            samples = pal_strings.alphaToNum(samples,
                                             self.solvents,
                                             mixed_halides=self.mixed_halides,
                                             name_has_IS=True)
            # Remove the IS label from the descriptor
            samples = [s[1:] for s in samples]

        if allow_reduced:
            print(
                "Warning - Will sample from subspace due to duplicates (%d instead of %d)."
                % (len(samples), self.historical_nsample))
            self.historical_nsample = len(samples)
        elif specify is None:
            assert counter < MAX_LOOP, "Error - Unable to sample from space without duplicates!"

        if debug:
            print("Will sample %s" % str(samples))

        # Now, run these simulations to get the sample points
        jobs = []
        for i, sample in enumerate(samples):
            if debug:
                print "Running %s..." % sample
            s = pal_strings.parseNum(sample,
                                     self.solvents,
                                     mixed_halides=self.mixed_halides,
                                     num_has_IS=False)
            hat, cat, _, solv, _ = pal_strings.parseName(s, name_has_IS=False)
            cat = cat[0]
            if not self.mixed_halides:
                hat = hat[0]
            if debug:
                print("\tAdding %s to sample runs..." % s)

            for j, obj in enumerate(self.IS):
                jobs.append([[j] + copy.deepcopy(sample), obj(hat, cat, solv)])

        # Now, get results from each simulation
        samples = []
        for sample, j in jobs:
            if not isinstance(j, float):
                j.wait()
                samples.append(sample + [j.get_result()])
            # In special situations (e.g. when reading values from a list),
            # there is no job object and we can assign the value directly.
            else:
                samples.append(sample + [j])

            s = pal_strings.parseNum(samples[-1][:-1],
                                     self.solvents,
                                     mixed_halides=self.mixed_halides,
                                     num_has_IS=True)
            if debug:
                print("\t%s was found as %lg" % (s, samples[-1][-1]))

        # Save the sampled data
        with open(self.fname_historical, "wb") as fptr:
            pickle.dump(samples, fptr)
        self.historical = samples

        if debug:
            print("Done Collecting Samples\n")
Example #3
sim.combinations = [
    k[1] + "Pb" + k[0] + "_" + k[2] + "_" + str(IS)
    for k in [key.split() for key in IS0.keys()]
    for IS in range(len(sim.IS))
]
combos_no_IS = [
    k[1] + "Pb" + k[0] + "_" + k[2]
    for k in [key.split() for key in IS0.keys()]
]

# Because we do this, we should also generate our own historical sample
sim.historical_nsample = len(combos_no_IS)
choices = combos_no_IS
tmp_data = pal_strings.alphaToNum(
    choices,
    solvents,
    mixed_halides=True,
    name_has_IS=False)

data = []
for IS in range(len(sim.IS)):
    for i, d in enumerate(tmp_data):
        h, c, _, s, _ = pal_strings.parseName(
            pal_strings.parseNum(d, solvents, mixed_halides=True,
                                 num_has_IS=False),
            name_has_IS=False)
        c = c[0]
        data.append([IS] + d + [sim.IS[IS](h, c, s)])

IS0 = np.array([x[-1] for x in data if x[0] == 0])
IS1 = np.array([x[-1] * 1.8 for x in data if x[0] == 1])

IS0, IS1 = zip(*sorted(zip(IS0, IS1)))

# IS0 = IS0 / np.linalg.norm(IS0)
# IS1 = IS1 / np.linalg.norm(IS1)

# print IS0
# print IS1

import matplotlib.pyplot as plt
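
The snippet ends right after importing pyplot; here is a minimal sketch of the comparison plot that presumably follows (the labels are assumptions, not from the source):

plt.plot(IS0, label="IS0")
plt.plot(IS1, label="IS1 (scaled by 1.8)")
plt.xlabel("Sample index (sorted by IS0)")
plt.ylabel("Objective")
plt.legend()
plt.show()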
Example #4
def run_misokg(run_index):

    # Load the pre-computed objective values for each information source
    IS0 = pickle.load(open("enthalpy_N1_R3_Ukcal-mol", 'rb'))
    IS1 = pickle.load(open("enthalpy_N1_R2_Ukcal-mol", 'rb'))

    # Generate the main object
    sim = Optimizer()

    # Assign simulation properties
    #sim.hyperparameter_objective = MAP
    sim.hyperparameter_objective = MLE
    ###################################################################################################
    # File names
    sim.fname_out = "enthalpy_misokg.dat"
    sim.fname_historical = None

    # Information sources, in order from expensive to cheap
    sim.IS = [
        lambda h, c, s: -1.0 * IS0[' '.join([''.join(h), c, s])],
        lambda h, c, s: -1.0 * IS1[' '.join([''.join(h), c, s])]
    ]
    sim.costs = [1.0, 0.1]

    sim.logger_fname = "data_dumps/%d_misokg.log" % run_index
    if os.path.exists(sim.logger_fname):
        os.system("rm %s" % sim.logger_fname)
    os.system("touch %s" % sim.logger_fname)

    sim.obj_vs_cost_fname = "data_dumps/%d_misokg.dat" % run_index
    sim.mu_fname = "data_dumps/%d_mu_misokg.dat" % run_index
    sim.sig_fname = "data_dumps/%d_sig_misokg.dat" % run_index
    sim.combos_fname = "data_dumps/%d_combos_misokg.dat" % run_index
    sim.hp_fname = "data_dumps/%d_hp_misokg.dat" % run_index
    sim.acquisition_fname = "data_dumps/%d_acq_misokg.dat" % run_index
    sim.save_extra_files = True
    ########################################
    # Override the possible combinations with the reduced list of IS0
    # Because we do this, we should also generate our own historical sample
    combos_no_IS = [
        k[1] + "Pb" + k[0] + "_" + k[2]
        for k in [key.split() for key in IS0.keys()]
    ]
    sim.historical_nsample = 10
    choices = np.random.choice(combos_no_IS,
                               sim.historical_nsample,
                               replace=False)
    tmp_data = pal_strings.alphaToNum(choices,
                                      solvents,
                                      mixed_halides=True,
                                      name_has_IS=False)

    data = []
    for IS in range(len(sim.IS)):
        for i, d in enumerate(tmp_data):
            h, c, _, s, _ = pal_strings.parseName(pal_strings.parseNum(
                d, solvents, mixed_halides=True, num_has_IS=False),
                                                  name_has_IS=False)
            c = c[0]
            data.append([IS] + d + [sim.IS[IS](h, c, s)])

    sim.fname_historical = "data_dumps/%d.history" % run_index
    pickle.dump(data, open(sim.fname_historical, 'wb'))
    simple_data = [d for d in data if d[0] == 0]
    pickle.dump(simple_data,
                open("data_dumps/%d_reduced.history" % run_index, 'wb'))

    ########################################

    sim.n_start = 10  # The number of starting MLE samples
    sim.reopt = 20
    sim.ramp_opt = None
    sim.parallel = False

    # Possible compositions by default
    sim.A = ["Cs", "MA", "FA"]
    sim.B = ["Pb"]
    sim.X = ["Cl", "Br", "I"]
    sim.solvents = copy.deepcopy(solvents)
    sim.S = list(set([v["name"] for k, v in sim.solvents.items()]))
    sim.mixed_halides = True
    sim.mixed_solvents = False

    # Parameters for debugging and overwriting
    sim.debug = False
    sim.verbose = True
    sim.overwrite = True  # If True, warn on existing output; otherwise raise an error

    sim.acquisition = getNextSample_misokg

    # Functional forms of our mean and covariance
    # MEAN: 4 * mu_alpha + mu_zeta
    # COV: sig_alpha * |X><X| + sig_beta * I_N + sig_zeta + MaternKernel(S, weights, sig_m)
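    # A sketch of the same model written out (not from the source; SCALE
    # below is 4 for mixed halides and 2 otherwise):
    #   m(x)     = SCALE * mu_alpha + mu_zeta
    #   k(x, x') = sig_alpha * <x_comp, x'_comp>   (linear, composition part)
    #            + sig_beta * delta(x, x')          (white noise)
    #            + sig_zeta                          (constant offset)
    #            + Matern52(x_solv, x'_solv; l1, l2, sig_m)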

    SCALE = [2.0, 4.0][int(sim.mixed_halides)]

    # X is unused here; the prior mean is constant across the samples
    def mean(X, Y, theta):
        mu = np.array([SCALE * theta.mu_alpha + theta.mu_zeta for _ in Y])
        return mu

    sim.mean = mean

    def cov_old(X, Y, theta):
        A = theta.sig_alpha * np.dot(
            np.array(X)[:, 1:-3],
            np.array(X)[:, 1:-3].T)
        B = theta.sig_beta * np.diag(np.ones(len(X)))
        C = theta.sig_zeta
        D = mk52(np.array(X)[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        return theta.rho_matrix(X) * (A + B + C + D)

    def cov(X0, Y, theta):
        A = theta.sig_alpha * np.dot(
            np.array(X0)[:, :-3],
            np.array(X0)[:, :-3].T)
        B = theta.sig_beta * np.diag(np.ones(len(X0)))
        C = theta.sig_zeta
        D = mk52(np.array(X0)[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        Kx = A + B + C + D

        Ks = np.array([
            [theta.rho[str(sorted([i, j]))] for j in range(theta.n_IS)]
            for i in range(theta.n_IS)
        ])
        if theta.normalize_Ks:
            Ks = Ks / np.linalg.norm(Ks)

        e = np.diag(np.array([theta.e1, theta.e2]))
        Ks = e.dot(Ks.dot(e))

        return np.kron(Ks, Kx)

    sim.cov = cov

    sim.theta.bounds = {}
    sim.theta.mu_alpha, sim.theta.bounds['mu_alpha'] = None, (
        1E-3, lambda _, Y: max(Y))
    sim.theta.sig_alpha, sim.theta.bounds['sig_alpha'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.sig_beta, sim.theta.bounds['sig_beta'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.mu_zeta, sim.theta.bounds['mu_zeta'] = None, (
        1E-3, lambda _, Y: max(Y))
    sim.theta.sig_zeta, sim.theta.bounds['sig_zeta'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.sig_m, sim.theta.bounds['sig_m'] = None, (1E-2,
                                                        lambda _, Y: np.var(Y))
    sim.theta.l1, sim.theta.bounds['l1'] = None, (1E-1, 1)
    sim.theta.l2, sim.theta.bounds['l2'] = None, (1E-1, 1)

    sim.theta.e1, sim.theta.bounds['e1'] = None, (1E-1, 1.0)
    sim.theta.e2, sim.theta.bounds['e2'] = None, (1E-1, 1.0)

    # NOTE! "rho" is a reserved keyword in misoKG.  We generate a list of the
    # same length as the number of information sources and use it to scale
    # our IS.
    sim.theta.rho = {"[0, 0]": 1.0, "[0, 1]": 0.96, "[1, 1]": 1.0}
    sim.theta.bounds['rho [0, 0]'] = (0.1, 1.0)
    sim.theta.bounds['rho [0, 1]'] = (0.1, 1.0)
    sim.theta.bounds['rho [1, 1]'] = (0.1, 1.0)

    sim.theta.set_hp_names()

    sim.primary_rho_opt = False
    sim.update_hp_only_with_IS0 = False
    sim.update_hp_only_with_overlapped = False

    sim.theta.normalize_L = False
    sim.theta.normalize_Ks = False

    # This was a test feature that overwrote rho to be PSD
    # sim.force_rho_psd = True
    sim.recommendation_kill_switch = "FAPbBrBrCl_THTO_0"

    ###################################################################################################

    # Start simulation
    sim.run()
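
To make the multi-IS covariance concrete, here is a small self-contained check of the `np.kron(Ks, Kx)` construction used in `cov` above; the numbers are arbitrary and only the block structure matters:

import numpy as np

Ks = np.array([[1.0, 0.96],      # between-IS correlations (illustrative)
               [0.96, 1.0]])
Kx = np.array([[2.0, 0.5, 0.1],  # covariance over 3 hypothetical designs
               [0.5, 2.0, 0.5],
               [0.1, 0.5, 2.0]])

K = np.kron(Ks, Kx)
assert K.shape == (6, 6)
# Block (i, j) of K equals Ks[i, j] * Kx: rows are ordered IS0-designs
# first, then IS1-designs, matching the [IS] + descriptor rows above.
assert np.allclose(K[:3, :3], Ks[0, 0] * Kx)
assert np.allclose(K[:3, 3:], Ks[0, 1] * Kx)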
Example #5
def run_misokg(run_index):

    # Load the pre-computed objective values for each information source
    IS0 = pickle.load(open("enthalpy_N1_R3_Ukcal-mol", 'rb'))
    IS1 = pickle.load(open("enthalpy_N1_R2_Ukcal-mol", 'rb'))

    # Generate the main object
    sim = Optimizer()

    # Assign simulation properties
    #sim.hyperparameter_objective = MAP
    sim.hyperparameter_objective = MLE
    ###################################################################################################
    # File names
    sim.fname_out = "enthalpy_misokg.dat"
    sim.fname_historical = None

    # Information sources, in order from expensive to cheap
    sim.IS = [
        lambda h, c, s: -1.0 * IS0[' '.join([''.join(h), c, s])],
        lambda h, c, s: -1.0 * IS1[' '.join([''.join(h), c, s])]
    ]
    sim.costs = [1.0, 0.1]

    sim.logger_fname = "data_dumps/%d_misokg.log" % run_index
    if os.path.exists(sim.logger_fname):
        os.system("rm %s" % sim.logger_fname)
    os.system("touch %s" % sim.logger_fname)

    sim.obj_vs_cost_fname = "data_dumps/%d_misokg.dat" % run_index
    sim.mu_fname = "data_dumps/%d_mu_misokg.dat" % run_index
    sim.sig_fname = "data_dumps/%d_sig_misokg.dat" % run_index
    sim.combos_fname = "data_dumps/%d_combos_misokg.dat" % run_index
    sim.hp_fname = "data_dumps/%d_hp_misokg.dat" % run_index
    sim.acquisition_fname = "data_dumps/%d_acq_misokg.dat" % run_index
    sim.save_extra_files = True
    ########################################
    # Override the possible combinations with the reduced list of IS0
    # Because we do this, we should also generate our own historical sample
    combos_no_IS = [
        k[1] + "Pb" + k[0] + "_" + k[2]
        for k in [key.split() for key in IS0.keys()]
    ]
    #sim.historical_nsample = 240
    sim.historical_nsample = 10
    choices = np.random.choice(combos_no_IS,
                               sim.historical_nsample,
                               replace=False)
    tmp_data = pal_strings.alphaToNum(choices,
                                      solvents,
                                      mixed_halides=True,
                                      name_has_IS=False)

    data = []
    for IS in range(len(sim.IS)):
        for i, d in enumerate(tmp_data):
            h, c, _, s, _ = pal_strings.parseName(pal_strings.parseNum(
                d, solvents, mixed_halides=True, num_has_IS=False),
                                                  name_has_IS=False)
            c = c[0]
            data.append([IS] + d + [sim.IS[IS](h, c, s)])

    sim.fname_historical = "data_dumps/%d.history" % run_index
    pickle.dump(data, open(sim.fname_historical, 'wb'))
    simple_data = [d for d in data if d[0] == 0]
    pickle.dump(simple_data,
                open("data_dumps/%d_reduced.history" % run_index, 'wb'))

    ########################################

    sim.n_start = 10  # The number of starting MLE samples
    sim.reopt = 10
    sim.ramp_opt = None
    sim.parallel = False

    # Possible compositions by default
    sim.A = ["Cs", "MA", "FA"]
    sim.B = ["Pb"]
    sim.X = ["Cl", "Br", "I"]
    sim.solvents = copy.deepcopy(solvents)
    sim.S = list(set([v["name"] for k, v in sim.solvents.items()]))
    sim.mixed_halides = True
    sim.mixed_solvents = False

    # Parameters for debugging and overwriting
    sim.debug = False
    sim.verbose = True
    sim.overwrite = True  # If True, warn on existing output; otherwise raise an error

    sim.acquisition = getNextSample_misokg

    # Functional forms of our mean and covariance
    # MEAN: 4 * mu_alpha + mu_zeta
    # COV: sig_alpha * |X><X| + sig_beta * I_N + sig_zeta + MaternKernel(S, weights, sig_m)

    SCALE = [2.0, 4.0][int(sim.mixed_halides)]

    # X is unused here; the prior mean is constant across the samples
    def mean(X, Y, theta):
        mu = np.array([SCALE * theta.mu_alpha + theta.mu_zeta for _ in Y])
        return mu

    sim.mean = mean

    def cov_old(X, Y, theta):
        A = theta.sig_alpha * np.dot(
            np.array(X)[:, 1:-3],
            np.array(X)[:, 1:-3].T)
        B = theta.sig_beta * np.diag(np.ones(len(X)))
        C = theta.sig_zeta
        D = mk52(np.array(X)[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        return theta.rho_matrix(X) * (A + B + C + D)

    def cov_old2(X, Y, theta):
        A = theta.sig_alpha * np.dot(
            np.array(X)[:, 1:-3],
            np.array(X)[:, 1:-3].T)
        B = theta.sig_beta * np.diag(np.ones(len(X)))
        C = theta.sig_zeta
        D = mk52(np.array(X)[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        return theta.rho_matrix(X, use_psd=True) * (A + B + C + D)

    def cov_new(X, Y, theta):
        # Get a list of all unique X, removing initial IS identifier
        X0 = []
        for x in X:
            if not any([all([a == b for a, b in zip(x[1:], xchk)])
                        for xchk in X0]):
                X0.append(x[1:])

        A = theta.sig_alpha * np.dot(
            np.array(X0)[:, :-3],
            np.array(X0)[:, :-3].T)
        B = theta.sig_beta * np.diag(np.ones(len(X0)))
        C = theta.sig_zeta
        D = mk52(np.array(X0)[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        Kx = A + B + C + D

        L = np.array([
            np.array([
                theta.rho[str(sorted([i, j]))] if i >= j else 0.0
                for j in range(theta.n_IS)
            ]) for i in range(theta.n_IS)
        ])
        # Normalize L so the resulting Ks does not over- or under-scale values
        L = L / np.linalg.norm(L)
        # Force it to be positive semi-definite
        Ks = L.dot(L.T)

        return np.kron(Ks, Kx)

        # NOTE: the block below is unreachable (kept for reference).  It built
        # the full Kronecker covariance, then extracted the sub-matrix for
        # only the (IS, x) pairs actually sampled.
        # K = np.kron(Ks, Kx)
        # indices = []
        # for l in range(theta.n_IS):
        #     for i, x in enumerate(X0):
        #         test = [l] + list(x)
        #         if any([all([a == b for a, b in zip(test, xchk)])
        #                 for xchk in X]):
        #             indices.append(l * len(X0) + i)
        # K_local = K[np.ix_(indices, indices)]
        # return K_local

    sim.cov = cov_new

    sim.theta.bounds = {}
    sim.theta.mu_alpha, sim.theta.bounds['mu_alpha'] = None, (
        1E-3, lambda _, Y: max(Y))
    sim.theta.sig_alpha, sim.theta.bounds['sig_alpha'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.sig_beta, sim.theta.bounds['sig_beta'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.mu_zeta, sim.theta.bounds['mu_zeta'] = None, (
        1E-3, lambda _, Y: max(Y))
    sim.theta.sig_zeta, sim.theta.bounds['sig_zeta'] = None, (
        1E-2, lambda _, Y: 10.0 * np.var(Y))
    sim.theta.sig_m, sim.theta.bounds['sig_m'] = None, (1E-2,
                                                        lambda _, Y: np.var(Y))
    sim.theta.l1, sim.theta.bounds['l1'] = None, (1E-1, 1)
    sim.theta.l2, sim.theta.bounds['l2'] = None, (1E-1, 1)

    # NOTE! "rho" is a reserved keyword in misoKG.  We generate a list of the
    # same length as the number of information sources and use it to scale
    # our IS.
    # sim.theta.rho = {"[0, 0]": 1, "[0, 1]": None, "[1, 1]": 1}
    # sim.theta.bounds['rho [0, 1]'] = (-1.0, 1.0)
    # sim.theta.bounds['rho [0, 0]'] = (1, 1)
    # sim.theta.bounds['rho [1, 1]'] = (1, 1)

    sim.theta.rho = {"[0, 0]": None, "[0, 1]": None, "[1, 1]": None}
    sim.theta.bounds['rho [0, 0]'] = (0.1, 1.0)
    sim.theta.bounds['rho [0, 1]'] = (0.1, 1.0)
    sim.theta.bounds['rho [1, 1]'] = (0.1, 1.0)

    sim.theta.set_hp_names()

    sim.primary_rho_opt = False
    #sim.update_hp_only_with_IS0 = True
    sim.update_hp_only_with_overlapped = True

    ###################################################################################################

    # Start simulation
    sim.run()
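
As a sanity check on the `L.dot(L.T)` trick in `cov_new`, here is a standalone sketch showing that the lower-triangular construction yields a symmetric positive semi-definite `Ks` (the rho values are illustrative):

import numpy as np

n_IS = 2
rho = {"[0, 0]": 0.9, "[0, 1]": 0.5, "[1, 1]": 0.8}

# Lower-triangular L built from rho, as in cov_new above
L = np.array([[rho[str(sorted([i, j]))] if i >= j else 0.0
               for j in range(n_IS)]
              for i in range(n_IS)])
L = L / np.linalg.norm(L)  # keep the overall scale bounded
Ks = L.dot(L.T)

# L L^T is symmetric PSD by construction: eigenvalues are >= 0
assert np.allclose(Ks, Ks.T)
assert np.all(np.linalg.eigvalsh(Ks) >= -1e-12)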