def assign_samples(self):
    """Derive the ``sampled_*`` bookkeeping attributes from ``self.historical``.

    Every row of ``self.historical`` is treated as ``descriptor + [objective]``.
    From those rows this method fills in:

    * ``sampled_X`` - each row without its trailing objective value.
    * ``sampled_names`` - the result of ``pal_strings.parseNum`` on each
      descriptor (called with ``sort=True`` and ``num_has_IS=True``).
    * ``sampled_indices`` - position of each name in ``self.combinations``.
    * ``sampled_solvent_properties`` - array built from ``row[-4:-2]``.
    * ``sampled_objectives`` - the trailing value of each row.

    Raises:
        AssertionError: if two historical rows map to the same index.
        ValueError: if a name is not found in ``self.combinations``.
    """
    rows = self.historical

    # Strip the objective (last entry) to recover the bare descriptor.
    self.sampled_X = [row[:-1] for row in rows]

    names = []
    for descriptor in self.sampled_X:
        names.append(
            pal_strings.parseNum(descriptor,
                                 self.solvents,
                                 mixed_halides=self.mixed_halides,
                                 sort=True,
                                 num_has_IS=True))
    self.sampled_names = names

    indices = []
    for name in names:
        indices.append(self.combinations.index(name))
    self.sampled_indices = indices

    # A repeated index means the same combination was sampled twice.
    n_unique = len(set(self.sampled_indices))
    assert len(self.sampled_indices) == n_unique, "Error - Sampled indices contain duplicates!"

    # row[-4:-2] picks the two entries just before the final two of the row.
    # NOTE(review): for rows laid out as [..., density, dielectric, index,
    # objective] this is (density, dielectric) - confirm against the producer.
    solvent_rows = []
    for row in rows:
        solvent_rows.append(np.array(row[-4:-2]))
    self.sampled_solvent_properties = np.array(solvent_rows)

    self.sampled_objectives = [row[-1] for row in rows]
def sample(self, specify=None, debug=False, MAX_LOOP=10, allow_reduced=False):
    '''
    Generate the historical data set by evaluating EVERY information source
    in ``self.IS`` at a set of sample points.

    When ``specify`` is None, Latin hypercube designs of
    ``self.historical_nsample`` points are drawn (at most ``MAX_LOOP`` times)
    and rounded onto the discrete descriptor space until a design without
    duplicates is found.  Otherwise the points are looked up in
    ``self.combinations``.

    Each information source is called as ``obj(halides, cation, solvent)``
    and may return either a float or a job-like object exposing ``wait()``
    and ``get_result()``.  The resulting rows,
    ``[IS index] + descriptor + [objective]``, are pickled to
    ``self.fname_historical`` and stored on ``self.historical``.

    **Parameters**

        specify: *None, int, or list of int, optional*
            Indices into ``self.combinations`` to sample instead of using a
            Latin hypercube.  Sets ``self.historical_nsample`` to the number
            of indices given.
        debug: *bool, optional*
            Print progress information.
        MAX_LOOP: *int, optional*
            Maximum number of Latin hypercube draws attempted.
        allow_reduced: *bool, optional*
            If True, accept fewer unique samples than requested and shrink
            ``self.historical_nsample`` accordingly.

    **Raises**

        AssertionError: no duplicate-free design was found and
        ``allow_reduced`` is False.
    '''
    if debug:
        print("Collecting LHS samples...")

    if specify is None:
        # Upper edges of equal-width bins on (0, 1], one bin per solvent name.
        n_solvents = len(self.S)
        solvent_ranges = [i * 1.0 / n_solvents for i in range(1, n_solvents + 1)]

        def solvent_descriptor(v):
            # Map a coordinate in (0, 1] to the properties of one solvent.
            name = self.S[[v <= edge for edge in solvent_ranges].index(True)]
            props = self.solvents[name]
            return [props["density"], props["dielectric"], props["index"]]

        def trio(v):
            # One-hot encode which third of (0, 1] the coordinate falls in.
            return [
                int(v > (chk - 1.0 / 3.0) and v <= chk)
                for chk in [1. / 3., 2. / 3., 1.0]
            ]

        # One coordinate per halide site, one for the cation and one for the
        # solvent.  The solvent needs its own coordinate: reading it from the
        # cation's coordinate ties the solvent choice to the cation.
        n_halides = 3 if self.mixed_halides else 1
        n_dims = n_halides + 2

        counter, samples = 0, []
        while len(samples) != self.historical_nsample and counter < MAX_LOOP:
            candidates = []
            for point in doe_lhs.lhs(n_dims, self.historical_nsample):
                descriptor = []
                # Halide coordinates are sorted so that permutations of the
                # same halides collapse onto one descriptor.
                for h in sorted(point[:n_halides]):
                    descriptor += trio(h)
                descriptor += trio(point[n_halides])
                descriptor += solvent_descriptor(point[-1])
                candidates.append(tuple(descriptor))
            # Rounding can map distinct LHS points onto the same descriptor.
            samples = [list(c) for c in set(candidates)]
            counter += 1
    else:
        if isinstance(specify, int):
            specify = [specify]
        self.historical_nsample = len(specify)
        samples = [self.combinations[i] for i in specify]
        samples = pal_strings.alphaToNum(samples,
                                         self.solvents,
                                         mixed_halides=self.mixed_halides,
                                         name_has_IS=True)
        # Remove the IS label from the descriptor
        samples = [s[1:] for s in samples]

    if allow_reduced:
        if len(samples) != self.historical_nsample:
            print(
                "Warning - Will sample from subspace due to duplicates (%d instead of %d)."
                % (len(samples), self.historical_nsample))
        self.historical_nsample = len(samples)
    elif specify is None and len(samples) != self.historical_nsample:
        # Judge success by the outcome, not by the attempt counter, so a
        # design found on the last permitted attempt is still accepted.
        raise AssertionError(
            "Error - Unable to sample from space without duplicates!")

    if debug:
        print("Will sample %s" % str(samples))

    # Now, run these simulations to get the sample points
    jobs = []
    for descriptor in samples:
        if debug:
            print("Running %s..." % (descriptor,))
        name = pal_strings.parseNum(descriptor,
                                    self.solvents,
                                    mixed_halides=self.mixed_halides,
                                    num_has_IS=False)
        hat, cat, _, solv, _ = pal_strings.parseName(name, name_has_IS=False)
        cat = cat[0]
        if not self.mixed_halides:
            hat = hat[0]
        if debug:
            print("\tAdding %s to sample runs..." % name)
        for j, obj in enumerate(self.IS):
            jobs.append([[j] + copy.deepcopy(descriptor), obj(hat, cat, solv)])

    # Now, get results from each simulation
    samples = []
    for descriptor, job in jobs:
        if isinstance(job, float):
            # In special situations, when we are reading from a list for
            # example, there is no job object and the value is used directly.
            value = job
        else:
            job.wait()
            value = job.get_result()
        samples.append(descriptor + [value])
        if debug:
            name = pal_strings.parseNum(descriptor,
                                        self.solvents,
                                        mixed_halides=self.mixed_halides,
                                        num_has_IS=True)
            print("\t%s was found as %lg" % (name, value))

    # Save the sampled data.  Pickle streams are binary, and the context
    # manager closes the file even if dumping fails.
    with open(self.fname_historical, "wb") as fptr:
        pickle.dump(samples, fptr)
    self.historical = samples

    if debug:
        print("Done Collecting Samples\n")
# Build the combination names from the keys of IS0.  Each key is split on
# whitespace into three fields that are re-assembled as
# "<field 1>Pb<field 0>_<field 2>"; sim.combinations additionally carries an
# "_<IS index>" suffix for every information source.
split_keys = [key.split() for key in IS0.keys()]
combos_no_IS = [k[1] + "Pb" + k[0] + "_" + k[2] for k in split_keys]
sim.combinations = [
    combo + "_" + str(IS)
    for combo in combos_no_IS
    for IS in range(len(sim.IS))
]

# Because the combinations are overridden, the historical sample is generated
# here as well: every combination is used.
sim.historical_nsample = len(combos_no_IS)
choices = combos_no_IS
tmp_data = pal_strings.alphaToNum(choices,
                                  solvents,
                                  mixed_halides=True,
                                  name_has_IS=False)

# Evaluate every information source at every descriptor; each row is
# [IS index] + descriptor + [objective].
data = []
for IS in range(len(sim.IS)):
    evaluate = sim.IS[IS]
    for d in tmp_data:
        name = pal_strings.parseNum(d,
                                    solvents,
                                    mixed_halides=True,
                                    num_has_IS=False)
        h, c, _, s, _ = pal_strings.parseName(name, name_has_IS=False)
        data.append([IS] + d + [evaluate(h, c[0], s)])

# Pull out the objectives per source (source 1 is scaled by 1.8), then order
# the pairs by the source-0 value.  Note that IS0 / IS1 are rebound here from
# their earlier meaning to tuples of objective values.
IS0 = np.array([row[-1] for row in data if row[0] == 0])
IS1 = np.array([row[-1] * 1.8 for row in data if row[0] == 1])
paired = sorted(zip(IS0, IS1))
IS0, IS1 = zip(*paired)
# IS0 = IS0 / np.linalg.norm(IS0)
# IS1 = IS1 / np.linalg.norm(IS1)
# print IS0
# print IS1
import matplotlib.pyplot as plt
def run_misokg(run_index):
    """Configure and run one misoKG optimisation over two enthalpy sources.

    Two pickled lookup tables are loaded and wrapped as information sources,
    a random historical sample of 10 combinations is evaluated on both
    sources and written to ``data_dumps/``, the mean / covariance model and
    hyperparameter bounds are attached to an ``Optimizer``, and finally
    ``sim.run()`` is called.

    Args:
        run_index: Integer used (via ``%d``) to name this run's files under
            ``data_dumps/``.

    Side effects:
        Reads ``enthalpy_N1_R3_Ukcal-mol`` and ``enthalpy_N1_R2_Ukcal-mol``
        from the working directory; creates or truncates the log file and
        writes two ``.history`` pickles under ``data_dumps/``.
    """

    def load_table(path):
        # Pickle streams are binary; the context manager closes the handle.
        with open(path, "rb") as fptr:
            return pickle.load(fptr)

    def make_source(table):
        # Keys are "<joined halides> <cation> <solvent>"; values are negated.
        return lambda h, c, s: -1.0 * table[' '.join([''.join(h), c, s])]

    # Store data for debugging
    IS0 = load_table("enthalpy_N1_R3_Ukcal-mol")
    IS1 = load_table("enthalpy_N1_R2_Ukcal-mol")

    # Generate the main object
    sim = Optimizer()

    # Assign simulation properties (MAP is the alternative objective)
    sim.hyperparameter_objective = MLE

    ###########################################################################
    # File names
    sim.fname_out = "enthalpy_misokg.dat"
    sim.fname_historical = None

    # Information sources, in order from expensive to cheap
    sim.IS = [make_source(IS0), make_source(IS1)]
    sim.costs = [1.0, 0.1]

    # Start every run from an empty log file (created if missing, truncated
    # if present) without shelling out.
    sim.logger_fname = "data_dumps/%d_misokg.log" % run_index
    with open(sim.logger_fname, "w"):
        pass

    sim.obj_vs_cost_fname = "data_dumps/%d_misokg.dat" % run_index
    sim.mu_fname = "data_dumps/%d_mu_misokg.dat" % run_index
    sim.sig_fname = "data_dumps/%d_sig_misokg.dat" % run_index
    sim.combos_fname = "data_dumps/%d_combos_misokg.dat" % run_index
    sim.hp_fname = "data_dumps/%d_hp_misokg.dat" % run_index
    sim.acquisition_fname = "data_dumps/%d_acq_misokg.dat" % run_index
    sim.save_extra_files = True

    ########################################
    # Only the combinations present in IS0 can be evaluated, so the
    # historical sample is generated here from that reduced list.
    # NOTE(review): sim.combinations itself is not overridden in this
    # function - confirm that is intended.
    combos_no_IS = [
        k[1] + "Pb" + k[0] + "_" + k[2]
        for k in [key.split() for key in IS0.keys()]
    ]
    sim.historical_nsample = 10
    choices = np.random.choice(combos_no_IS,
                               sim.historical_nsample,
                               replace=False)
    tmp_data = pal_strings.alphaToNum(choices,
                                      solvents,
                                      mixed_halides=True,
                                      name_has_IS=False)

    # Rows are [IS index] + descriptor + [objective].
    data = []
    for IS in range(len(sim.IS)):
        for d in tmp_data:
            name = pal_strings.parseNum(d,
                                        solvents,
                                        mixed_halides=True,
                                        num_has_IS=False)
            h, c, _, s, _ = pal_strings.parseName(name, name_has_IS=False)
            c = c[0]
            data.append([IS] + d + [sim.IS[IS](h, c, s)])

    sim.fname_historical = "data_dumps/%d.history" % run_index
    with open(sim.fname_historical, "wb") as fptr:
        pickle.dump(data, fptr)

    # Same history restricted to information source 0.
    simple_data = [d for d in data if d[0] == 0]
    with open("data_dumps/%d_reduced.history" % run_index, "wb") as fptr:
        pickle.dump(simple_data, fptr)
    ########################################

    sim.n_start = 10  # The number of starting MLE samples
    sim.reopt = 20
    sim.ramp_opt = None
    sim.parallel = False

    # Possible compositions by default
    sim.A = ["Cs", "MA", "FA"]
    sim.B = ["Pb"]
    sim.X = ["Cl", "Br", "I"]
    sim.solvents = copy.deepcopy(solvents)
    sim.S = list(set([v["name"] for v in sim.solvents.values()]))
    sim.mixed_halides = True
    sim.mixed_solvents = False

    # Parameters for debugging and overwriting
    sim.debug = False
    sim.verbose = True
    sim.overwrite = True  # If True, warning, else Error

    sim.acquisition = getNextSample_misokg

    # Functional forms of our mean and covariance
    # MEAN: SCALE * mu_alpha + mu_zeta  (SCALE is 4 for mixed halides, else 2)
    # COV: sig_alpha * |X><X| + sig_beta * I_N + sig_zeta + MaternKernel(S, weights, sig_m)
    SCALE = 4.0 if sim.mixed_halides else 2.0

    def mean(X, Y, theta):
        # Constant mean, one entry per observation in Y.
        mu = np.array([SCALE * theta.mu_alpha + theta.mu_zeta for _ in Y])
        return mu

    sim.mean = mean

    def cov(X0, Y, theta):
        points = np.array(X0)
        # All but the last three columns enter the linear kernel.
        composition = points[:, :-3]
        A = theta.sig_alpha * np.dot(composition, composition.T)
        B = theta.sig_beta * np.eye(len(X0))
        C = theta.sig_zeta
        # Columns -3 and -2 feed mk52 with length scales l1 and l2.
        D = mk52(points[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        Kx = A + B + C + D

        # Symmetric matrix of rho values between information sources.
        Ks = np.array([
            [theta.rho[str(sorted([i, j]))] for j in range(theta.n_IS)]
            for i in range(theta.n_IS)
        ])
        if theta.normalize_Ks:
            Ks = Ks / np.linalg.norm(Ks)

        # Scale each source by its own factor (two sources: e1, e2).
        e = np.diag(np.array([theta.e1, theta.e2]))
        Ks = e.dot(Ks.dot(e))

        return np.kron(Ks, Kx)

    sim.cov = cov

    # Hyperparameters start unset (None).  Each bound is (lower, upper), where
    # the upper bound is either a number or a callable taking (X, Y).
    def max_of_Y(_, Y):
        return max(Y)

    def ten_var_of_Y(_, Y):
        return 10.0 * np.var(Y)

    def var_of_Y(_, Y):
        return np.var(Y)

    sim.theta.bounds = {}
    hyperparameter_bounds = [
        ('mu_alpha', (1E-3, max_of_Y)),
        ('sig_alpha', (1E-2, ten_var_of_Y)),
        ('sig_beta', (1E-2, ten_var_of_Y)),
        ('mu_zeta', (1E-3, max_of_Y)),
        ('sig_zeta', (1E-2, ten_var_of_Y)),
        ('sig_m', (1E-2, var_of_Y)),
        ('l1', (1E-1, 1)),
        ('l2', (1E-1, 1)),
        ('e1', (1E-1, 1.0)),
        ('e2', (1E-1, 1.0)),
    ]
    for hp_name, hp_bounds in hyperparameter_bounds:
        setattr(sim.theta, hp_name, None)
        sim.theta.bounds[hp_name] = hp_bounds

    # NOTE! rho is a reserved keyword in misoKG, keyed by the sorted pair of
    # information-source indices, and is used for scaling our IS.
    sim.theta.rho = {"[0, 0]": 1.0, "[0, 1]": 0.96, "[1, 1]": 1.0}
    sim.theta.bounds['rho [0, 0]'] = (0.1, 1.0)
    sim.theta.bounds['rho [0, 1]'] = (0.1, 1.0)
    sim.theta.bounds['rho [1, 1]'] = (0.1, 1.0)

    sim.theta.set_hp_names()

    sim.primary_rho_opt = False
    sim.update_hp_only_with_IS0 = False
    sim.update_hp_only_with_overlapped = False
    sim.theta.normalize_L = False
    sim.theta.normalize_Ks = False

    # sim.force_rho_psd (a test feature that over-wrote rho to be PSD) is
    # left unset.

    sim.recommendation_kill_switch = "FAPbBrBrCl_THTO_0"

    ###########################################################################
    # Start simulation
    sim.run()
def run_misokg(run_index):
    """Configure and run one misoKG optimisation over two enthalpy sources.

    Two pickled lookup tables are loaded and wrapped as information sources,
    a random historical sample of 10 combinations is evaluated on both
    sources and written to ``data_dumps/``, the mean / covariance model and
    hyperparameter bounds are attached to an ``Optimizer``, and finally
    ``sim.run()`` is called.  In this variant every rho entry is left unset
    (None) and the covariance couples the sources through a lower-triangular
    factor of rho.

    Args:
        run_index: Integer used (via ``%d``) to name this run's files under
            ``data_dumps/``.

    Side effects:
        Reads ``enthalpy_N1_R3_Ukcal-mol`` and ``enthalpy_N1_R2_Ukcal-mol``
        from the working directory; creates or truncates the log file and
        writes two ``.history`` pickles under ``data_dumps/``.
    """

    def load_table(path):
        # Pickle streams are binary; the context manager closes the handle.
        with open(path, "rb") as fptr:
            return pickle.load(fptr)

    def make_source(table):
        # Keys are "<joined halides> <cation> <solvent>"; values are negated.
        return lambda h, c, s: -1.0 * table[' '.join([''.join(h), c, s])]

    # Store data for debugging
    IS0 = load_table("enthalpy_N1_R3_Ukcal-mol")
    IS1 = load_table("enthalpy_N1_R2_Ukcal-mol")

    # Generate the main object
    sim = Optimizer()

    # Assign simulation properties (MAP is the alternative objective)
    sim.hyperparameter_objective = MLE

    ###########################################################################
    # File names
    sim.fname_out = "enthalpy_misokg.dat"
    sim.fname_historical = None

    # Information sources, in order from expensive to cheap
    sim.IS = [make_source(IS0), make_source(IS1)]
    sim.costs = [
        1.0,
        0.1,
    ]

    # Start every run from an empty log file (created if missing, truncated
    # if present) without shelling out.
    sim.logger_fname = "data_dumps/%d_misokg.log" % run_index
    with open(sim.logger_fname, "w"):
        pass

    sim.obj_vs_cost_fname = "data_dumps/%d_misokg.dat" % run_index
    sim.mu_fname = "data_dumps/%d_mu_misokg.dat" % run_index
    sim.sig_fname = "data_dumps/%d_sig_misokg.dat" % run_index
    sim.combos_fname = "data_dumps/%d_combos_misokg.dat" % run_index
    sim.hp_fname = "data_dumps/%d_hp_misokg.dat" % run_index
    sim.acquisition_fname = "data_dumps/%d_acq_misokg.dat" % run_index
    sim.save_extra_files = True

    ########################################
    # Only the combinations present in IS0 can be evaluated, so the
    # historical sample is generated here from that reduced list.
    # NOTE(review): sim.combinations itself is not overridden in this
    # function - confirm that is intended.
    combos_no_IS = [
        k[1] + "Pb" + k[0] + "_" + k[2]
        for k in [key.split() for key in IS0.keys()]
    ]
    sim.historical_nsample = 10
    choices = np.random.choice(combos_no_IS,
                               sim.historical_nsample,
                               replace=False)
    tmp_data = pal_strings.alphaToNum(choices,
                                      solvents,
                                      mixed_halides=True,
                                      name_has_IS=False)

    # Rows are [IS index] + descriptor + [objective].
    data = []
    for IS in range(len(sim.IS)):
        for d in tmp_data:
            name = pal_strings.parseNum(d,
                                        solvents,
                                        mixed_halides=True,
                                        num_has_IS=False)
            h, c, _, s, _ = pal_strings.parseName(name, name_has_IS=False)
            c = c[0]
            data.append([IS] + d + [sim.IS[IS](h, c, s)])

    sim.fname_historical = "data_dumps/%d.history" % run_index
    with open(sim.fname_historical, "wb") as fptr:
        pickle.dump(data, fptr)

    # Same history restricted to information source 0.
    simple_data = [d for d in data if d[0] == 0]
    with open("data_dumps/%d_reduced.history" % run_index, "wb") as fptr:
        pickle.dump(simple_data, fptr)
    ########################################

    sim.n_start = 10  # The number of starting MLE samples
    sim.reopt = 10
    sim.ramp_opt = None
    sim.parallel = False

    # Possible compositions by default
    sim.A = ["Cs", "MA", "FA"]
    sim.B = ["Pb"]
    sim.X = ["Cl", "Br", "I"]
    sim.solvents = copy.deepcopy(solvents)
    sim.S = list(set([v["name"] for v in sim.solvents.values()]))
    sim.mixed_halides = True
    sim.mixed_solvents = False

    # Parameters for debugging and overwriting
    sim.debug = False
    sim.verbose = True
    sim.overwrite = True  # If True, warning, else Error

    sim.acquisition = getNextSample_misokg

    # Functional forms of our mean and covariance
    # MEAN: SCALE * mu_alpha + mu_zeta  (SCALE is 4 for mixed halides, else 2)
    # COV: sig_alpha * |X><X| + sig_beta * I_N + sig_zeta + MaternKernel(S, weights, sig_m)
    SCALE = 4.0 if sim.mixed_halides else 2.0

    def mean(X, Y, theta):
        # Constant mean, one entry per observation in Y.
        mu = np.array([SCALE * theta.mu_alpha + theta.mu_zeta for _ in Y])
        return mu

    sim.mean = mean

    def cov_new(X, Y, theta):
        # Collect the unique points of X in first-seen order, dropping the
        # leading IS identifier.  A set of tuples gives the membership test
        # in constant time instead of rescanning every earlier row.
        X0 = []
        seen = set()
        for x in X:
            key = tuple(x[1:])
            if key not in seen:
                seen.add(key)
                X0.append(x[1:])

        points = np.array(X0)
        # All but the last three columns enter the linear kernel.
        composition = points[:, :-3]
        A = theta.sig_alpha * np.dot(composition, composition.T)
        B = theta.sig_beta * np.eye(len(X0))
        C = theta.sig_zeta
        # Columns -3 and -2 feed mk52 with length scales l1 and l2.
        D = mk52(points[:, -3:-1], [theta.l1, theta.l2], theta.sig_m)
        Kx = A + B + C + D

        # Lower-triangular factor built from the rho entries.
        L = np.array([
            [
                theta.rho[str(sorted([i, j]))] if i >= j else 0.0
                for j in range(theta.n_IS)
            ]
            for i in range(theta.n_IS)
        ])
        # Normalize L to stop over-scaling values small
        L = L / np.linalg.norm(L)
        # L.L^T is positive semi-definite by construction.
        Ks = L.dot(L.T)

        # The full Kronecker product is returned, i.e. one block per pair of
        # information sources over all unique points.
        # NOTE(review): this assumes every unique point was sampled at every
        # information source, in source-major order - confirm with the caller.
        return np.kron(Ks, Kx)

    sim.cov = cov_new

    # Hyperparameters start unset (None).  Each bound is (lower, upper), where
    # the upper bound is either a number or a callable taking (X, Y).
    def max_of_Y(_, Y):
        return max(Y)

    def ten_var_of_Y(_, Y):
        return 10.0 * np.var(Y)

    def var_of_Y(_, Y):
        return np.var(Y)

    sim.theta.bounds = {}
    hyperparameter_bounds = [
        ('mu_alpha', (1E-3, max_of_Y)),
        ('sig_alpha', (1E-2, ten_var_of_Y)),
        ('sig_beta', (1E-2, ten_var_of_Y)),
        ('mu_zeta', (1E-3, max_of_Y)),
        ('sig_zeta', (1E-2, ten_var_of_Y)),
        ('sig_m', (1E-2, var_of_Y)),
        ('l1', (1E-1, 1)),
        ('l2', (1E-1, 1)),
    ]
    for hp_name, hp_bounds in hyperparameter_bounds:
        setattr(sim.theta, hp_name, None)
        sim.theta.bounds[hp_name] = hp_bounds

    # NOTE! rho is a reserved keyword in misoKG, keyed by the sorted pair of
    # information-source indices, and is used for scaling our IS.  All three
    # entries are left unset (None) here.
    sim.theta.rho = {"[0, 0]": None, "[0, 1]": None, "[1, 1]": None}
    sim.theta.bounds['rho [0, 0]'] = (0.1, 1.0)
    sim.theta.bounds['rho [0, 1]'] = (0.1, 1.0)
    sim.theta.bounds['rho [1, 1]'] = (0.1, 1.0)

    sim.theta.set_hp_names()

    sim.primary_rho_opt = False
    sim.update_hp_only_with_overlapped = True

    ###########################################################################
    # Start simulation
    sim.run()