def _evaluate(self, X, out, *args, **kwargs):
    """pymoo evaluation hook backed by the fitted surrogate models.

    Wraps the raw decision array in a DataSet, appends any fixed
    (categorical) columns, predicts every objective, flips the sign of
    maximized objectives (pymoo minimizes), and writes the objective and
    constraint values into ``out``.
    """
    # Wrap the raw pymoo array in a DataSet with the expected columns
    X = DataSet(np.atleast_2d(X), columns=self.X_columns)
    # Add in any fixed columns (i.e., values for categoricals)
    if self.fixed_variables is not None:
        for name, fixed_value in self.fixed_variables.items():
            X[name] = fixed_value
    F = np.zeros([X.shape[0], self.n_obj])
    for obj_idx in range(self.n_obj):
        F[:, obj_idx] = self.models[obj_idx].predict(X)
    # pymoo minimizes everything, so negate objectives to be maximized
    for obj_idx, var in enumerate(self.domain.output_variables):
        if var.maximize:
            F[:, obj_idx] *= -1
    out["F"] = F
    # Evaluate constraint left-hand sides against the candidate points
    if self.domain.constraints:
        constraint_res = [
            X.eval(c.lhs, resolvers=[X]) for c in self.domain.constraints
        ]
        out["G"] = [res.tolist()[0] for res in constraint_res]
def transform_inputs_outputs(self, ds, **kwargs):
    """Transform data into inputs and outputs for a strategy.

    This will do a log transform on the objectives (outputs).

    Parameters
    ----------
    ds: `DataSet`
        Dataset with columns corresponding to the inputs and objectives
        of the domain.
    copy: bool, optional
        Copy the dataset internally. Defaults to True.
    transform_descriptors: bool, optional
        Transform the descriptors into continuous variables. Default True.

    Returns
    -------
    inputs, outputs
        Datasets with the input and output datasets

    Raises
    ------
    ValueError
        If any objective value is negative (log is undefined there).
    """
    inputs, outputs = super().transform_inputs_outputs(ds, **kwargs)
    # BUG FIX: the original guard was ``(outputs.any() < 0).any()``, which
    # compares booleans to zero and is therefore always False, so negative
    # values slipped through to np.log. Check the actual values instead.
    if (outputs < 0).any().any():
        raise ValueError(
            "Cannot complete log transform for values less than zero.")
    outputs = outputs.apply(np.log)
    columns = [
        v.name for v in self.transform_domain.variables if v.is_objective
    ]
    # Rebuild as a DataSet keyed by the (renamed) objective columns
    outputs = DataSet(outputs.data_to_numpy(), columns=columns)
    return inputs, outputs
def problem_wrapper(X):
    # Closure used as the optimizer's objective function. Relies on
    # ``self``, ``input_columns`` and ``output_columns`` from the
    # enclosing scope.
    # Wrap the raw candidate array in a DataSet so the experiment can
    # evaluate it; tag the row with the generating strategy.
    X = DataSet(np.atleast_2d(X), columns=input_columns)
    X[("strategy", "METADATA")] = "NSGAII"
    result = self.experiment.run_experiments(X)
    if self.domain.constraints:
        # Evaluate each constraint's left-hand side against the point
        constraint_res = [
            X.eval(c.lhs, resolvers=[X]) for c in self.domain.constraints
        ]
        constraint_res = [c.tolist()[0] for c in constraint_res]
        # Return (objectives, constraints) for the constrained case
        return result[output_columns].to_numpy()[0,:], constraint_res
    else:
        return result[output_columns].to_numpy()[0,:]
def from_dict(cls, d):
    """Rebuild a TSEMO strategy, restoring any stored experiments."""
    tsemo = super().from_dict(d)
    serialized_experiments = d["strategy_params"]["all_experiments"]
    if serialized_experiments is not None:
        tsemo.all_experiments = DataSet.from_dict(serialized_experiments)
    return tsemo
def test_snar_benchmark(noise_level):
    """Test the SnAr benchmark: echoed inputs, known outputs, serialization."""
    benchmark = SnarBenchmark(noise_level=noise_level)
    col_names = [v.name for v in benchmark.domain.variables]
    inputs = {
        ("tau", "DATA"): 1.5,  # minutes
        ("equiv_pldn", "DATA"): 0.5,
        ("conc_dfnb", "DATA"): 0.1,  # molar
        ("temperature", "DATA"): 30.0,  # degrees celsius
    }
    # Check that results are reasonable
    conditions = DataSet([inputs], columns=col_names)
    results = benchmark.run_experiments(conditions)
    # Input conditions should be echoed back unchanged
    for key in ("tau", "equiv_pldn", "conc_dfnb", "temperature"):
        assert float(results[key]) == inputs[(key, "DATA")]
    # Outputs are only deterministic when no noise is added
    if noise_level == 0.0:
        assert np.isclose(results["sty"].values[0], 168.958672)
        assert np.isclose(results["e_factor"].values[0], 191.260294)
    # Round-trip serialization should not raise
    serialized = benchmark.to_dict()
    new_b = SnarBenchmark.from_dict(serialized)
    assert benchmark.noise_level == noise_level
    return results
def test_baumgartner_CC_emulator():
    """ Test the Baumgartner Cross Coupling emulator"""
    emulator = BaumgartnerCrossCouplingEmulator()
    col_names = [v.name for v in emulator.domain.variables]
    inputs = {
        ("catalyst", "DATA"): "tBuXPhos",
        ("base", "DATA"): "DBU",
        ("t_res", "DATA"): 328.717801570892,
        ("temperature", "DATA"): 30,
        ("base_equivalents", "DATA"): 2.18301549894049,
        ("yield", "DATA"): 0.19,
    }
    conditions = DataSet([inputs], columns=col_names)
    results = emulator.run_experiments(conditions)
    # Input conditions should be echoed back unchanged
    assert str(results["catalyst", "DATA"].iloc[0]) == inputs["catalyst", "DATA"]
    assert str(results["base", "DATA"].iloc[0]) == inputs["base", "DATA"]
    assert float(results["t_res"]) == inputs["t_res", "DATA"]
    assert float(results["temperature"]) == inputs["temperature", "DATA"]
    # Predicted yield from the emulator
    assert np.isclose(float(results["yld"]), 0.173581)
    # Round-trip serialization should not raise
    exp = BaumgartnerCrossCouplingEmulator.from_dict(emulator.to_dict())
    return results
def test_reizman_emulator(show_plots=False):
    """Test the pretrained Reizman-Suzuki emulator (case 1).

    Checks that input conditions pass through unchanged, that predictions
    fall in the expected range, and that serialization round-trips.
    """
    b = get_pretrained_reizman_suzuki_emulator(case=1)
    b.parity_plot(include_test=True)
    if show_plots:
        plt.show()
    values = {
        "catalyst": ["P1-L3"],
        "t_res": [600],
        "temperature": [30],
        "catalyst_loading": [0.498],
    }
    conditions = pd.DataFrame(values)
    conditions = DataSet.from_df(conditions)
    results = b.run_experiments(conditions, return_std=True)
    for name, value in values.items():
        # isinstance is the idiomatic, subclass-safe type check
        # (was: type(value[0]) == str)
        if isinstance(value[0], str):
            assert str(results[name].iloc[0]) == value[0]
        else:
            assert float(results[name].iloc[0]) == value[0]
    assert np.isclose(float(results["yld"]), 0.6, atol=15)
    assert np.isclose(float(results["ton"]), 1.1, atol=15)
    # Test serialization
    d = b.to_dict()
    exp = ReizmanSuzukiEmulator.from_dict(d)
    return results
def transform_inputs_outputs(self, ds, copy=True, **kwargs): """Transform of data into inputs and outptus for a strategy This will do a log transform on the objectives (outputs). Parameters ---------- ds: `DataSet` Dataset with columns corresponding to the inputs and objectives of the domain. copy: bool, optional Copy the dataset internally. Defaults to True. transform_descriptors: bool, optional Transform the descriptors into continuous variables. Default True. Returns ------- inputs, outputs Datasets with the input and output datasets """ # Get inputs and outputs inputs, outputs = super().transform_inputs_outputs(ds, copy=copy, **kwargs) # Scalarize using Chimera outputs_arr = outputs[self.ordered_objective_names].to_numpy() outputs_arr = (outputs_arr * self.directions ) # Change maximization to minimization scalarized_array = self._scalarize(outputs_arr) # Write scalarized objective back to DataSet outputs = DataSet(scalarized_array, columns=["chimera"]) return inputs, outputs
def test_baumgartner_CC_emulator(use_descriptors, include_cost, show_plots=False):
    """Test the pretrained Baumgartner Cross Coupling emulator.

    Verifies parity plotting, pass-through of input conditions,
    the predicted yield, and dict round-trip serialization.
    """
    b = get_pretrained_baumgartner_cc_emulator(use_descriptors=use_descriptors,
                                               include_cost=include_cost)
    b.parity_plot(include_test=True)
    if show_plots:
        plt.show()
    # (removed unused local ``columns`` present in the original)
    values = {
        "catalyst": ["tBuXPhos"],
        "base": ["DBU"],
        "t_res": [328.717801570892],
        "temperature": [30],
        "base_equivalents": [2.18301549894049],
    }
    conditions = pd.DataFrame(values)
    conditions = DataSet.from_df(conditions)
    results = b.run_experiments(conditions, return_std=True)
    # Input conditions should be echoed back unchanged
    assert str(results["catalyst"].iloc[0]) == values["catalyst"][0]
    assert str(results["base"].iloc[0]) == values["base"][0]
    assert float(results["t_res"]) == values["t_res"][0]
    assert float(results["temperature"]) == values["temperature"][0]
    assert np.isclose(float(results["yld"]), 0.042832638, atol=0.15)
    # Test serialization
    d = b.to_dict()
    exp = BaumgartnerCrossCouplingEmulator.from_dict(d)
    return results
def _transform_categorical(self, X):
    """Encode the categorical columns of ``X`` into numeric features.

    Uses min-max-scaled descriptor values when ``self.use_descriptors``
    is set and the variable has a descriptor DataSet; otherwise falls
    back to one-hot encoding via the transform's fitted encoders.
    """
    transformed_combos = {}
    for v in self.domain.input_variables:
        if v.variable_type == "categorical":
            values = X[v.name].to_numpy()
            # Descriptor transformation
            if self.use_descriptors and v.ds is not None:
                # Look up the descriptor rows for the selected levels
                transformed_values = v.ds.loc[values]
                for col in transformed_values:
                    transformed_combos[col] = transformed_values[
                        col[0]].to_numpy()
                    # Min-max scale each descriptor column to [0, 1]
                    # using the full descriptor table's range
                    var_max = v.ds[col[0]].max()
                    var_min = v.ds[col[0]].min()
                    transformed_combos[col] = (transformed_combos[col] -
                                               var_min) / (var_max - var_min)
            elif self.use_descriptors and v.ds is None:
                raise DomainError(
                    f"use_descriptors is true, but {v.name} has no descriptors."
                )
            # One hot encoding transformation
            else:
                enc = self.transform.encoders[v.name]
                one_hot_values = enc.transform(values[:, np.newaxis])
                # Encoder may return a sparse matrix; densify for indexing
                if issparse(one_hot_values):
                    one_hot_values = one_hot_values.toarray()
                for loc, l in enumerate(v.levels):
                    column_name = f"{v.name}_{l}"
                    transformed_combos[(column_name, "DATA")] = one_hot_values[:, loc]
    return DataSet(transformed_combos)
def from_dict(cls, d):
    """Rebuild a SNOBFIT strategy, restoring its previous-state tuple."""
    snobfit = super().from_dict(d)
    prev = d["strategy_params"]["prev_param"]
    if prev is not None:
        # First entry is (array, value, array); second is a list of DataSets
        prev[0] = (np.array(prev[0][0]), prev[0][1], np.array(prev[0][2]))
        prev[1] = [DataSet.from_dict(item) for item in prev[1]]
        snobfit.prev_param = prev
    return snobfit
def _check_datasets(self, dataset=None, csv_dataset=None):
    """Return the dataset to use; a CSV path takes precedence when given."""
    if not csv_dataset:
        return dataset
    if dataset:
        # Warn that the in-memory dataset is being discarded
        print(
            "Dataset and csv.dataset are given, hence dataset will be overwritten by csv.data."
        )
    return DataSet.read_csv(csv_dataset, index_col=None)
def _evaluate(self, X, out, *args, **kwargs):
    """pymoo hook: run the wrapped experiment on candidate points.

    Fills ``out["F"]`` with (sign-adjusted) objective values and, when
    the domain declares constraints, ``out["G"]`` with the constraint
    left-hand-side values.
    """
    candidates = DataSet(np.atleast_2d(X), columns=self.input_columns)
    candidates[("strategy", "METADATA")] = "NSGAII"
    F = self.experiment.run_experiments(candidates)
    F = F[self.output_columns].data_to_numpy()
    # pymoo minimizes everything, so flip maximized objectives
    for idx, var in enumerate(self.domain.output_variables):
        if var.maximize:
            F[:, idx] *= -1
    out["F"] = F
    # Add constraints if necessary
    if self.domain.constraints:
        g_vals = [
            candidates.eval(c.lhs, resolvers=[candidates])
            for c in self.domain.constraints
        ]
        out["G"] = [g.tolist()[0] for g in g_vals]
def test_dltz2_benchmark(num_inputs):
    """Test the DTLZ2 benchmark: all inputs at 0.5 give objectives ~0.7071."""
    benchmark = DTLZ2(num_inputs=num_inputs, num_objectives=2)
    points = {(f"x_{i}", "DATA"): [0.5] for i in range(num_inputs)}
    benchmark.run_experiments(DataSet(points))
    recorded = benchmark.data
    # Both objectives should equal 1/sqrt(2) at the symmetric point
    for obj in ("y_0", "y_1"):
        assert np.isclose(recorded[obj].iloc[0], 0.7071)
def from_dict(cls, d):
    """Rebuild a Nelder-Mead strategy, restoring its previous state."""
    nm = super().from_dict(d)
    prev = d["strategy_params"]["prev_param"]
    if prev is not None:
        # prev[0] is a jsonified dict of state; prev[1] a serialized DataSet
        nm.prev_param = [unjsonify_dict(prev[0]), DataSet.from_dict(prev[1])]
    return nm
def from_dict(variable_dict):
    """Construct a CategoricalVariable from its serialized dict form."""
    serialized_ds = variable_dict["ds"]
    # Descriptors are optional; only deserialize when present
    descriptors = (DataSet.from_dict(serialized_ds)
                   if serialized_ds is not None else None)
    return CategoricalVariable(
        name=variable_dict["name"],
        description=variable_dict["description"],
        levels=variable_dict["levels"],
        descriptors=descriptors,
        is_objective=variable_dict["is_objective"],
    )
def _categorical_enumerate(self, models): """Make predictions on all combinations of categorical domain""" combos = self.categorical_combos X = self._transform_categorical(combos) n_obj = len(self.domain.output_variables) y = np.zeros([X.shape[0], n_obj]) for i, v in enumerate(self.domain.output_variables): y[:, i] = models[i].predict(X) y = DataSet(y, columns=[v.name for v in self.domain.output_variables]) return X, y
def _nsga_optimize(self, models):
    """NSGA-II optimization with categorical domains"""
    from pymoo.algorithms.nsga2 import NSGA2
    from pymoo.factory import get_termination
    from pymoo.optimize import minimize

    problem = TSEMOInternalWrapper(models, self.domain)
    # Fixed seed keeps internal optimization reproducible
    self.internal_res = minimize(
        problem,
        NSGA2(pop_size=self.pop_size),
        get_termination("n_gen", self.generations),
        seed=1,
        verbose=False,
    )
    # Wrap the Pareto set and front back into DataSets
    X = DataSet(np.atleast_2d(self.internal_res.X).tolist(),
                columns=problem.X_columns)
    y = DataSet(np.atleast_2d(self.internal_res.F).tolist(),
                columns=[v.name for v in self.domain.output_variables])
    return X, y
def reset(self):
    """Reset the experiment

    This will clear all data.
    """
    self.prev_itr_time = None
    # Metadata columns record bookkeeping per experiment run
    metadata_cols = ["computation_t", "experiment_t", "strategy"]
    all_cols = [var.name for var in self.domain.variables] + metadata_cols
    self._data = DataSet(columns=all_cols, metadata_columns=metadata_cols)
    self.extras = []
def test_multitosingleobjective_transform():
    """MultitoSingleObjective should collapse two objectives into one scalar."""

    class MockStrategy(Strategy):
        def suggest_experiments(self, num_experiments, previous_results):
            inputs, outputs = self.transform.transform_inputs_outputs(
                previous_results)
            objectives = [v for v in self.domain.variables if v.is_objective]
            assert len(objectives) == 1
            assert objectives[0].name == "scalar_objective"
            # (50 + 90) / 2 == 70
            assert outputs["scalar_objective"].iloc[0] == 70.0
            return self.transform.un_transform(inputs)

        def reset(self):
            pass

    domain = Domain()
    domain += ContinuousVariable(
        name="temperature",
        description="reaction temperature in celsius",
        bounds=[50, 100],
    )
    domain += ContinuousVariable(
        name="flowrate_a",
        description="flow of reactant a in mL/min",
        bounds=[0.1, 0.5],
    )
    domain += ContinuousVariable(
        name="flowrate_b",
        description="flow of reactant b in mL/min",
        bounds=[0.1, 0.5],
    )
    domain += ContinuousVariable(
        name="yield_",
        description="",
        bounds=[0, 100],
        is_objective=True,
        maximize=True,
    )
    domain += ContinuousVariable(
        name="de",
        description="diastereomeric excess",
        bounds=[0, 100],
        is_objective=True,
        maximize=True,
    )
    row = {
        ("temperature", "DATA"): 60,
        ("flowrate_a", "DATA"): 0.5,
        ("flowrate_b", "DATA"): 0.5,
        ("yield_", "DATA"): 50,
        ("de", "DATA"): 90,
    }
    previous_results = DataSet([row],
                               columns=[v.name for v in domain.variables])
    transform = MultitoSingleObjective(domain,
                                       expression="(yield_+de)/2",
                                       maximize=True)
    strategy = MockStrategy(domain, transform=transform)
    strategy.suggest_experiments(5, previous_results)
def _nsga_optimize_mixed(self, models):
    """NSGA-II optimization with mixed continuous-categorical domains

    Runs one NSGA-II optimization per categorical combination, with the
    categorical columns held fixed inside the problem wrapper, then
    concatenates the per-combination Pareto sets and fronts.
    """
    from pymoo.algorithms.nsga2 import NSGA2
    from pymoo.optimize import minimize
    from pymoo.factory import get_termination

    combos = self.categorical_combos
    transformed_combos = self._transform_categorical(combos)
    X_list, y_list = [], []
    # Loop through all combinations of categoricals and run optimization
    bar = progress_bar(transformed_combos.iterrows(),
                       total=transformed_combos.shape[0])
    for _, combo in bar:
        # bar.comment = "NSGA Mixed Optimization"
        optimizer = NSGA2(pop_size=self.pop_size)
        # Fix this combination's categorical values inside the wrapper
        problem = TSEMOInternalWrapper(models,
                                       self.domain,
                                       fixed_variables=combo.to_dict())
        termination = get_termination("n_gen", self.generations)
        self.internal_res = minimize(problem,
                                     optimizer,
                                     termination,
                                     seed=1,
                                     verbose=False)
        X = np.atleast_2d(self.internal_res.X).tolist()
        y = np.atleast_2d(self.internal_res.F).tolist()
        X = DataSet(X, columns=problem.X_columns)
        y = DataSet(y, columns=[v.name for v in self.domain.output_variables])
        # Add in categorical variables
        for key, value in combo.to_dict().items():
            X[key] = value
        X_list.append(X)
        y_list.append(y)
    # Stack the results from every categorical combination
    return pd.concat(X_list, axis=0), pd.concat(y_list, axis=0)
def from_dict(cls, d):
    """Alternate constructor: rebuild an Experiment from its dict form.

    Restores the domain, experiment parameters, collected data, and any
    "extras" (dicts are unjsonified, lists become numpy arrays, anything
    else is kept as-is).
    """
    domain = Domain.from_dict(d["domain"])
    experiment_params = d.get("experiment_params", {})
    exp = cls(domain=domain, **experiment_params)
    exp._data = DataSet.from_dict(d["data"])
    for e in d["extras"]:
        # isinstance is the idiomatic, subclass-safe check
        # (was: type(e) == dict / type(e) == list)
        if isinstance(e, dict):
            exp.extras.append(unjsonify_dict(e))
        elif isinstance(e, list):
            exp.extras.append(np.array(e))
        else:
            exp.extras.append(e)
    return exp
def test_logspaceobjectives_transform():
    """LogSpaceObjectives should log-transform both objective columns."""

    class MockStrategy(Strategy):
        def suggest_experiments(self, num_experiments, previous_results):
            inputs, outputs = self.transform.transform_inputs_outputs(
                previous_results)
            objectives = [v for v in self.domain.variables if v.is_objective]
            assert len(objectives) == 2
            assert np.isclose(outputs["log_yield_"].iloc[0], np.log(50))
            assert np.isclose(outputs["log_de"].iloc[0], np.log(90))
            return self.transform.un_transform(inputs)

        def reset(self):
            pass

    domain = Domain()
    domain += ContinuousVariable(
        name="temperature",
        description="reaction temperature in celsius",
        bounds=[50, 100],
    )
    domain += ContinuousVariable(
        name="flowrate_a",
        description="flow of reactant a in mL/min",
        bounds=[0.1, 0.5],
    )
    domain += ContinuousVariable(
        name="flowrate_b",
        description="flow of reactant b in mL/min",
        bounds=[0.1, 0.5],
    )
    domain += ContinuousVariable(
        name="yield_",
        description="",
        bounds=[0, 100],
        is_objective=True,
        maximize=True,
    )
    domain += ContinuousVariable(
        name="de",
        description="diastereomeric excess",
        bounds=[0, 100],
        is_objective=True,
        maximize=True,
    )
    rows = {
        ("temperature", "DATA"): [60, 100],
        ("flowrate_a", "DATA"): [0.5, 0.4],
        ("flowrate_b", "DATA"): [0.5, 0.4],
        ("yield_", "DATA"): [50, 60],
        ("de", "DATA"): [90, 80],
    }
    previous_results = DataSet(rows,
                               columns=[v.name for v in domain.variables])
    transform = LogSpaceObjectives(domain)
    strategy = MockStrategy(domain, transform=transform)
    strategy.suggest_experiments(5, previous_results)
def optimize(self, **kwargs):
    """Run NSGA-II (platypus) against the wrapped experiment.

    Keyword arguments: ``iterations`` (int, default 1000) — number of
    algorithm iterations. Returns an OptimizeResult whose ``x`` and
    ``fun`` are DataSets of the feasible solutions' variables and
    objectives.
    """
    input_columns = [v.name for v in self.domain.variables if not v.is_objective]
    output_columns = [v.name for v in self.domain.variables if v.is_objective]

    def problem_wrapper(X):
        # Objective callback for platypus; wraps the candidate point in
        # a DataSet so the experiment can evaluate it.
        X = DataSet(np.atleast_2d(X), columns=input_columns)
        X[("strategy", "METADATA")] = "NSGAII"
        result = self.experiment.run_experiments(X)
        if self.domain.constraints:
            constraint_res = [
                X.eval(c.lhs, resolvers=[X]) for c in self.domain.constraints
            ]
            constraint_res = [c.tolist()[0] for c in constraint_res]
            # Constrained problems return (objectives, constraints)
            return result[output_columns].to_numpy()[0,:], constraint_res
        else:
            return result[output_columns].to_numpy()[0,:]

    # Run optimization
    self.problem.function = problem_wrapper
    algorithm = pp.NSGAII(self.problem, population_size=1000)
    iterations = kwargs.get("iterations", 1000)
    algorithm.run(iterations)
    # Collect only feasible solutions from the final population
    x = [
        [s.variables[i] for i in range(self.domain.num_variables())]
        for s in algorithm.result
        if s.feasible
    ]
    x = DataSet(x, columns=input_columns)
    y = [
        [s.objectives[i] for i in range(len(self.domain.output_variables))]
        for s in algorithm.result
        if s.feasible
    ]
    y = DataSet(y, columns=output_columns)
    return OptimizeResult(x=x, fun=y, success=True)
def __init__(self, domain, transform=None, **kwargs):
    """Initialize the TSEMO strategy.

    Computes the input bounds / column layout (descriptor columns for
    categoricals) and reads kernel, spectral-sampling, and NSGA-II
    settings from ``kwargs``.

    Raises
    ------
    DomainError
        If a categorical variable has no descriptors.
    """
    Strategy.__init__(self, domain, transform, **kwargs)
    # Input bounds
    lowers = []
    uppers = []
    self.columns = []
    for v in self.domain.input_variables:
        # isinstance is the idiomatic, subclass-safe type check
        # (was: type(v) == ContinuousVariable etc.)
        if isinstance(v, ContinuousVariable):
            lowers.append(v.bounds[0])
            uppers.append(v.bounds[1])
            self.columns.append(v.name)
        elif isinstance(v, CategoricalVariable) and v.ds is not None:
            # Use each descriptor column's min/max as its bounds
            lowers += v.ds.min().to_list()
            uppers += v.ds.max().to_list()
            self.columns += [c[0] for c in v.ds.columns]
        elif isinstance(v, CategoricalVariable) and v.ds is None:
            raise DomainError(
                "TSEMO only supports categorical variables with descriptors."
            )
    self.inputs_min = DataSet([lowers], columns=self.columns)
    self.inputs_max = DataSet([uppers], columns=self.columns)
    self.kern_dim = len(self.columns)
    # Kernel
    self.kernel = kwargs.get("kernel", GPy.kern.Exponential)
    # Spectral sampling settings
    self.n_spectral_points = kwargs.get("n_spectral_points", 1500)
    self.n_retries = kwargs.get("n_retries", 10)
    # NSGA-II tsemo_settings
    self.generations = kwargs.get("generations", 100)
    self.pop_size = kwargs.get("pop_size", 100)
    self.logger = kwargs.get("logger", logging.getLogger(__name__))
    self.reset()
def test_train_experimental_emulator():
    """Train an ExperimentalEmulator end-to-end and verify save/load."""
    model_name = "reizman_suzuki_case_1"  # no placeholders, so no f-string
    domain = ReizmanSuzukiEmulator.setup_domain()
    ds = DataSet.read_csv(DATA_PATH / f"{model_name}.csv")
    exp = ExperimentalEmulator(model_name, domain, dataset=ds, regressor=ANNRegressor)
    # Test grid search cross validation and training
    # params = {
    #     "regressor__net__max_epochs": [1, 1000],
    # }
    params = None
    exp.train(cv_folds=5,
              max_epochs=1000,
              random_state=100,
              search_params=params,
              verbose=0)
    # Testing
    res = exp.test()
    r2 = res["test_r2"].mean()
    assert r2 > 0.8
    # Test plotting
    fig, ax = exp.parity_plot(output_variables="yld", include_test=True)
    # Test saving/loading
    exp.save("test_ee")
    exp_2 = ExperimentalEmulator.load(model_name, "test_ee")
    # BUG FIX: the original asserted ``all(a) == all(b)``, which only
    # compares two truthiness booleans and passes for unequal sequences.
    # Compare the sequence contents instead.
    assert list(exp.descriptors_features) == list(exp_2.descriptors_features)
    assert exp.n_examples == exp_2.n_examples
    assert list(exp.output_variable_names) == list(exp_2.output_variable_names)
    assert exp.clip == exp_2.clip
    exp_2.X_train, exp_2.y_train, exp_2.X_test, exp_2.y_test = (
        exp.X_train,
        exp.y_train,
        exp.X_test,
        exp.y_test,
    )
    res = exp_2.test(X_test=exp.X_test, y_test=exp.y_test)
    exp.parity_plot(output_variables="yld", include_test=True)
    r2 = res["test_r2"].mean()
    assert r2 > 0.8
    shutil.rmtree("test_ee")
def to_dataset(self) -> DataSet:
    """Get design as a pandas dataframe

    Returns
    -------
    ds: summit.utils.dataset.Dataset
    """
    frame = pd.DataFrame([])
    for col_idx, variable in enumerate(self._domain.input_variables):
        if isinstance(variable, ContinuousVariable):
            column_values = self.get_values(variable.name)[:, 0]
        elif isinstance(variable, CategoricalVariable):
            # Map stored level indices back to the level labels
            level_indices = self.get_indices(variable.name)[:, 0]
            column_values = [variable.levels[j] for j in level_indices]
        frame.insert(col_idx, variable.name, column_values)
    return DataSet.from_df(frame)
def _train_baumgartner(use_descriptors=False, show_plots=False, save_plots=True):
    """Train the Baumgartner C-N cross-coupling emulator and save it.

    Parameters
    ----------
    use_descriptors: bool, optional
        Use catalyst/base descriptor features instead of one-hot encoding.
    show_plots: bool, optional
        Display the parity plot interactively.
    save_plots: bool, optional
        Save the parity plot to ``results/``.
    """
    # Setup
    model_name = "baumgartner_aniline_cn_crosscoupling"  # plain string (was a pointless f-string)
    domain = BaumgartnerCrossCouplingEmulator.setup_domain()
    ds = DataSet.read_csv(DATA_PATH / f"{model_name}.csv")
    # Create emulator and train
    model_name += "_descriptors" if use_descriptors else ""
    exp = ExperimentalEmulator(
        model_name,
        domain,
        dataset=ds,
        regressor=ANNRegressor,
        output_variable_names=["yield"],
        descriptors_features=["catalyst", "base"] if use_descriptors else [],
    )
    res = exp.train(max_epochs=MAX_EPOCHS,
                    cv_folds=CV_FOLDS,
                    random_state=100,
                    test_size=0.2)
    # Run test
    res_test = exp.test()
    res.update(res_test)
    # Save emulator
    model_path = pathlib.Path(MODELS_PATH / model_name)
    model_path.mkdir(exist_ok=True)
    exp.save(model_path)
    # Make plot for posterity's sake
    fig, ax = exp.parity_plot(include_test=True)
    if save_plots:
        fig.savefig(f"results/{model_name}.png", dpi=100)
    if show_plots:
        plt.show()
    return res
def to_dataset(self) -> DataSet:
    """Get design as a pandas dataframe

    Returns
    -------
    ds: summit.utils.dataset.Dataset
    """
    frame = pd.DataFrame([])
    insert_at = 0
    for variable in self._domain.variables:
        # Skip objectives and anything explicitly excluded from the design
        if variable.is_objective or variable.name in self.exclude:
            continue
        if isinstance(variable, ContinuousVariable):
            column_values = self.get_values(variable.name)[:, 0]
        elif isinstance(variable, CategoricalVariable):
            # Map stored level indices back to the level labels
            column_values = [
                variable.levels[j]
                for j in self.get_indices(variable.name)[:, 0]
            ]
        frame.insert(insert_at, variable.name, column_values)
        insert_at += 1
    return DataSet.from_df(frame)
def get_categorical_combinations(self):
    """Get all combinations of categoricals using full factorial design

    Returns
    -------
    ds: DataSet
        A dataset containing the combinations of all categorical variables.
    """
    categorical_vars = [
        v for v in self.input_variables if v.variable_type == "categorical"
    ]
    # Full factorial design over the level counts of each categorical
    doe = fullfact([len(v.levels) for v in categorical_vars])
    combos = {}
    for col, v in enumerate(categorical_vars):
        level_idx = doe[:, col].astype(int)
        combos[v.name, "DATA"] = [v.levels[j] for j in level_idx]
    return DataSet(combos)