def test_coef_predict_same_as_predict_XGB():
    """Check that evaluating predicted coefficients reproduces ``predict``.

    A FlexCode model using the ``XGBoost`` regression class is fit and
    tuned on synthetic data.  The densities returned by ``model.predict``
    are then compared with those obtained by evaluating the result of
    ``model.predict_coefs`` on the same grid; the largest absolute
    difference must not exceed ``1e-4``.
    """

    def draw_sample(size):
        # Synthetic data with p(z | x) = N(x, 1).
        covariate = np.random.normal(0, 1, size)
        response = np.random.normal(covariate, 1, size)
        return covariate, response

    # Samples are drawn in the order train, validation, test so that the
    # global NumPy RNG is consumed in a fixed sequence.
    train_x, train_z = draw_sample(5000)
    valid_x, valid_z = draw_sample(5000)
    test_x, _ = draw_sample(5000)

    # Parameterize model
    xgb_grid = {"max_depth": [3, 5, 8], "eta": [0.1, 0.2, 0.5]}
    model = flexcode.FlexCodeModel(
        XGBoost,
        max_basis=31,
        basis_system="cosine",
        regression_params=xgb_grid,
    )

    # Fit and tune model
    model.fit(train_x, train_z)
    model.tune(
        valid_x,
        valid_z,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Densities computed directly versus via the coefficient object.
    direct_cdes, grid = model.predict(test_x, n_grid=200)
    from_coefs = model.predict_coefs(test_x).evaluate(grid)

    worst_gap = np.max(np.abs(direct_cdes - from_coefs))
    assert worst_gap <= 1e-4
def test_example():
    """Run the basic FlexCode workflow end to end as a smoke test.

    Synthetic column-vector data are generated, a FlexCode model using the
    ``NN`` regression class is fit and tuned, and ``estimate_error`` and
    ``predict`` are called.  The test passes as long as none of these
    steps raises.
    """

    def draw_sample(size):
        # Synthetic data with p(z | x) = N(x, 1), returned as (size, 1)
        # column vectors.
        covariate = np.random.normal(0, 1, size)
        response = np.random.normal(covariate, 1, size)
        return covariate.reshape(-1, 1), response.reshape(-1, 1)

    # Samples are drawn in the order train, validation, test.
    train_x, train_z = draw_sample(10000)
    valid_x, valid_z = draw_sample(10000)
    test_x, test_z = draw_sample(10000)

    # Parameterize model
    model = flexcode.FlexCodeModel(
        NN,
        max_basis=31,
        basis_system="cosine",
        regression_params={"k": 20},
    )

    # Fit and tune model
    model.fit(train_x, train_z)
    model.tune(
        valid_x,
        valid_z,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Estimate CDE loss
    model.estimate_error(test_x, test_z)

    # The outputs are not inspected; unpacking only checks that predict
    # returns a pair.
    _cdes, _grid = model.predict(test_x, n_grid=200)

    # Reaching this line without an exception is the whole test.
    assert True
def inform(self, training_data):
    """Train a FlexZBoost (FlexCode + XGBoost) model and tune it.

    The colour features built from ``training_data`` are split into a
    training and a validation part with ``self.split_data``.  The model is
    fit on the training part; two one-dimensional grid searches on the
    validation part then pick the bump threshold and the sharpening
    parameter with the lowest ``cde_loss``.  The tuned model is stored on
    ``self.model`` and, when ``self.inform_options['save_train']`` is
    truthy, pickled to ``self.inform_options['modelfile']``.

    Parameters
    ----------
    training_data
        Object indexable by ``'redshift'``; it is also passed unchanged to
        ``make_color_data`` to build the feature array.
    """
    speczs = training_data['redshift']
    print("stacking some data...")
    color_data = make_color_data(training_data)
    train_dat, val_dat, train_sz, val_sz = self.split_data(
        color_data, speczs, self.trainfrac)
    print("read in training data")
    model = flexcode.FlexCodeModel(
        XGBoost,
        max_basis=self.max_basis,
        basis_system=self.basis_system,
        z_min=self.zmin,
        z_max=self.zmax,
        regression_params=self.regress_params,
    )
    print("fit the model...")
    model.fit(train_dat, train_sz)

    # --- grid search over the bump threshold -------------------------------
    bump_grid = np.linspace(self.bumpmin, self.bumpmax, self.nbump)
    print("finding best bump thresh...")
    bestloss = 9999
    # Start from the threshold the freshly built model already carries, so
    # that an empty grid, or a grid on which no loss beats the sentinel
    # (e.g. NaN losses), leaves the model as it was instead of raising
    # UnboundLocalError.
    # NOTE(review): presumably FlexCodeModel initialises this to None
    # (meaning "no bump removal") - confirm against the flexcode version used.
    bestbump = getattr(model, "bump_threshold", None)
    for bumpt in bump_grid:
        model.bump_threshold = bumpt
        model.tune(val_dat, val_sz)
        tmpcdes, z_grid = model.predict(val_dat, n_grid=self.nzbins)
        tmploss = cde_loss(tmpcdes, z_grid, val_sz)
        if tmploss < bestloss:
            bestloss = tmploss
            bestbump = bumpt
    model.bump_threshold = bestbump

    # --- grid search over the sharpening parameter -------------------------
    print("finding best sharpen parameter...")
    sharpen_grid = np.linspace(self.sharpmin, self.sharpmax, self.nsharp)
    bestloss = 9999
    bestsharp = 9999
    for sharp in sharpen_grid:
        model.sharpen_alpha = sharp
        # NOTE(review): this search evaluates on a fixed 301-point grid
        # while the bump search uses self.nzbins; kept as in the original -
        # confirm whether the difference is intentional.
        tmpcdes, z_grid = model.predict(val_dat, n_grid=301)
        tmploss = cde_loss(tmpcdes, z_grid, val_sz)
        if tmploss < bestloss:
            bestloss = tmploss
            bestsharp = sharp
    model.sharpen_alpha = bestsharp

    self.model = model
    if self.inform_options['save_train']:
        with open(self.inform_options['modelfile'], 'wb') as f:
            pickle.dump(file=f, obj=model,
                        protocol=pickle.HIGHEST_PROTOCOL)
def inform(self):
    """Train a FlexZBoost (FlexCode + XGBoost) model and tune it.

    The colour features built from ``self.training_data`` are split into a
    training and a validation part with ``self.partition_data``.  The
    model is fit on the training part; two one-dimensional grid searches
    on the validation part then pick the bump threshold and the sharpening
    parameter with the lowest ``cde_loss``.  The tuned model is stored on
    ``self.model``.
    """
    speczs = self.training_data['redshift']
    print("stacking some data...")
    color_data = make_color_data(self.training_data)
    train_data, val_data, train_sz, val_sz = self.partition_data(
        color_data, speczs, self.trainfrac)
    print("read in training data")
    model = flexcode.FlexCodeModel(
        XGBoost,
        max_basis=self.max_basis,
        basis_system=self.basis_system,
        z_min=self.zmin,
        z_max=self.zmax,
        regression_params=self.regression_params,
    )
    print("fit the model...")
    model.fit(train_data, train_sz)

    # --- grid search over the bump threshold -------------------------------
    bump_grid = np.linspace(self.bumpmin, self.bumpmax, self.nbump)
    print("finding best bump thresh...")
    bestloss = 9999
    # Start from the threshold the freshly built model already carries, so
    # that an empty grid, or a grid on which no loss beats the sentinel
    # (e.g. NaN losses), leaves the model as it was instead of raising
    # UnboundLocalError.
    # NOTE(review): presumably FlexCodeModel initialises this to None
    # (meaning "no bump removal") - confirm against the flexcode version used.
    bestbump = getattr(model, "bump_threshold", None)
    for bumpt in bump_grid:
        model.bump_threshold = bumpt
        model.tune(val_data, val_sz)
        tmpcdes, z_grid = model.predict(val_data, n_grid=self.nzbins)
        tmploss = cde_loss(tmpcdes, z_grid, val_sz)
        if tmploss < bestloss:
            bestloss = tmploss
            bestbump = bumpt
    model.bump_threshold = bestbump

    # --- grid search over the sharpening parameter -------------------------
    print("finding best sharpen parameter...")
    sharpen_grid = np.linspace(self.sharpmin, self.sharpmax, self.nsharp)
    bestloss = 9999
    bestsharp = 9999
    for sharp in sharpen_grid:
        model.sharpen_alpha = sharp
        # NOTE(review): this search evaluates on a fixed 301-point grid
        # while the bump search uses self.nzbins; kept as in the original -
        # confirm whether the difference is intentional.
        tmpcdes, z_grid = model.predict(val_data, n_grid=301)
        tmploss = cde_loss(tmpcdes, z_grid, val_sz)
        if tmploss < bestloss:
            bestloss = tmploss
            bestsharp = sharp
    model.sharpen_alpha = bestsharp

    self.model = model
def main(argv):
    """Train, tune and pickle a FlexZBoost model described by a YAML file.

    The configuration names the training file, the FlexCode/XGBoost model
    settings, and the grids for the bump-threshold and sharpening
    searches.  The model is fit on the training partition; each candidate
    bump threshold and sharpening value is then scored with ``cde_loss``
    on the validation partition.  Every score is printed and written to
    the file named by ``sharpen_bumpthresh_outputfile``, and the tuned
    model is pickled to ``output_file``.

    Parameters
    ----------
    argv : sequence of str
        Command-line style arguments; ``argv[1]`` is the path of the YAML
        configuration file.  Any other length prints a usage line and
        exits.

    Raises
    ------
    SystemExit
        When ``argv`` does not contain exactly two entries.
    KeyError
        When a configuration key read below is missing from the YAML
        mapping.
    """
    if len(argv) != 2:
        print("usage: train_FlexZBoost.py [yamlfile]")
        # Same exit status as the bare exit() used previously, without
        # relying on the builtin that only the site module installs.
        raise SystemExit

    infile = argv[1]
    with open(infile, "r") as infp:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6
        # and can construct arbitrary objects on older versions.  The
        # values read below are plain scalars and one mapping, which
        # safe_load handles.
        ymldata = yaml.safe_load(infp)

    output_file = ymldata['output_file']
    validationfile = ymldata['sharpen_bumpthresh_outputfile']
    trainfile = ymldata['training_file']
    max_basis = ymldata['max_basis_functions']
    basis_system = ymldata['basis_system']
    z_min = float(ymldata['z_min'])
    z_max = float(ymldata['z_max'])
    regression_params = ymldata['regression_params']
    bumpmin = float(ymldata['bump_thresh_grid_min'])
    bumpmax = float(ymldata['bump_thresh_grid_max'])
    bumpdelta = float(ymldata['bump_thresh_grid_delta'])
    train_frac = float(ymldata['training_fraction'])
    sharpmin = float(ymldata['sharpen_min'])
    sharpmax = float(ymldata['sharpen_max'])
    sharpdelta = float(ymldata['sharpen_delta'])

    # np.arange excludes the upper bound, so bumpmax itself is not tried.
    bump_grid = np.arange(bumpmin, bumpmax, bumpdelta)

    print("read in training data")
    fz_data, sz_data = read_in_data(trainfile)
    print("partition into train and validate")
    fz_train, fz_val, sz_train, sz_val = partition_data(
        fz_data, sz_data, train_frac)
    print(fz_train.shape[0])

    print("train the model")
    model = flexcode.FlexCodeModel(
        XGBoost,
        max_basis=max_basis,
        basis_system=basis_system,
        z_min=z_min,
        z_max=z_max,
        regression_params=regression_params,
    )
    model.fit(fz_train, sz_train)

    print("tune model, including bump trimming")
    # Passing bump_threshold_grid straight to model.tune() was not behaving
    # as expected, so the grid is searched by brute force here.  Every CDE
    # loss is logged so that the chosen threshold can be checked by eye
    # (sample runs picked the lowest threshold on the grid).
    with open(validationfile, "w") as outfp:
        outfp.write("CDE Loss values for bump thresh and sharpen grids\n")

        bestloss = 9999
        # Start from the threshold the freshly built model already
        # carries, so that an empty grid (bumpmin >= bumpmax), or losses
        # that never beat the sentinel, no longer raise UnboundLocalError.
        # NOTE(review): presumably FlexCodeModel initialises this to None
        # (meaning "no bump removal") - confirm against the flexcode
        # version used.
        bestbump = getattr(model, "bump_threshold", None)
        for bumpt in bump_grid:
            model.bump_threshold = bumpt
            model.tune(fz_val, sz_val)
            tmpcdes, z_grid = model.predict(fz_val, n_grid=300)
            tmploss = cde_loss(tmpcdes, z_grid, sz_val)
            if tmploss < bestloss:
                bestloss = tmploss
                bestbump = bumpt
            print(f"\n\n\nbumptrim val: {bumpt} cde loss: {tmploss}")
            outfp.write(f"bumptrim val: {bumpt} cde loss: {tmploss}\n")
        print(f"\n\n\nbest bump threshold: {bestbump} setting in model\n\n\n")
        model.bump_threshold = bestbump

        # Now do the same for the sharpening parameter.
        sharpen_grid = np.arange(sharpmin, sharpmax, sharpdelta)
        bestloss = 9999
        bestsharp = 9999
        for sharp in sharpen_grid:
            model.sharpen_alpha = sharp
            tmpcdes, z_grid = model.predict(fz_val, n_grid=301)
            tmploss = cde_loss(tmpcdes, z_grid, sz_val)
            if tmploss < bestloss:
                bestloss = tmploss
                bestsharp = sharp
            print(f"\n\n\nsharpparam: {sharp} cdeloss: {tmploss}")
            outfp.write(f"sharpparam: {sharp} cdeloss: {tmploss}\n")
        print(f"best sharpen param: {bestsharp}")
        model.sharpen_alpha = bestsharp

    # Saving the model; the context manager guarantees the pickle is
    # flushed and the handle closed before success is reported.
    with open(output_file, 'wb') as modelfp:
        pickle.dump(file=modelfp, obj=model,
                    protocol=pickle.HIGHEST_PROTOCOL)
    print(f"wrote out model file file to {output_file}")
    print(model.__dict__)
    print("finished")