def update(self, lambda_mults=[1.0], localizer=None, run_subset=None,
           use_approx=True, calc_only=False):
    """update the iES one GLM cycle

    Parameters
    ----------
    lambda_mults : list
        a list of lambda multipliers to test.  Each lambda mult value will
        require evaluating (a subset of) the parameter ensemble.  The list
        is only read, never modified.
    localizer : pyemu.Matrix
        a jacobian localizing matrix
    run_subset : int
        the number of realizations to test for each lambda_mult value.  For
        example, if run_subset = 30 and num_reals=100, the first 30
        realizations will be run (in parallel) for each lambda_mult value.
        Then the best lambda_mult is selected and the remaining 70
        realizations for that lambda_mult value are run (in parallel).
        Ignored (with a warning) when it is >= the number of active
        realizations.
    use_approx : bool
        a flag to use the MLE or MAP upgrade solution.  True indicates use
        MLE solution
    calc_only : bool
        a flag to calculate the upgrade matrix only (not run the ensemble).
        This is mostly for debugging and testing on travis.
        Default is False

    Raises
    ------
    Exception
        if every realization of a tested lambda exceeds ``drop_bad_reals``,
        or if dropping "bad" realizations would leave fewer than three.
        Other fatal conditions (not initialized, fewer than two active
        realizations, all runs failed, NaNs in an upgrade matrix or in the
        best mean/std) are reported through ``self.logger.lraise``.

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    ``>>>es.update(lambda_mults=[0.1,1.0,10.0],run_subset=30)``

    """
    # Check initialization before anything else: the statements below read
    # ensemble attributes and bump iter_num, neither of which should happen
    # for a smoother that was never initialized.
    if not self.__initialized:
        self.logger.lraise("must call initialize() before update()")

    if run_subset is not None:
        if run_subset >= self.obsensemble.shape[0]:
            self.logger.warn(
                "run_subset ({0}) >= num of active reals ({1})...ignoring ".format(
                    run_subset, self.obsensemble.shape[0]))
            run_subset = None

    self.iter_num += 1
    self.logger.log("iteration {0}".format(self.iter_num))
    self.logger.statement("{0} active realizations".format(
        self.obsensemble.shape[0]))
    if self.obsensemble.shape[0] < 2:
        self.logger.lraise(
            "at least active 2 realizations (really like 300) are needed to update")

    self.logger.log("calculate scaled delta obs")
    scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
    self.logger.log("calculate scaled delta obs")

    self.logger.log("calculate scaled delta par")
    scaled_delta_par = self._calc_delta_par(self.parensemble)
    self.logger.log("calculate scaled delta par")

    self.logger.log("calculate pseudo inv comps")
    u, s, v = scaled_delta_obs.pseudo_inv_components()
    self.logger.log("calculate pseudo inv comps")

    self.logger.log("calculate obs diff matrix")
    obs_diff = self.obscov_inv_sqrt * self._get_residual_matrix(
        self.obsensemble).T
    self.logger.log("calculate obs diff matrix")

    # here is the math part...calculate upgrade matrices
    paren_lam = []
    lam_vals = []
    for cur_lam_mult in lambda_mults:
        parensemble_cur_lam = self.parensemble.copy()

        cur_lam = self.current_lambda * cur_lam_mult
        lam_vals.append(cur_lam)
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))
        scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
        scaled_ident += s**2
        scaled_ident = scaled_ident.inv

        # build up this matrix as a single element so we can apply
        # localization
        self.logger.log("building upgrade_1 matrix")
        upgrade_1 = (-1.0 * (self.half_parcov_diag * scaled_delta_par) *
                     v * s * scaled_ident * u.T)
        self.logger.log("building upgrade_1 matrix")

        # apply localization
        if localizer is not None:
            self.logger.log("applying localization")
            upgrade_1.hadamard_product(localizer)
            self.logger.log("applying localization")

        # apply residual information
        self.logger.log("applying residuals")
        upgrade_1 *= obs_diff
        self.logger.log("applying residuals")

        self.logger.log("processing upgrade_1")
        upgrade_1 = upgrade_1.to_dataframe()
        upgrade_1.index.name = "parnme"
        upgrade_1 = upgrade_1.T
        upgrade_1.index = [int(i) for i in upgrade_1.index]
        upgrade_1.to_csv(self.pst.filename + ".upgrade_1.{0:04d}.csv".format(
            self.iter_num))
        if upgrade_1.isnull().values.any():
            self.logger.lraise("NaNs in upgrade_1")
        self.logger.log("processing upgrade_1")

        parensemble_cur_lam += upgrade_1

        # parameter-based upgrade portion
        if not use_approx and self.iter_num > 1:
            self.logger.log("building upgrade_2 matrix")
            par_diff = (self.parensemble -
                        self.parensemble_0.loc[self.parensemble.index, :]
                        ).as_pyemu_matrix().T
            x4 = self.Am.T * self.half_parcov_diag * par_diff
            x5 = self.Am * x4
            x6 = scaled_delta_par.T * x5
            x7 = v * scaled_ident * v.T * x6
            upgrade_2 = -1.0 * (self.half_parcov_diag *
                                scaled_delta_par * x7).to_dataframe()

            upgrade_2.index.name = "parnme"
            upgrade_2 = upgrade_2.T
            upgrade_2.to_csv(self.pst.filename + ".upgrade_2.{0:04d}.csv".format(
                self.iter_num))
            upgrade_2.index = [int(i) for i in upgrade_2.index]

            if upgrade_2.isnull().values.any():
                self.logger.lraise("NaNs in upgrade_2")

            parensemble_cur_lam += upgrade_2
            self.logger.log("building upgrade_2 matrix")
        parensemble_cur_lam.enforce(self.enforce_bounds)

        # stored as a plain DataFrame so the per-lambda ensembles can be
        # combined with pd.concat below
        paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :]))
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

    if calc_only:
        return

    # subset if needed
    # and combine lambda par ensembles into one par ensemble for evaluation
    if run_subset is not None and run_subset < self.parensemble.shape[0]:
        subset_idx = self.parensemble.iloc[:run_subset, :].index.values
        self.logger.statement(
            "subset idxs: " + ','.join([str(idx) for idx in subset_idx]))
        paren_combine = pd.concat(
            [pe.loc[subset_idx, :] for pe in paren_lam], ignore_index=True)
    else:
        subset_idx = self.parensemble.index.values
        paren_combine = pd.concat(paren_lam, ignore_index=True)

    eval_phrase = "evaluating ensembles for lambdas : {0}".format(
        ','.join(["{0:8.3E}".format(l) for l in lam_vals]))
    self.logger.log(eval_phrase)
    failed_runs, obsen_combine = self._calc_obs(paren_combine)
    self.logger.log(eval_phrase)
    paren_combine = None

    if failed_runs is not None and \
            len(failed_runs) == obsen_combine.shape[0]:
        self.logger.lraise("all runs failed - cannot continue")

    # unpack lambda obs ensembles from combined obs ensemble
    nrun_per_lam = self.obsensemble.shape[0]
    if run_subset is not None:
        nrun_per_lam = run_subset
    obsen_lam = []
    for i in range(len(lam_vals)):
        sidx = i * nrun_per_lam
        eidx = sidx + nrun_per_lam
        oe = ObservationEnsemble.from_dataframe(
            df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst)
        oe.index = subset_idx
        # check for failed runs in this set - drop failed runs from obs
        # ensembles.  failed_runs indexes rows of the combined ensemble, so
        # shift by sidx to get row positions within this lambda's slice.
        if failed_runs is not None:
            failed_runs_this = np.array(
                [f for f in failed_runs if f >= sidx and f < eidx]) - sidx
            if len(failed_runs_this) > 0:
                if len(failed_runs_this) == oe.shape[0]:
                    self.logger.warn(
                        "all runs failed for lambda {0}".format(lam_vals[i]))
                else:
                    self.logger.warn("{0} run failed for lambda {1}".format(
                        len(failed_runs_this), lam_vals[i]))
                # np.nan rather than np.NaN: the capitalised alias was
                # removed in NumPy 2.0
                oe.iloc[failed_runs_this, :] = np.nan
                oe = oe.dropna()

        # don't drop bad reals here, instead, mask bad reals in the lambda
        # selection and drop later
        obsen_lam.append(oe)
    obsen_combine = None

    # here is where we need to select out the "best" lambda par and obs
    # ensembles
    self.logger.statement("\n**************************")
    self.logger.statement(str(datetime.now()))
    self.logger.statement("total runs:{0}".format(self.total_runs))
    self.logger.statement("iteration: {0}".format(self.iter_num))
    self.logger.statement(
        "current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".format(
            self.current_lambda, self.last_best_mean, self.last_best_std))

    phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam]

    # The masked vectors are used ONLY to rank the lambdas.  phi_vecs itself
    # is left untouched so that it stays aligned, row for row, with the obs
    # ensembles; the actual dropping of bad realizations further down relies
    # on that alignment.
    ranking_phi_vecs = list(phi_vecs)
    if self.drop_bad_reals is not None:
        for i, pv in enumerate(phi_vecs):
            is_bad = (pv > self.drop_bad_reals) | np.isnan(pv)
            kept = pv[~is_bad]
            if len(kept) == 0:
                raise Exception(
                    "all realization for lambda {0} dropped as 'bad'".format(
                        lam_vals[i]))
            ranking_phi_vecs[i] = kept
    mean_std = [(pv.mean(), pv.std()) for pv in ranking_phi_vecs]

    update_pars = False
    update_lambda = False
    # accept a new best if its within 10%
    best_mean = self.last_best_mean * 1.1
    best_std = self.last_best_std * 1.1
    best_i = 0
    for i, (phi_mean, phi_std) in enumerate(mean_std):
        self.logger.statement(
            " tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".format(
                self.current_lambda * lambda_mults[i], phi_mean, phi_std))
        if phi_mean < best_mean:
            update_pars = True
            best_mean = phi_mean
            best_i = i
            if phi_std < best_std:
                update_lambda = True
                best_std = phi_std
    if np.isnan(best_mean):
        self.logger.lraise("best mean = NaN")
    if np.isnan(best_std):
        self.logger.lraise("best std = NaN")

    if not update_pars:
        self.current_lambda *= max(lambda_mults) * 10.0
        self.current_lambda = min(self.current_lambda, 100000)
        self.logger.statement(
            "not accepting iteration, increased lambda:{0}".format(
                self.current_lambda))
    else:
        self.parensemble = ParameterEnsemble.from_dataframe(
            df=paren_lam[best_i], pst=self.pst)
        if run_subset is not None:
            # only a subset was tested: evaluate the full ensemble for the
            # winning lambda
            failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
            if failed_runs is not None:
                self.logger.warn("dropping failed realizations")
                self.parensemble.loc[failed_runs, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[failed_runs, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
            best_mean = self.current_phi_vec.mean()
            best_std = self.current_phi_vec.std()
        else:
            self.obsensemble = obsen_lam[best_i]
            # reindex parensemble in case failed runs
            self.parensemble = ParameterEnsemble.from_dataframe(
                df=self.parensemble.loc[self.obsensemble.index],
                pst=self.pst)
            # unmasked vector: one entry per row of self.obsensemble, so the
            # positions found below map onto the right realizations
            self.current_phi_vec = phi_vecs[best_i]

        if self.drop_bad_reals is not None:
            drop_idx = np.argwhere(
                self.current_phi_vec > self.drop_bad_reals).flatten()
            run_ids = self.obsensemble.index.values
            drop_idx = run_ids[drop_idx]
            if len(drop_idx) > self.obsensemble.shape[0] - 3:
                raise Exception("dropped too many realizations as 'bad'")
            if len(drop_idx) > 0:
                self.logger.warn(
                    "{0} realizations dropped as 'bad' (indices :{1})".format(
                        len(drop_idx), ','.join([str(d) for d in drop_idx])))
                self.parensemble.loc[drop_idx, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[drop_idx, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

                self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
                best_mean = self.current_phi_vec.mean()
                best_std = self.current_phi_vec.std()

        self._phi_report(self.phi_csv, self.current_phi_vec,
                         self.current_lambda * lambda_mults[best_i])
        self._phi_report(self.phi_act_csv,
                         self.obsensemble.phi_vector.values,
                         self.current_lambda * lambda_mults[best_i])

        self.logger.statement(
            " best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".format(
                self.current_lambda * lambda_mults[best_i],
                best_mean, best_std))
        self.logger.statement(" actual mean phi: {0:15.6G}".format(
            float(self.current_actual_phi.mean())))
        self.last_best_mean = best_mean
        self.last_best_std = best_std

    if update_lambda:
        # be aggressive
        self.current_lambda *= (lambda_mults[best_i] * 0.75)
        # but don't let lambda get too small
        self.current_lambda = max(self.current_lambda, 0.00001)
        self.logger.statement("updating lambda: {0:15.6G}".format(
            self.current_lambda))

    self.logger.statement("**************************\n")
    self.parensemble.to_csv(
        self.pst.filename + self.paren_prefix.format(self.iter_num))
    self.obsensemble.to_csv(
        self.pst.filename + self.obsen_prefix.format(self.iter_num))
    if self.raw_sweep_out is not None:
        self.raw_sweep_out.to_csv(
            self.pst.filename + "_raw{0}".format(self.iter_num))
    self.logger.log("iteration {0}".format(self.iter_num))
def initialize(self, num_reals, init_lambda=None):
    """(re)initialize the process

    Draws the initial parameter and observation ensembles, saves them to
    csv, evaluates the initial parameter ensemble and sets the starting
    lambda.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Must be greater than 1.
    init_lambda : float
        the initial lambda to use.  If None, lambda is derived from the
        mean of the initial phi vector and the number of columns in the
        evaluated observation ensemble.

    Raises
    ------
    AssertionError
        if ``num_reals`` is not greater than 1
    NotImplementedError
        if ``self.restart`` is set - restarting is not supported
    """
    assert num_reals > 1

    # Fail before any side effects: restarting is not supported, and opening
    # the phi report below in 'w' mode would truncate an existing report.
    if self.restart:
        print("restarting...ignoring num_reals")
        raise NotImplementedError("restart is not implemented")

    # initialize the phi report csv; the handle stays open on the instance
    # for later phi reports
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write("iter_num,total_runs,lambda,min,max,mean,median,std,")
    self.phi_csv.write(','.join(
        ["{0:010d}".format(i + 1) for i in range(num_reals)]))
    self.phi_csv.write('\n')
    self.total_runs = 0

    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    self.num_reals = int(num_reals)
    self.parensemble_0 = ParameterEnsemble(self.pst)
    self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
    self.parensemble_0.enforce()
    self.parensemble = self.parensemble_0.copy()
    self.parensemble_0.to_csv(self.pst.filename +
                              self.paren_prefix.format(0))

    self.obsensemble_0 = ObservationEnsemble(self.pst)
    self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
    # save the base obsensemble
    self.obsensemble_0.to_csv(self.pst.filename +
                              self.obsen_prefix.format(-1))
    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

    # run the initial parameter ensemble
    self.obsensemble = self._calc_obs(self.parensemble)
    self.obsensemble.to_csv(self.pst.filename +
                            self.obsen_prefix.format(0))

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
    self._phi_report(self.current_phi_vec, 0.0)
    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))

    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it.  So the parameter scaling
    # matrix is the identity regardless of the approximate/full setting.
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True
def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset",
               parensemble=None, obsensemble=None, restart_obsensemble=None):
    """Initialize the iES process.  Depending on arguments, draws or loads
    initial parameter observations ensembles and runs the initial parameter
    ensemble

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Ignored if
        parensemble/obsensemble are not None
    init_lambda : float
        the initial lambda to use.  During subsequent updates, the lambda
        is updated according to upgrade success
    enforce_bounds : str
        how to enforce parameter bound transgression.  options are
        reset, drop, or None
    parensemble : pyemu.ParameterEnsemble or str
        a parameter ensemble or filename to use as the initial
        parameter ensemble.  If not None, then obsensemble must not be
        None
    obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as the initial
        observation ensemble.  If not None, then parensemble must
        not be None
    restart_obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as an
        evaluated observation ensemble.  If not None, this will skip the
        initial parameter ensemble evaluation - user beware!

    Raises
    ------
    Exception
        if parensemble/obsensemble is neither a filename nor an ensemble
        instance, or if every realization is dropped as 'bad'.  A missing
        ensemble file is reported through ``self.logger.lraise``.

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    """
    self.enforce_bounds = enforce_bounds
    self.total_runs = 0
    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")
        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter ensemble file itself (the original tested
            # the obsensemble argument here, which hides a missing
            # parensemble file and breaks when obsensemble is not a path)
            if not os.path.exists(parensemble):
                self.logger.lraise(
                    "can not find parensemble file: {0}".format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception("unrecognized arg type for parensemble, " +
                            "should be filename or ParameterEnsemble" +
                            ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise(
                    "can not find obsensemble file: {0}".format(obsensemble))
            # only the non-zero-weighted observation columns are kept
            df = pd.read_csv(obsensemble, index_col=0).loc[
                :, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception("unrecognized arg type for obsensemble, " +
                            "should be filename or ObservationEnsemble" +
                            ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        # the supplied ensembles define the realization count from here on
        num_reals = self.parensemble.shape[0]
        self.logger.log("initializing with existing ensembles")

    else:
        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

        # NOTE(review): logger.log is used in begin/end pairs everywhere
        # else in this file; the original had a third, unmatched call with
        # this phrase between enforce() and the csv write, which has been
        # removed - confirm against the Logger implementation.
        self.logger.log("initializing parensemble")
        self.parensemble_0 = pyemu.ParameterEnsemble.from_gaussian_draw(
            ParameterEnsemble(self.pst), self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))
        self.logger.log("initializing parensemble")

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = pyemu.ObservationEnsemble.from_id_gaussian_draw(
            ObservationEnsemble(self.pst), num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")

        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

    # initialize the phi report csvs; both handles stay open on the
    # instance for later phi reports
    header = "iter_num,total_runs,lambda,min,max,mean,median,std,"
    real_cols = ','.join(
        ["{0:010d}".format(i + 1) for i in range(num_reals)])
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write(header)
    self.phi_csv.write(real_cols)
    self.phi_csv.write('\n')
    self.phi_act_csv = open(self.pst.filename + ".iobj.actual.csv", 'w')
    self.phi_act_csv.write(header)
    self.phi_act_csv.write(real_cols)
    self.phi_act_csv.write('\n')

    if restart_obsensemble is not None:
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0
        self.parensemble.loc[failed_runs, :] = np.nan
        self.parensemble = self.parensemble.dropna()
        self.obsensemble.loc[failed_runs, :] = np.nan
        self.obsensemble = self.obsensemble.dropna()

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)

    if self.drop_bad_reals is not None:
        # positions of realizations whose phi exceeds the threshold,
        # translated to ensemble index labels
        drop_idx = np.argwhere(
            self.current_phi_vec > self.drop_bad_reals).flatten()
        run_ids = self.obsensemble.index.values
        drop_idx = run_ids[drop_idx]
        if len(drop_idx) == self.obsensemble.shape[0]:
            raise Exception("dropped all realizations as 'bad'")
        if len(drop_idx) > 0:
            self.logger.warn(
                "{0} realizations dropped as 'bad' (indices :{1})".format(
                    len(drop_idx), ','.join([str(d) for d in drop_idx])))
            self.parensemble.loc[drop_idx, :] = np.nan
            self.parensemble = self.parensemble.dropna()
            self.obsensemble.loc[drop_idx, :] = np.nan
            self.obsensemble = self.obsensemble.dropna()

            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)

    self._phi_report(self.phi_csv, self.current_phi_vec, 0.0)
    self._phi_report(self.phi_act_csv,
                     self.obsensemble.phi_vector.values, 0.0)

    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()
    self.logger.statement(
        "initial phi (mean, std): {0:15.6G},{1:15.6G}".format(
            self.last_best_mean, self.last_best_std))

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))
    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it.  So the parameter scaling
    # matrix is the identity whether or not use_approx_prior is set.
    if self.use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True
def update(self, lambda_mults=[1.0], localizer=None, run_subset=None, use_approx=True, calc_only=False): """update the iES one GLM cycle Parameters ---------- lambda_mults : list a list of lambda multipliers to test. Each lambda mult value will require evaluating (a subset of) the parameter ensemble. localizer : pyemu.Matrix a jacobian localizing matrix run_subset : int the number of realizations to test for each lambda_mult value. For example, if run_subset = 30 and num_reals=100, the first 30 realizations will be run (in parallel) for each lambda_mult value. Then the best lambda_mult is selected and the remaining 70 realizations for that lambda_mult value are run (in parallel). use_approx : bool a flag to use the MLE or MAP upgrade solution. True indicates use MLE solution calc_only : bool a flag to calculate the upgrade matrix only (not run the ensemble). This is mostly for debugging and testing on travis. Default is False Example ------- ``>>>import pyemu`` ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")`` ``>>>es.initialize(num_reals=100)`` ``>>>es.update(lambda_mults=[0.1,1.0,10.0],run_subset=30)`` """ #if not self.parensemble.istransformed: # self.parensemble._transform(inplace=False) if run_subset is not None: if run_subset >= self.obsensemble.shape[0]: self.logger.warn("run_subset ({0}) >= num of active reals ({1})...ignoring ".\ format(run_subset,self.obsensemble.shape[0])) run_subset = None self.iter_num += 1 mat_prefix = self.pst.filename.replace('.pst', '') + ".{0}".format( self.iter_num) self.logger.log("iteration {0}".format(self.iter_num)) self.logger.statement("{0} active realizations".format( self.obsensemble.shape[0])) if self.obsensemble.shape[0] < 2: self.logger.lraise( "at least active 2 realizations (really like 300) are needed to update" ) if not self._initialized: #raise Exception("must call initialize() before update()") self.logger.lraise("must call initialize() before update()") self.logger.log("calculate scaled delta obs") 
scaled_delta_obs = self._calc_delta_obs(self.obsensemble) self.logger.log("calculate scaled delta obs") self.logger.log("calculate scaled delta par") scaled_delta_par = self._calc_delta_par(self.parensemble) self.logger.log("calculate scaled delta par") self.logger.log("calculate pseudo inv comps") u, s, v = scaled_delta_obs.pseudo_inv_components( eigthresh=self.pst.svd_data.eigthresh) s.col_names = s.row_names self.logger.log("calculate pseudo inv comps") self.logger.log("calculate obs diff matrix") #obs_diff = self.obscov_inv_sqrt * self._get_residual_obs_matrix(self.obsensemble).T obs_diff = self.obscov_inv_sqrt * self.phi.get_residual_obs_matrix( self.obsensemble).T self.logger.log("calculate obs diff matrix") if self.save_mats: np.savetxt(mat_prefix + ".obs_diff.dat", scaled_delta_obs.x, fmt="%15.6e") np.savetxt(mat_prefix + ".par_diff.dat", scaled_delta_par.x, fmt="%15.6e") np.savetxt(mat_prefix + ".u.dat", u.x, fmt="%15.6e") np.savetxt(mat_prefix + ".s.dat", s.x, fmt="%15.6e") np.savetxt(mat_prefix + ".v.dat", v.x, fmt="%15.6e") # here is the math part...calculate upgrade matrices mean_lam, std_lam, paren_lam, obsen_lam = [], [], [], [] lam_vals = [] for ilam, cur_lam_mult in enumerate(lambda_mults): parensemble_cur_lam = self.parensemble.copy() #print(parensemble_cur_lam.isnull().values.any()) cur_lam = self.current_lambda * cur_lam_mult lam_vals.append(cur_lam) self.logger.log("calcs for lambda {0}".format(cur_lam_mult)) scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0) scaled_ident += s**2 scaled_ident = scaled_ident.inv # build up this matrix as a single element so we can apply # localization self.logger.log("building upgrade_1 matrix") upgrade_1 = -1.0 * (self.parcov_inv_sqrt * scaled_delta_par) *\ v * s * scaled_ident * u.T if self.save_mats: np.savetxt(mat_prefix + ".ivec.dat".format(self.iter_num), scaled_ident.x, fmt="%15.6e") self.logger.log("building upgrade_1 matrix") # apply localization if localizer is not None: self.logger.log("applying 
localization") upgrade_1.hadamard_product(localizer) self.logger.log("applying localization") # apply residual information self.logger.log("applying residuals") upgrade_1 *= obs_diff self.logger.log("applying residuals") self.logger.log("processing upgrade_1") if self.save_mats: np.savetxt(mat_prefix + ".upgrade_1.dat", upgrade_1.T.x, fmt="%15.6e") upgrade_1 = upgrade_1.to_dataframe() upgrade_1.index.name = "parnme" upgrade_1 = upgrade_1.T upgrade_1.index = [int(i) for i in upgrade_1.index] upgrade_1.to_csv(self.pst.filename+".upgrade_1.{0:04d}.csv".\ format(self.iter_num)) if upgrade_1.isnull().values.any(): self.logger.lraise("NaNs in upgrade_1") self.logger.log("processing upgrade_1") #print(upgrade_1.isnull().values.any()) #print(parensemble_cur_lam.index) #print(upgrade_1.index) parensemble_cur_lam += upgrade_1 # parameter-based upgrade portion if not use_approx and self.iter_num > 1: #if True: self.logger.log("building upgrade_2 matrix") par_diff = (self.parensemble - self.parensemble_0.loc[self.parensemble.index,:]).\ as_pyemu_matrix().T x4 = self.Am.T * self.parcov_inv_sqrt * par_diff x5 = self.Am * x4 x6 = scaled_delta_par.T * x5 x7 = v * scaled_ident * v.T * x6 ug2_mat = -1.0 * (self.parcov_inv_sqrt * scaled_delta_par * x7) upgrade_2 = ug2_mat.to_dataframe() upgrade_2.index.name = "parnme" upgrade_2 = upgrade_2.T upgrade_2.to_csv(self.pst.filename+".upgrade_2.{0:04d}.csv".\ format(self.iter_num)) upgrade_2.index = [int(i) for i in upgrade_2.index] if self.save_mats: np.savetxt(mat_prefix + ".scaled_par_resid.dat", par_diff.x, fmt="%15.6e") np.savetxt(mat_prefix + ".x4.dat", x4.x, fmt="%15.6e") np.savetxt(mat_prefix + ".x5.dat", x5.x, fmt="%15.6e") np.savetxt(mat_prefix + ".x6.dat", x6.x, fmt="%15.6e") np.savetxt(mat_prefix + ".x7.dat", x7.x, fmt="%15.6e") np.savetxt(mat_prefix + ".upgrade_2.dat", ug2_mat.T.x, fmt="%15.6e") if upgrade_2.isnull().values.any(): self.logger.lraise("NaNs in upgrade_2") parensemble_cur_lam += upgrade_2 self.logger.log("building 
upgrade_2 matrix") self.logger.log("enforcing bounds") parensemble_cur_lam.enforce(self.enforce_bounds) self.logger.log("enforcing bounds") self.logger.log("filling fixed parameters") #fill in fixed pars with initial values fi = parensemble_cur_lam.fixed_indexer li = parensemble_cur_lam.log_indexer log_values = self.pst.parameter_data.loc[:, "parval1"].copy() log_values.loc[li] = log_values.loc[li].apply(np.log10) fixed_vals = log_values.loc[fi] for fname, fval in zip(fixed_vals.index, fixed_vals.values): # if fname not in df.columns: # continue # print(fname) parensemble_cur_lam.loc[:, fname] = fval self.logger.log("filling fixed parameters") # this is for testing failed runs on upgrade testing # works with the 10par_xsec smoother test #parensemble_cur_lam.iloc[:,:] = -1000000.0 # some hackery - we lose track of the transform flag here, but just # know it is transformed. Need to create dataframe here because # pd.concat doesn't like par ensembles later paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :])) self.logger.log("calcs for lambda {0}".format(cur_lam_mult)) if calc_only: return # subset if needed # and combine lambda par ensembles into one par ensemble for evaluation if run_subset is not None and run_subset < self.parensemble.shape[0]: #subset_idx = ["{0:d}".format(i) for i in np.random.randint(0,self.parensemble.shape[0]-1,run_subset)] subset_idx = self.parensemble.iloc[:run_subset, :].index.values self.logger.statement("subset idxs: " + ','.join([str(s) for s in subset_idx])) # more tracking of transformed - just know it! Creating dataframes... 
paren_lam_subset = [pe.loc[subset_idx, :] for pe in paren_lam] paren_combine = pd.concat(paren_lam_subset, ignore_index=True) paren_lam_subset = None else: subset_idx = self.parensemble.index.values paren_combine = pd.concat(paren_lam, ignore_index=True) self.logger.log("evaluating ensembles for lambdas : {0}".\ format(','.join(["{0:8.3E}".format(l) for l in lam_vals]))) # back to par ensemble and know it is transformed paren_combine = ParameterEnsemble.from_dataframe(df=paren_combine, pst=self.pst, istransformed=True) failed_runs, obsen_combine = self._calc_obs(paren_combine) self.logger.log("evaluating ensembles for lambdas : {0}".\ format(','.join(["{0:8.3E}".format(l) for l in lam_vals]))) paren_combine = None if failed_runs is not None and len( failed_runs) == obsen_combine.shape[0]: self.logger.lraise("all runs failed - cannot continue") # unpack lambda obs ensembles from combined obs ensemble nrun_per_lam = self.obsensemble.shape[0] if run_subset is not None: nrun_per_lam = run_subset obsen_lam = [] for i in range(len(lam_vals)): sidx = i * nrun_per_lam eidx = sidx + nrun_per_lam oe = ObservationEnsemble.from_dataframe( df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst) oe.index = subset_idx # check for failed runs in this set - drop failed runs from obs ensembles if failed_runs is not None: failed_runs_this = np.array( [f for f in failed_runs if f >= sidx and f < eidx]) - sidx if len(failed_runs_this) > 0: if len(failed_runs_this) == oe.shape[0]: self.logger.warn( "all runs failed for lambda {0}".format( lam_vals[i])) else: self.logger.warn("{0} run failed for lambda {1}".\ format(len(failed_runs_this),lam_vals[i])) oe.iloc[failed_runs_this, :] = np.NaN oe = oe.dropna() paren_lam[i].iloc[failed_runs_this, :] = np.NaN paren_lam[i] = ParameterEnsemble.from_dataframe( df=paren_lam[i].dropna(), pst=self.pst) paren_lam[i].__instransformed = True # don't drop bad reals here, instead, mask bad reals in the lambda # selection and drop later # if 
self.drop_bad_reals is not None: # assert isinstance(drop_bad_reals, float) # drop_idx = np.argwhere(self.current_phi_vec > self.drop_bad_reals).flatten() # run_ids = self.obsensemble.index.values # drop_idx = run_ids[drop_idx] # if len(drop_idx) == self.obsensemble.shape[0]: # raise Exception("dropped all realizations as 'bad'") # if len(drop_idx) > 0: # self.logger.warn("{0} realizations dropped as 'bad' (indices :{1})". \ # format(len(drop_idx), ','.join([str(d) for d in drop_idx]))) # self.parensemble.loc[drop_idx, :] = np.NaN # self.parensemble = self.parensemble.dropna() # self.obsensemble.loc[drop_idx, :] = np.NaN # self.obsensemble = self.obsensemble.dropna() # # self.current_phi_vec = self._calc_phi_vec(self.obsensemble) obsen_lam.append(oe) obsen_combine = None # here is where we need to select out the "best" lambda par and obs # ensembles #phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam] #phi_vecs_reg = [self._calc_regul_phi_vec(paren) for paren in paren_lam] #if self.regul_factor > 0.0: # for i,(pv,prv) in enumerate(zip(phi_vecs,phi_vecs_reg)): # phi_vecs[i] = pv + (prv * self.regul_factor) self.logger.log("calc lambda phi vectors") phi_vecs = [ self.phi.get_meas_and_regul_phi(oe, pe.loc[oe.index, :]) for oe, pe in zip(obsen_lam, paren_lam) ] self.logger.log("calc lambda phi vectors") if self.drop_bad_reals is not None: for i, (meas_pv, regul_pv) in enumerate(phi_vecs): #for testing the drop_bad_reals functionality #pv[[0,3,7]] = self.drop_bad_reals + 1.0 regul_pv = regul_pv.copy() regul_pv[meas_pv > self.drop_bad_reals] = np.NaN regul_pv = regul_pv[~np.isnan(regul_pv)] meas_pv[meas_pv > self.drop_bad_reals] = np.NaN meas_pv = meas_pv[~np.isnan(meas_pv)] if len(meas_pv) == 0: #raise Exception("all realization for lambda {0} dropped as 'bad'".\ # format(lam_vals[i])) self.logger.warn( "all realizations for lambda {0} marked as 'bad'") meas_pv = np.zeros_like(obsen_lam[0].shape[0]) + 1.0e+30 regul_pv = np.zeros_like(obsen_lam[0].shape[0]) + 
1.0e+30 phi_vecs[i] = (meas_pv, regul_pv) mean_std_meas = [(pv[0].mean(), pv[0].std()) for pv in phi_vecs] mean_std_regul = [(pv[1].mean(), pv[1].std()) for pv in phi_vecs] update_pars = False update_lambda = False self.logger.statement("**************************") # self.logger.statement(str(datetime.now())) self.logger.statement("lambda testing summary") self.logger.statement("total runs:{0}".format(self.total_runs)) self.logger.statement("iteration: {0}".format(self.iter_num)) self.logger.statement("current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}". \ format(self.current_lambda, self.last_best_mean, self.last_best_std)) # accept a new best if its within 10% best_mean = self.last_best_mean * 1.1 best_std = self.last_best_std * 1.1 best_i = 0 for i, ((mm, ms), (rm, rs)) in enumerate(zip(mean_std_meas, mean_std_regul)): self.logger.statement( " tested lambda:{0:15.6G}, meas mean:{1:15.6G}, meas std:{2:15.6G}" .format(self.current_lambda * lambda_mults[i], mm, ms)) self.logger.statement("{0:30s}regul mean:{1:15.6G}, regul std:{2:15.6G}".\ format(' ',rm,rs)) m = mm + (self.regul_factor * rm) s = ms + (self.regul_factor * rs) if m < best_mean: update_pars = True best_mean = m best_i = i if s < best_std: update_lambda = True best_std = s if np.isnan(best_mean): self.logger.lraise("best mean = NaN") if np.isnan(best_std): self.logger.lraise("best std = NaN") if not update_pars: self.current_lambda *= max(lambda_mults) * 10.0 self.current_lambda = min(self.current_lambda, 100000) self.logger.statement("not accepting iteration, increased lambda:{0}".\ format(self.current_lambda)) else: #more transformation status hard coding - ugly self.parensemble = ParameterEnsemble.from_dataframe( df=paren_lam[best_i], pst=self.pst, istransformed=True) if run_subset is not None: failed_runs, self.obsensemble = self._calc_obs( self.parensemble) if failed_runs is not None: self.logger.warn("dropping failed realizations") self.parensemble.loc[failed_runs, :] = np.NaN 
self.parensemble = self.parensemble.dropna() self.obsensemble.loc[failed_runs, :] = np.NaN self.obsensemble = self.obsensemble.dropna() self.phi.update() best_mean = self.phi.comp_phi.mean() best_std = self.phi.comp_phi.std() else: self.obsensemble = obsen_lam[best_i] # reindex parensemble in case failed runs self.parensemble = ParameterEnsemble.from_dataframe( df=self.parensemble.loc[self.obsensemble.index], pst=self.pst, istransformed=self.parensemble.istransformed) self.phi.update() if self.drop_bad_reals is not None: # for testing drop_bad_reals functionality # self.current_phi_vec[::2] = self.drop_bad_reals + 1.0 #drop_idx = np.argwhere(self.current_phi_vec > self.drop_bad_reals).flatten() drop_idx = np.argwhere( self.phi.comp_phi > self.drop_bad_reals).flatten() run_ids = self.obsensemble.index.values drop_idx = run_ids[drop_idx] if len(drop_idx) > self.obsensemble.shape[0] - 3: raise Exception("dropped too many realizations as 'bad'") if len(drop_idx) > 0: self.logger.warn("{0} realizations dropped as 'bad' (indices :{1})". 
\ format(len(drop_idx), ','.join([str(d) for d in drop_idx]))) self.parensemble.loc[drop_idx, :] = np.NaN self.parensemble = self.parensemble.dropna() self.obsensemble.loc[drop_idx, :] = np.NaN self.obsensemble = self.obsensemble.dropna() self.phi.update() best_mean = self.phi.comp_phi.mean() best_std = self.phi.comp_phi.std() self.phi.report(cur_lam=self.current_lambda * lambda_mults[best_i]) self.logger.statement(" best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".\ format(self.current_lambda*lambda_mults[best_i], best_mean,best_std)) #self.logger.statement(" actual mean phi: {0:15.6G}".format(float(self.current_actual_phi.mean()))) self.last_best_mean = best_mean self.last_best_std = best_std if update_lambda: # be aggressive self.current_lambda *= (lambda_mults[best_i] * 0.75) # but don't let lambda get too small self.current_lambda = max(self.current_lambda, 0.00001) self.logger.statement("updating lambda: {0:15.6G}".\ format(self.current_lambda )) self.logger.statement("**************************\n") self.parensemble.to_csv(self.pst.filename+self.paren_prefix.\ format(self.iter_num)) self.obsensemble.to_csv(self.pst.filename+self.obsen_prefix.\ format(self.iter_num)) if self.raw_sweep_out is not None: self.raw_sweep_out.to_csv(self.pst.filename+"_sweepraw{0}.csv".\ format(self.iter_num)) self.logger.log("iteration {0}".format(self.iter_num))
def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset",
               parensemble=None, obsensemble=None, restart_obsensemble=None,
               regul_factor=0.0, use_approx_prior=True,
               build_empirical_prior=False):
    """(Re)initialize the iES process.

    Depending on arguments, draws or loads the initial parameter and
    observation ensembles and runs the initial parameter ensemble.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Ignored if
        parensemble/obsensemble are not None
    init_lambda : float
        the initial lambda to use.  During subsequent updates, the lambda
        is updated according to upgrade success.  If None, it is derived
        from the initial mean phi (following Chen and Oliver)
    enforce_bounds : str
        how to enforce parameter bound transgression.  options are
        reset, drop, or None
    parensemble : pyemu.ParameterEnsemble or str
        a parameter ensemble or filename to use as the initial
        parameter ensemble.  If not None, then obsensemble must not be
        None
    obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as the initial
        observation ensemble.  If not None, then parensemble must not be
        None
    restart_obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as an evaluated
        observation ensemble.  If not None, this will skip the initial
        parameter ensemble evaluation - user beware!
    regul_factor : float
        the regularization penalty fraction of the composite objective.
        The purist, MAP solution would be regul_factor = 1.0, yielding
        equal parts measurement and regularization to the composite
        objective function.  Default is 0.0, which means only seek to
        minimize the measurement objective function
    use_approx_prior : bool
        a flag to use the inverse, square root of the prior covariance
        matrix for scaling the upgrade calculation.  If True, this matrix
        is not used.  Default is True
    build_empirical_prior : bool
        flag to build the prior parameter covariance matrix from an
        existing parensemble.  If True and parensemble is None, an
        exception is raised

    Raises
    ------
    Exception
        if parensemble/obsensemble is neither a filename nor an ensemble
        instance, or if every realization is dropped as 'bad'
    AssertionError
        if the supplied ensembles (or the restart ensemble) do not match
        in size/columns

    Example
    -------
    ``>>>import pyemu``
    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``
    ``>>>es.initialize(num_reals=100)``

    """
    self.enforce_bounds = enforce_bounds
    self.regul_factor = float(regul_factor)
    self.total_runs = 0
    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    # NOTE(review): this scaling matrix is computed again further down,
    # after the (possibly empirical) prior has been set up.
    if use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
        self.parcov_inv_sqrt = 1.0
    else:
        self.logger.statement("using full parcov in solution")
        # Chen and Oliver use a low rank approx here, but so far,
        # I haven't needed it - not using enough parameters yet
        self.logger.log("forming inverse sqrt parcov matrix")
        self.parcov_inv_sqrt = self.parcov.inv.sqrt
        self.logger.log("forming inverse sqrt parcov matrix")

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")

        # --- parameter ensemble: filename or instance
        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter file itself (the original tested the
            # observation filename here, so a missing parameter file
            # slipped past this guard)
            if not os.path.exists(parensemble):
                self.logger.lraise("can not find parensemble file: {0}".
                                   format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception(
                "unrecognized arg type for parensemble, "
                "should be filename or ParameterEnsemble"
                ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        # --- observation ensemble: filename or instance
        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise("can not find obsensemble file: {0}".
                                   format(obsensemble))
            # only the non-zero-weighted observations are kept
            df = pd.read_csv(obsensemble,
                             index_col=0).loc[:, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception(
                "unrecognized arg type for obsensemble, "
                "should be filename or ObservationEnsemble"
                ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        # the supplied ensembles define the realization count
        num_reals = self.parensemble.shape[0]
        self.logger.log("initializing with existing ensembles")

        if build_empirical_prior:
            self.reset_parcov(self.parensemble.covariance_matrix())
            if self.save_mats:
                self.parcov.to_binary(self.pst.filename + ".empcov.jcb")
    else:
        if build_empirical_prior:
            self.logger.lraise(
                "can't use build_empirical_prior without parensemble...")
        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

        # logger.log calls come in start/stop pairs; the original issued
        # a third, unpaired call for this phrase
        self.logger.log("initializing parensemble")
        self.parensemble_0 = pyemu.ParameterEnsemble.from_gaussian_draw(
            self.pst, self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))
        self.logger.log("initializing parensemble")

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = pyemu.ObservationEnsemble.from_id_gaussian_draw(
            self.pst, num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")
        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

    if use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
        self.parcov_inv_sqrt = 1.0
    else:
        self.logger.statement("using full parcov in solution")
        # Chen and Oliver use a low rank approx here, but so far,
        # I haven't needed it - not using enough parameters yet
        self.logger.log("forming inverse sqrt parcov matrix")
        self.parcov_inv_sqrt = self.parcov.inv.sqrt
        self.logger.log("forming inverse sqrt parcov matrix")

    self.enforce_bounds = enforce_bounds

    if restart_obsensemble is not None:
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        if self.raw_sweep_out is not None:
            self.raw_sweep_out.to_csv(self.pst.filename + "_sweepraw0.csv")
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        # mark-then-dropna removes the failed rows from both ensembles;
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0
        self.parensemble.loc[failed_runs, :] = np.nan
        self.parensemble = self.parensemble.dropna()
        self.obsensemble.loc[failed_runs, :] = np.nan
        self.obsensemble = self.obsensemble.dropna()

    if not self.parensemble.istransformed:
        self.parensemble._transform(inplace=True)
    if not self.parensemble_0.istransformed:
        self.parensemble_0._transform(inplace=True)

    self.phi = Phi(self)

    if self.drop_bad_reals is not None:
        # realizations whose measurement phi exceeds the threshold
        drop_idx = np.argwhere(
            self.phi.meas_phi > self.drop_bad_reals).flatten()
        run_ids = self.obsensemble.index.values
        drop_idx = run_ids[drop_idx]
        if len(drop_idx) == self.obsensemble.shape[0]:
            raise Exception("dropped all realizations as 'bad'")
        if len(drop_idx) > 0:
            self.logger.warn(
                "{0} realizations dropped as 'bad' (indices :{1})".format(
                    len(drop_idx), ','.join([str(d) for d in drop_idx])))
            self.parensemble.loc[drop_idx, :] = np.nan
            self.parensemble = self.parensemble.dropna()
            self.obsensemble.loc[drop_idx, :] = np.nan
            self.obsensemble = self.obsensemble.dropna()
            self.phi.update()

    self.phi.report(cur_lam=0.0)
    self.last_best_mean = self.phi.comp_phi.mean()
    self.last_best_std = self.phi.comp_phi.std()

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))
    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components(
        eigthresh=self.pst.svd_data.eigthresh)
    self.Am = u * s.inv

    if self.save_mats:
        # common filename stem for the iteration-0 debugging matrices
        mat_prefix = self.pst.filename.replace(".pst", '.')
        np.savetxt(mat_prefix + "0.prior_par_diff.dat",
                   self.delta_par_prior.x, fmt="%15.6e")
        np.savetxt(mat_prefix + "0.am_u.dat", u.x, fmt="%15.6e")
        np.savetxt(mat_prefix + "0.am_v.dat", v.x, fmt="%15.6e")
        np.savetxt(mat_prefix + "0.am_s_inv.dat", s.inv.as_2d,
                   fmt="%15.6e")
        np.savetxt(mat_prefix + "0.am.dat", self.Am.x, fmt="%15.6e")

    # NOTE(review): update() in this file tests ``self.__initialized``
    # (double underscore, name-mangled) - confirm which flag is
    # authoritative.
    self._initialized = True
def update(self, lambda_mults=[1.0], localizer=None, run_subset=None,
           use_approx=True):
    """Update the iES one GLM cycle.

    Parameters
    ----------
    lambda_mults : list
        a list of lambda multipliers to test.  Each lambda mult value
        will require evaluating (a subset of) the parameter ensemble.
        The list is only read, never modified.
    localizer : pyemu.Matrix
        a jacobian localizing matrix
    run_subset : int
        the number of realizations to test for each lambda_mult value.
        The first ``run_subset`` realizations are evaluated for each
        multiplier; if an upgrade is accepted, the full ensemble for the
        best multiplier is then evaluated.  Ignored (with a warning) if
        it is >= the number of active realizations.
    use_approx : bool
        a flag to use the MLE or MAP upgrade solution.  True indicates
        use MLE solution

    Notes
    -----
    Writes the upgrade matrices, the current parameter and observation
    ensembles (and the raw sweep output, if any) to files prefixed with
    ``self.pst.filename``.

    """
    # fail fast: the original only checked this after iter_num had been
    # incremented and self.obsensemble dereferenced
    if not self.__initialized:
        self.logger.lraise("must call initialize() before update()")

    if run_subset is not None:
        if run_subset >= self.obsensemble.shape[0]:
            self.logger.warn(
                "run_subset ({0}) >= num of active reals ({1})...ignoring ".
                format(run_subset, self.obsensemble.shape[0]))
            run_subset = None

    self.iter_num += 1
    self.logger.log("iteration {0}".format(self.iter_num))
    self.logger.statement("{0} active realizations".format(
        self.obsensemble.shape[0]))
    if self.obsensemble.shape[0] < 2:
        self.logger.lraise(
            "at least active 2 realizations (really like 300) are needed to update"
        )

    self.logger.log("calculate scaled delta obs")
    scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
    self.logger.log("calculate scaled delta obs")

    self.logger.log("calculate scaled delta par")
    scaled_delta_par = self._calc_delta_par(self.parensemble)
    self.logger.log("calculate scaled delta par")

    self.logger.log("calculate pseudo inv comps")
    u, s, v = scaled_delta_obs.pseudo_inv_components()
    self.logger.log("calculate pseudo inv comps")

    self.logger.log("calculate obs diff matrix")
    obs_diff = self.obscov_inv_sqrt * self._get_residual_matrix(
        self.obsensemble).T
    self.logger.log("calculate obs diff matrix")

    # here is the math part...calculate upgrade matrices
    paren_lam = []
    lam_vals = []
    for cur_lam_mult in lambda_mults:
        parensemble_cur_lam = self.parensemble.copy()
        cur_lam = self.current_lambda * cur_lam_mult
        lam_vals.append(cur_lam)
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

        # ((1 + lambda) * I + s^2)^-1
        scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
        scaled_ident += s**2
        scaled_ident = scaled_ident.inv

        # build up this matrix as a single element so we can apply
        # localization
        self.logger.log("building upgrade_1 matrix")
        upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_delta_par) * \
            v * s * scaled_ident * u.T
        self.logger.log("building upgrade_1 matrix")

        # apply localization
        if localizer is not None:
            self.logger.log("applying localization")
            # NOTE(review): the return value is discarded - confirm that
            # hadamard_product modifies upgrade_1 in place
            upgrade_1.hadamard_product(localizer)
            self.logger.log("applying localization")

        # apply residual information
        self.logger.log("applying residuals")
        upgrade_1 *= obs_diff
        self.logger.log("applying residuals")

        self.logger.log("processing upgrade_1")
        upgrade_1 = upgrade_1.to_dataframe()
        upgrade_1.index.name = "parnme"
        upgrade_1 = upgrade_1.T
        upgrade_1.index = [int(i) for i in upgrade_1.index]
        upgrade_1.to_csv(self.pst.filename + ".upgrade_1.{0:04d}.csv".
                         format(self.iter_num))
        if upgrade_1.isnull().values.any():
            self.logger.lraise("NaNs in upgrade_1")
        self.logger.log("processing upgrade_1")

        parensemble_cur_lam += upgrade_1

        # parameter-based upgrade portion
        if not use_approx and self.iter_num > 1:
            self.logger.log("building upgrade_2 matrix")
            par_diff = (self.parensemble -
                        self.parensemble_0.loc[self.parensemble.index, :]).\
                as_pyemu_matrix().T
            x4 = self.Am.T * self.half_parcov_diag * par_diff
            x5 = self.Am * x4
            x6 = scaled_delta_par.T * x5
            x7 = v * scaled_ident * v.T * x6
            upgrade_2 = -1.0 * (self.half_parcov_diag *
                                scaled_delta_par * x7).to_dataframe()
            upgrade_2.index.name = "parnme"
            upgrade_2 = upgrade_2.T
            upgrade_2.to_csv(self.pst.filename + ".upgrade_2.{0:04d}.csv".
                             format(self.iter_num))
            upgrade_2.index = [int(i) for i in upgrade_2.index]
            if upgrade_2.isnull().values.any():
                self.logger.lraise("NaNs in upgrade_2")
            parensemble_cur_lam += upgrade_2
            self.logger.log("building upgrade_2 matrix")

        parensemble_cur_lam.enforce(self.enforce_bounds)

        # this is for testing failed runs on upgrade testing
        # works with the 10par_xsec smoother test
        #parensemble_cur_lam.iloc[:,:] = -1000000.0

        paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :]))
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

    # subset if needed
    # and combine lambda par ensembles into one par ensemble for evaluation
    if run_subset is not None and run_subset < self.parensemble.shape[0]:
        subset_idx = self.parensemble.iloc[:run_subset, :].index.values
        self.logger.statement(
            "subset idxs: " + ','.join(str(idx) for idx in subset_idx))
        paren_combine = pd.concat(
            [pe.loc[subset_idx, :] for pe in paren_lam], ignore_index=True)
    else:
        subset_idx = self.parensemble.index.values
        paren_combine = pd.concat(paren_lam, ignore_index=True)

    eval_phrase = "evaluating ensembles for lambdas : {0}".format(
        ','.join(["{0:8.3E}".format(l) for l in lam_vals]))
    self.logger.log(eval_phrase)
    failed_runs, obsen_combine = self._calc_obs(paren_combine)
    self.logger.log(eval_phrase)
    paren_combine = None

    if failed_runs is not None and \
            len(failed_runs) == obsen_combine.shape[0]:
        self.logger.lraise("all runs failed - cannot continue")

    # unpack lambda obs ensembles from combined obs ensemble
    nrun_per_lam = self.obsensemble.shape[0]
    if run_subset is not None:
        nrun_per_lam = run_subset
    obsen_lam = []
    for i, lam_val in enumerate(lam_vals):
        sidx = i * nrun_per_lam
        eidx = sidx + nrun_per_lam
        oe = ObservationEnsemble.from_dataframe(
            df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst)
        oe.index = subset_idx
        # check for failed runs in this set - drop failed runs from obs
        # ensembles
        if failed_runs is not None:
            # positions of this lambda's failed runs, relative to sidx
            failed_runs_this = np.array(
                [f for f in failed_runs if sidx <= f < eidx]) - sidx
            if len(failed_runs_this) > 0:
                if len(failed_runs_this) == oe.shape[0]:
                    self.logger.warn(
                        "all runs failed for lambda {0}".format(lam_val))
                else:
                    self.logger.warn("{0} run failed for lambda {1}".
                                     format(len(failed_runs_this), lam_val))
                # np.nan: the np.NaN alias was removed in NumPy 2.0
                oe.iloc[failed_runs_this, :] = np.nan
                oe = oe.dropna()
        obsen_lam.append(oe)
    obsen_combine = None

    # here is where we need to select out the "best" lambda par and obs
    # ensembles
    self.logger.statement("\n**************************")
    self.logger.statement(str(datetime.now()))
    self.logger.statement("total runs:{0}".format(self.total_runs))
    self.logger.statement("iteration: {0}".format(self.iter_num))
    self.logger.statement(
        "current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
        format(self.current_lambda, self.last_best_mean,
               self.last_best_std))
    phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam]
    mean_std = [(pv.mean(), pv.std()) for pv in phi_vecs]
    update_pars = False
    update_lambda = False
    # accept a new best if its within 10%
    best_mean = self.last_best_mean * 1.1
    best_std = self.last_best_std * 1.1
    best_i = 0
    for i, (phi_mean, phi_std) in enumerate(mean_std):
        self.logger.statement(
            " tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[i], phi_mean,
                   phi_std))
        if phi_mean < best_mean:
            update_pars = True
            best_mean = phi_mean
            best_i = i
            if phi_std < best_std:
                update_lambda = True
                best_std = phi_std

    if np.isnan(best_mean):
        self.logger.lraise("best mean = NaN")
    if np.isnan(best_std):
        self.logger.lraise("best std = NaN")

    if not update_pars:
        # nothing beat the previous iteration: back off with a larger
        # lambda, capped at 100000
        self.current_lambda *= max(lambda_mults) * 10.0
        self.current_lambda = min(self.current_lambda, 100000)
        self.logger.statement(
            "not accepting iteration, increased lambda:{0}".
            format(self.current_lambda))
    else:
        self.parensemble = ParameterEnsemble.from_dataframe(
            df=paren_lam[best_i], pst=self.pst)
        if run_subset is not None:
            # only a subset was tested - evaluate the accepted ensemble
            failed_runs, self.obsensemble = self._calc_obs(
                self.parensemble)
            if failed_runs is not None:
                self.logger.warn("dropping failed realizations")
                self.parensemble = self.parensemble.drop(failed_runs)
                self.obsensemble = self.obsensemble.drop(failed_runs)
            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
            self._phi_report(self.current_phi_vec,
                             self.current_lambda * lambda_mults[best_i])
            best_mean = self.current_phi_vec.mean()
            best_std = self.current_phi_vec.std()
        else:
            self.obsensemble = obsen_lam[best_i]
            # reindex parensemble in case failed runs
            self.parensemble = ParameterEnsemble.from_dataframe(
                df=self.parensemble.loc[self.obsensemble.index],
                pst=self.pst)
            self._phi_report(phi_vecs[best_i],
                             self.current_lambda * lambda_mults[best_i])
            self.current_phi_vec = phi_vecs[best_i]

        self.logger.statement(
            " best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[best_i],
                   best_mean, best_std))
        self.last_best_mean = best_mean
        self.last_best_std = best_std

    if update_lambda:
        # be aggressive
        self.current_lambda *= (lambda_mults[best_i] * 0.75)
        # but don't let lambda get too small
        self.current_lambda = max(self.current_lambda, 0.001)
        self.logger.statement("updating lambda: {0:15.6G}".
                              format(self.current_lambda))

    self.logger.statement("**************************\n")
    self.parensemble.to_csv(self.pst.filename + self.paren_prefix.
                            format(self.iter_num))
    self.obsensemble.to_csv(self.pst.filename + self.obsen_prefix.
                            format(self.iter_num))
    if self.raw_sweep_out is not None:
        self.raw_sweep_out.to_csv(self.pst.filename + "_raw{0}".
                                  format(self.iter_num))
    self.logger.log("iteration {0}".format(self.iter_num))
def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset",
               parensemble=None, obsensemble=None, restart_obsensemble=None):
    """(Re)initialize the smoother process.

    Draws or loads the initial parameter and observation ensembles,
    evaluates (or loads) the initial observation ensemble, reports the
    initial phi and sets the starting lambda.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Not used for drawing when
        both parensemble and obsensemble are supplied
    init_lambda : float
        the initial lambda to use.  If None, it is derived from the
        initial mean phi (following Chen and Oliver)
    enforce_bounds : str
        passed to the parameter ensemble ``enforce`` call and stored on
        the instance
    parensemble : ParameterEnsemble or str
        a parameter ensemble or csv filename to use as the initial
        parameter ensemble.  Only used if obsensemble is also supplied
    obsensemble : ObservationEnsemble or str
        an observation ensemble or csv filename to use as the initial
        observation ensemble.  Only used if parensemble is also supplied
    restart_obsensemble : ObservationEnsemble or str
        an already-evaluated observation ensemble; if supplied, the
        initial parameter ensemble is not run

    Raises
    ------
    Exception
        if parensemble/obsensemble is neither a filename nor an ensemble
        instance
    AssertionError
        if the supplied ensembles (or the restart ensemble) do not match
        in size/columns

    """
    self.enforce_bounds = enforce_bounds
    self.total_runs = 0
    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")

        # --- parameter ensemble: filename or instance
        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter file itself (the original tested the
            # observation filename here)
            if not os.path.exists(parensemble):
                self.logger.lraise("can not find parensemble file: {0}".
                                   format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception(
                "unrecognized arg type for parensemble, "
                "should be filename or ParameterEnsemble"
                ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        # --- observation ensemble: filename or instance
        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise("can not find obsensemble file: {0}".
                                   format(obsensemble))
            # only the non-zero-weighted observations are kept
            df = pd.read_csv(obsensemble,
                             index_col=0).loc[:, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception(
                "unrecognized arg type for obsensemble, "
                "should be filename or ObservationEnsemble"
                ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        self.logger.log("initializing with existing ensembles")
    else:
        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

        # logger.log calls come in start/stop pairs; the original issued
        # a third, unpaired call for this phrase
        self.logger.log("initializing parensemble")
        self.parensemble_0 = ParameterEnsemble(self.pst)
        self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))
        self.logger.log("initializing parensemble")

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = ObservationEnsemble(self.pst)
        self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")
        self.logger.log(
            "initializing smoother with {0} realizations".format(num_reals))

    # initialize the phi report csv.  This is done once the ensembles
    # exist so the per-realization header columns follow the actual
    # ensemble size; num_reals keeps its default of 1 when ensembles are
    # passed in.
    previous_csv = getattr(self, "phi_csv", None)
    if previous_csv is not None and not previous_csv.closed:
        # re-initialization: release the handle from the previous run
        previous_csv.close()
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write(
        "iter_num,total_runs,lambda,min,max,mean,median,std,")
    self.phi_csv.write(','.join(
        "{0:010d}".format(i + 1)
        for i in range(self.parensemble_0.shape[0])))
    self.phi_csv.write('\n')

    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()
    self.enforce_bounds = enforce_bounds

    if restart_obsensemble is not None:
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        self.parensemble = self.parensemble.drop(failed_runs)
        self.obsensemble = self.obsensemble.drop(failed_runs)

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
    self._phi_report(self.current_phi_vec, 0.0)
    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()
    self.logger.statement(
        "initial phi (mean, std): {0:15.6G},{1:15.6G}".
        format(self.last_best_mean, self.last_best_std))

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))
    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    # if using the approximate form of the algorithm, let
    # the parameter scaling matrix be the identity matrix
    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it
    if self.use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
    # both settings currently use a scalar 1.0 for the scaling matrix
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, _ = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True