def initialize(self, num_reals):
    """(Re)initialize the process with freshly drawn ensembles.

    Draws a parameter ensemble using ``self.parcov`` and a base
    observation ensemble using ``self.obscov``, stores ``.inv.sqrt`` of
    the (diagonal of the) parameter covariance, and computes the prior
    parameter deviations through ``self._calc_delta_par()``.

    Parameters
    ----------
    num_reals : int
        number of realizations to draw.  ``int(num_reals)`` is stored on
        the instance; the value as passed is forwarded to the ensemble
        ``draw`` calls.
    """
    self.num_reals = int(num_reals)

    # parameter ensemble drawn with the parameter covariance
    self.parensemble = ParameterEnsemble(self.pst)
    self.parensemble.draw(cov=self.parcov, num_reals=num_reals)

    # base observation ensemble; the working ensemble starts as a copy
    self.obsensemble_0 = ObservationEnsemble(self.pst)
    self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
    self.obsensemble = self.obsensemble_0.copy()

    # only the diagonal of parcov is used: a non-diagonal matrix is first
    # reduced to a diagonal Cov built from its own diagonal entries
    diag_parcov = self.parcov
    if not diag_parcov.isdiagonal:
        diag_parcov = Cov(x=np.diag(self.parcov.x),
                          names=self.parcov.col_names,
                          isdiagonal=True)
    self.half_parcov_diag = diag_parcov.inv.sqrt

    # no corresponding matrix is built for obscov in this method
    self.delta_par_prior = self._calc_delta_par()
    self.__initialized = True
def __init__(self, **kwargs):
    """Construct a (deprecated) MonteCarlo instance.

    Issues a ``PyemuWarning`` announcing the deprecation, forwards all
    keyword arguments to the parent initializer, and then attaches a
    ``ParameterEnsemble`` and an ``ObservationEnsemble`` built from
    ``self.pst``.

    Parameters
    ----------
    **kwargs : dict
        passed through unchanged to the parent class ``__init__``.

    Raises
    ------
    AssertionError
        if ``self.pst`` is None after the parent initializer has run.
    """
    deprecation_msg = ("pyemu.MonteCarlo class is deprecated. "
                       "Please use the ensemble classes directly")
    warnings.warn(deprecation_msg, PyemuWarning)

    super(MonteCarlo, self).__init__(**kwargs)

    # the ensembles below cannot be built without a control file
    assert self.pst is not None, \
        "monte carlo requires a pest control file"

    control_file = self.pst
    self.parensemble = ParameterEnsemble(pst=control_file)
    self.obsensemble = ObservationEnsemble(pst=control_file)
def draw(self, num_reals=1, par_file=None, obs=False,
         enforce_bounds=False, cov=None, how="gaussian"):
    """Draw stochastic realizations of parameters and optionally observations.

    Parameters
    ----------
    num_reals : int
        number of realizations to generate
    par_file : str
        parameter file to use as mean values; when given it is handed to
        ``self.pst.parrep`` before drawing
    obs : bool
        also draw a realization of measurement noise for the observations
        (using ``self.obscov``)
    enforce_bounds : bool
        forwarded to the parameter ensemble ``draw`` call
    cov : Cov
        covariance matrix to draw parameters with.  If None,
        ``self.parcov`` is used.  Cannot be combined with
        ``how="uniform"``
    how : str
        type of distribution.  Must be in ["gaussian","uniform"]
        (case and surrounding whitespace are ignored)

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if ``how`` is not a supported distribution or ``cov`` is not a Cov
    Exception
        if ``cov`` is passed together with ``how="uniform"``
    """
    if par_file is not None:
        self.pst.parrep(par_file)

    how = how.lower().strip()
    assert how in ("gaussian", "uniform")

    if cov is None:
        # fall back to the parameter covariance held by this instance
        cov = self.parcov
    else:
        assert isinstance(cov, Cov)
        if how == "uniform":
            raise Exception("MonteCarlo.draw() error: 'how'='uniform',"
                            " 'cov' arg cannot be passed")

    # start from fresh ensembles on every call
    self.parensemble = ParameterEnsemble(pst=self.pst)
    self.obsensemble = ObservationEnsemble(pst=self.pst)

    # the same phrase is logged before and after each draw
    par_phrase = "generating {0:d} parameter realizations".format(num_reals)
    self.log(par_phrase)
    self.parensemble.draw(cov, num_reals=num_reals, how=how,
                          enforce_bounds=enforce_bounds)
    self.log(par_phrase)

    if not obs:
        return

    obs_phrase = "generating {0:d} observation realizations".format(
        num_reals)
    self.log(obs_phrase)
    self.obsensemble.draw(self.obscov, num_reals=num_reals)
    self.log(obs_phrase)
def initialize(self, num_reals):
    """(Re)initialize the process by drawing new ensembles.

    A parameter ensemble is drawn using ``self.parcov`` and a base
    observation ensemble using ``self.obscov``.  ``.inv.sqrt`` of the
    (diagonal of the) parameter covariance is stored as
    ``self.half_parcov_diag`` and the prior parameter deviations are
    computed with ``self._calc_delta_par()``.

    Parameters
    ----------
    num_reals : int
        number of realizations to draw.  ``int(num_reals)`` is stored as
        ``self.num_reals``; the ensemble ``draw`` calls receive the value
        exactly as passed.
    """
    self.num_reals = int(num_reals)

    self.parensemble = ParameterEnsemble(self.pst)
    self.parensemble.draw(cov=self.parcov, num_reals=num_reals)

    # keep the drawn observation ensemble as the base and work on a copy
    self.obsensemble_0 = ObservationEnsemble(self.pst)
    self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
    self.obsensemble = self.obsensemble_0.copy()

    # a full parcov is collapsed to a diagonal Cov before .inv.sqrt
    if self.parcov.isdiagonal:
        scaling_cov = self.parcov
    else:
        scaling_cov = Cov(x=np.diag(self.parcov.x),
                          names=self.parcov.col_names,
                          isdiagonal=True)
    self.half_parcov_diag = scaling_cov.inv.sqrt

    # obscov gets no equivalent treatment in this method
    self.delta_par_prior = self._calc_delta_par()
    self.__initialized = True
def draw(self, num_reals=1, par_file=None, obs=False,
         enforce_bounds=False, cov=None, how="gaussian"):
    """Draw stochastic realizations of parameters and optionally observations.

    Parameters
    ----------
    num_reals : int
        number of realizations to generate
    par_file : str
        parameter file to use as mean values; when given it is handed to
        ``self.pst.parrep`` before drawing
    obs : bool
        also draw a realization of measurement noise for the observations
        (using ``self.obscov``)
    enforce_bounds : bool
        when true, ``self.parensemble.enforce()`` is called after the
        parameter draw
    cov : Cov
        covariance matrix to draw parameters with.  If None,
        ``self.parcov`` is used.  Cannot be combined with
        ``how="uniform"``
    how : str
        type of distribution.  Must be in ["gaussian","uniform"]
        (case and surrounding whitespace are ignored)

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if ``how`` is not a supported distribution or ``cov`` is not a Cov
    Exception
        if ``cov`` is passed together with ``how="uniform"``
    """
    if par_file is not None:
        self.pst.parrep(par_file)

    how = how.lower().strip()
    assert how in ("gaussian", "uniform")

    if cov is None:
        # fall back to the parameter covariance held by this instance
        cov = self.parcov
    else:
        assert isinstance(cov, Cov)
        if how == "uniform":
            raise Exception("MonteCarlo.draw() error: 'how'='uniform',"
                            " 'cov' arg cannot be passed")

    # start from fresh ensembles on every call
    self.parensemble = ParameterEnsemble(pst=self.pst)
    self.obsensemble = ObservationEnsemble(pst=self.pst)

    # the same phrase is logged before and after each draw
    par_phrase = "generating {0:d} parameter realizations".format(num_reals)
    self.log(par_phrase)
    self.parensemble.draw(cov, num_reals=num_reals, how=how)
    if enforce_bounds:
        self.parensemble.enforce()
    self.log(par_phrase)

    if not obs:
        return

    obs_phrase = "generating {0:d} observation realizations".format(
        num_reals)
    self.log(obs_phrase)
    self.obsensemble.draw(self.obscov, num_reals=num_reals)
    self.log(obs_phrase)
def initialize(
        self,
        num_reals=1,
        init_lambda=None,
        enforce_bounds="reset",
        parensemble=None,
        obsensemble=None,
        restart_obsensemble=None,
):
    """Initialize the iES process.

    Depending on arguments, draws or loads the initial parameter and
    observation ensembles and runs the initial parameter ensemble.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Ignored if
        parensemble/obsensemble are not None (the number of rows of the
        supplied parameter ensemble is used instead)
    init_lambda : float
        the initial lambda to use.  If None, it is derived from the
        initial mean phi and the number of observation ensemble columns.
        During subsequent updates, the lambda is updated according to
        upgrade success
    enforce_bounds : str
        how to enforce parameter bound transgression.  options are
        reset, drop, or None
    parensemble : pyemu.ParameterEnsemble or str
        a parameter ensemble or filename to use as the initial
        parameter ensemble.  If not None, then obsensemble must not be
        None
    obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as the initial
        observation ensemble.  If not None, then parensemble must not be
        None
    restart_obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as an evaluated
        observation ensemble.  If not None, this will skip the initial
        parameter ensemble evaluation - user beware!

    Raises
    ------
    Exception
        if parensemble/obsensemble is neither a filename nor an ensemble
        instance, or if every realization is dropped as 'bad'
    AssertionError
        if the supplied ensembles (or the restart ensemble) do not agree
        in shape/columns with each other

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    """
    self.enforce_bounds = enforce_bounds
    self.total_runs = 0
    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")

        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter ensemble file itself (this used to test
            # the obsensemble argument by mistake)
            if not os.path.exists(parensemble):
                self.logger.lraise("can not find parensemble file: {0}".
                                   format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception("unrecognized arg type for parensemble, "
                            "should be filename or ParameterEnsemble"
                            ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise("can not find obsensemble file: {0}".
                                   format(obsensemble))
            # only the non-zero-weighted observation columns are kept
            df = pd.read_csv(obsensemble,
                             index_col=0).loc[:, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception("unrecognized arg type for obsensemble, "
                            "should be filename or ObservationEnsemble"
                            ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        # the supplied ensembles define the realization count used below
        num_reals = self.parensemble.shape[0]
        self.logger.log("initializing with existing ensembles")

    else:
        self.logger.log(
            "initializing smoother with {0} realizations".format(
                num_reals))

        self.logger.log("initializing parensemble")
        self.parensemble_0 = pyemu.ParameterEnsemble.from_gaussian_draw(
            ParameterEnsemble(self.pst), self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.logger.log("initializing parensemble")
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))
        # NOTE(review): third call with this phrase; if logger.log pairs
        # start/stop calls this one is left unmatched - confirm
        self.logger.log("initializing parensemble")

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = pyemu.ObservationEnsemble.from_id_gaussian_draw(
            ObservationEnsemble(self.pst), num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")
        self.logger.log(
            "initializing smoother with {0} realizations".format(
                num_reals))

    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

    # initialize the phi report csv files: fixed summary columns followed
    # by one zero-padded, 1-based column per realization.  The handles are
    # kept open on the instance because _phi_report is given them below.
    phi_header = "iter_num,total_runs,lambda,min,max,mean,median,std," + \
                 ','.join(["{0:010d}".format(i + 1)
                           for i in range(num_reals)]) + '\n'
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write(phi_header)
    self.phi_act_csv = open(self.pst.filename + ".iobj.actual.csv", 'w')
    self.phi_act_csv.write(phi_header)

    if restart_obsensemble is not None:
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        # flag failed rows with NaN, then drop them.  np.nan is used
        # because the np.NaN alias no longer exists in NumPy >= 2.0
        self.parensemble.loc[failed_runs, :] = np.nan
        self.parensemble = self.parensemble.dropna()
        self.obsensemble.loc[failed_runs, :] = np.nan
        self.obsensemble = self.obsensemble.dropna()

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)

    if self.drop_bad_reals is not None:
        # positions whose phi exceeds the threshold, mapped to run ids
        drop_idx = np.argwhere(
            self.current_phi_vec > self.drop_bad_reals).flatten()
        run_ids = self.obsensemble.index.values
        drop_idx = run_ids[drop_idx]
        if len(drop_idx) == self.obsensemble.shape[0]:
            raise Exception("dropped all realizations as 'bad'")
        if len(drop_idx) > 0:
            self.logger.warn(
                "{0} realizations dropped as 'bad' (indices :{1})".
                format(len(drop_idx), ','.join([str(d) for d in drop_idx])))
            self.parensemble.loc[drop_idx, :] = np.nan
            self.parensemble = self.parensemble.dropna()
            self.obsensemble.loc[drop_idx, :] = np.nan
            self.obsensemble = self.obsensemble.dropna()

            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)

    self._phi_report(self.phi_csv, self.current_phi_vec, 0.0)
    self._phi_report(self.phi_act_csv,
                     self.obsensemble.phi_vector.values, 0.0)

    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()
    self.logger.statement("initial phi (mean, std): {0:15.6G},{1:15.6G}".
                          format(self.last_best_mean, self.last_best_std))

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver: power of ten at or below
        # mean phi / (2 * number of obsensemble columns)
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))

    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    # if using the approximate form of the algorithm, let
    # the parameter scaling matrix be the identity matrix
    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it
    if self.use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
    # both settings of use_approx_prior currently use a unit scaling
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True
def update(self, lambda_mults=None, localizer=None, run_subset=None,
           use_approx=True, calc_only=False):
    """Update the iES one GLM cycle.

    Parameters
    ----------
    lambda_mults : list
        a list of lambda multipliers to test.  Each lambda mult value
        will require evaluating (a subset of) the parameter ensemble.
        Default (None) is equivalent to ``[1.0]``
    localizer : pyemu.Matrix
        a jacobian localizing matrix
    run_subset : int
        the number of realizations to test for each lambda_mult value.
        For example, if run_subset = 30 and num_reals=100, the first 30
        realizations will be run (in parallel) for each lambda_mult
        value.  Then the best lambda_mult is selected and the remaining
        70 realizations for that lambda_mult value are run (in
        parallel).  Ignored (with a warning) if it is not smaller than
        the number of active realizations
    use_approx : bool
        a flag to use the MLE or MAP upgrade solution.  True indicates
        use MLE solution
    calc_only : bool
        a flag to calculate the upgrade matrix only (not run the
        ensemble).  This is mostly for debugging and testing on travis.
        Default is False

    Raises
    ------
    Exception
        if all realizations of a tested lambda, or too many realizations
        of the accepted ensemble, are dropped as 'bad'.  Other fatal
        conditions are reported through ``self.logger.lraise``

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    ``>>>es.update(lambda_mults=[0.1,1.0,10.0],run_subset=30)``

    """
    # check initialization first: everything below reads state that
    # initialize() creates, and iter_num should not advance on failure
    if not self.__initialized:
        self.logger.lraise("must call initialize() before update()")

    # None sentinel instead of a shared mutable default list
    if lambda_mults is None:
        lambda_mults = [1.0]

    if run_subset is not None:
        if run_subset >= self.obsensemble.shape[0]:
            self.logger.warn(
                "run_subset ({0}) >= num of active reals ({1})...ignoring ".
                format(run_subset, self.obsensemble.shape[0]))
            run_subset = None

    self.iter_num += 1
    self.logger.log("iteration {0}".format(self.iter_num))
    self.logger.statement("{0} active realizations".format(
        self.obsensemble.shape[0]))
    if self.obsensemble.shape[0] < 2:
        self.logger.lraise(
            "at least active 2 realizations (really like 300) are needed to update"
        )

    self.logger.log("calculate scaled delta obs")
    scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
    self.logger.log("calculate scaled delta obs")

    self.logger.log("calculate scaled delta par")
    scaled_delta_par = self._calc_delta_par(self.parensemble)
    self.logger.log("calculate scaled delta par")

    self.logger.log("calculate pseudo inv comps")
    u, s, v = scaled_delta_obs.pseudo_inv_components()
    self.logger.log("calculate pseudo inv comps")

    self.logger.log("calculate obs diff matrix")
    obs_diff = self.obscov_inv_sqrt * self._get_residual_matrix(
        self.obsensemble).T
    self.logger.log("calculate obs diff matrix")

    # here is the math part...calculate upgrade matrices
    paren_lam = []  # one candidate parameter ensemble per lambda
    lam_vals = []   # the absolute lambda value of each candidate
    for cur_lam_mult in lambda_mults:
        parensemble_cur_lam = self.parensemble.copy()
        cur_lam = self.current_lambda * cur_lam_mult
        lam_vals.append(cur_lam)
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

        # inverse of ((lambda + 1) * I + s**2)
        scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
        scaled_ident += s**2
        scaled_ident = scaled_ident.inv

        # build up this matrix as a single element so we can apply
        # localization
        self.logger.log("building upgrade_1 matrix")
        upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_delta_par) * \
            v * s * scaled_ident * u.T
        self.logger.log("building upgrade_1 matrix")

        # apply localization
        if localizer is not None:
            self.logger.log("applying localization")
            upgrade_1.hadamard_product(localizer)
            self.logger.log("applying localization")

        # apply residual information
        self.logger.log("applying residuals")
        upgrade_1 *= obs_diff
        self.logger.log("applying residuals")

        self.logger.log("processing upgrade_1")
        upgrade_1 = upgrade_1.to_dataframe()
        upgrade_1.index.name = "parnme"
        # transpose, then use integer row labels before adding the
        # upgrade to the parameter ensemble
        upgrade_1 = upgrade_1.T
        upgrade_1.index = [int(i) for i in upgrade_1.index]
        upgrade_1.to_csv(self.pst.filename + ".upgrade_1.{0:04d}.csv".
                         format(self.iter_num))
        if upgrade_1.isnull().values.any():
            self.logger.lraise("NaNs in upgrade_1")
        self.logger.log("processing upgrade_1")

        parensemble_cur_lam += upgrade_1

        # parameter-based upgrade portion
        if not use_approx and self.iter_num > 1:
            self.logger.log("building upgrade_2 matrix")
            par_diff = (self.parensemble -
                        self.parensemble_0.loc[self.parensemble.index, :]).\
                as_pyemu_matrix().T
            x4 = self.Am.T * self.half_parcov_diag * par_diff
            x5 = self.Am * x4
            x6 = scaled_delta_par.T * x5
            x7 = v * scaled_ident * v.T * x6
            upgrade_2 = -1.0 * (self.half_parcov_diag *
                                scaled_delta_par * x7).to_dataframe()
            upgrade_2.index.name = "parnme"
            upgrade_2 = upgrade_2.T
            upgrade_2.to_csv(self.pst.filename + ".upgrade_2.{0:04d}.csv".
                             format(self.iter_num))
            upgrade_2.index = [int(i) for i in upgrade_2.index]
            if upgrade_2.isnull().values.any():
                self.logger.lraise("NaNs in upgrade_2")
            parensemble_cur_lam += upgrade_2
            self.logger.log("building upgrade_2 matrix")

        parensemble_cur_lam.enforce(self.enforce_bounds)

        paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :]))
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

    if calc_only:
        return

    # subset if needed
    # and combine lambda par ensembles into one par ensemble for evaluation
    if run_subset is not None and run_subset < self.parensemble.shape[0]:
        subset_idx = self.parensemble.iloc[:run_subset, :].index.values
        self.logger.statement("subset idxs: " +
                              ','.join([str(idx) for idx in subset_idx]))
        paren_lam_subset = [pe.loc[subset_idx, :] for pe in paren_lam]
        paren_combine = pd.concat(paren_lam_subset, ignore_index=True)
        paren_lam_subset = None
    else:
        subset_idx = self.parensemble.index.values
        paren_combine = pd.concat(paren_lam, ignore_index=True)

    eval_phrase = "evaluating ensembles for lambdas : {0}".\
        format(','.join(["{0:8.3E}".format(lam) for lam in lam_vals]))
    self.logger.log(eval_phrase)
    failed_runs, obsen_combine = self._calc_obs(paren_combine)
    self.logger.log(eval_phrase)
    paren_combine = None

    if failed_runs is not None and \
            len(failed_runs) == obsen_combine.shape[0]:
        self.logger.lraise("all runs failed - cannot continue")

    # unpack lambda obs ensembles from combined obs ensemble
    nrun_per_lam = self.obsensemble.shape[0]
    if run_subset is not None:
        nrun_per_lam = run_subset
    obsen_lam = []
    for i, lam_val in enumerate(lam_vals):
        sidx = i * nrun_per_lam
        eidx = sidx + nrun_per_lam
        oe = ObservationEnsemble.from_dataframe(
            df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst)
        oe.index = subset_idx
        # check for failed runs in this set - drop failed runs from obs
        # ensembles
        if failed_runs is not None:
            # failed positions in the combined ensemble, shifted so they
            # are relative to this lambda's slice
            failed_runs_this = np.array(
                [f for f in failed_runs if f >= sidx and f < eidx]) - sidx
            if len(failed_runs_this) > 0:
                if len(failed_runs_this) == oe.shape[0]:
                    self.logger.warn(
                        "all runs failed for lambda {0}".format(lam_val))
                else:
                    self.logger.warn("{0} run failed for lambda {1}".
                                     format(len(failed_runs_this), lam_val))
                # np.nan: the np.NaN alias was removed in NumPy 2.0
                oe.iloc[failed_runs_this, :] = np.nan
                oe = oe.dropna()

        # don't drop bad reals here, instead, mask bad reals in the lambda
        # selection and drop later
        obsen_lam.append(oe)
    obsen_combine = None

    # here is where we need to select out the "best" lambda par and obs
    # ensembles
    self.logger.statement("\n**************************")
    self.logger.statement(str(datetime.now()))
    self.logger.statement("total runs:{0}".format(self.total_runs))
    self.logger.statement("iteration: {0}".format(self.iter_num))
    self.logger.statement(
        "current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
        format(self.current_lambda,
               self.last_best_mean, self.last_best_std))

    phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam]
    if self.drop_bad_reals is not None:
        # exclude realizations above the threshold from the lambda
        # comparison statistics
        for i, pv in enumerate(phi_vecs):
            pv[pv > self.drop_bad_reals] = np.nan
            pv = pv[~np.isnan(pv)]
            if len(pv) == 0:
                raise Exception(
                    "all realization for lambda {0} dropped as 'bad'".
                    format(lam_vals[i]))
            phi_vecs[i] = pv
    mean_std = [(pv.mean(), pv.std()) for pv in phi_vecs]

    update_pars = False
    update_lambda = False
    # accept a new best if its within 10%
    best_mean = self.last_best_mean * 1.1
    best_std = self.last_best_std * 1.1
    best_i = 0
    for i, (lam_mean, lam_std) in enumerate(mean_std):
        self.logger.statement(
            " tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[i],
                   lam_mean, lam_std))
        if lam_mean < best_mean:
            update_pars = True
            best_mean = lam_mean
            best_i = i
            if lam_std < best_std:
                update_lambda = True
                best_std = lam_std

    if np.isnan(best_mean):
        self.logger.lraise("best mean = NaN")
    if np.isnan(best_std):
        self.logger.lraise("best std = NaN")

    if not update_pars:
        # no candidate improved the mean phi: raise lambda (capped)
        self.current_lambda *= max(lambda_mults) * 10.0
        self.current_lambda = min(self.current_lambda, 100000)
        self.logger.statement(
            "not accepting iteration, increased lambda:{0}".
            format(self.current_lambda))
    else:
        self.parensemble = ParameterEnsemble.from_dataframe(
            df=paren_lam[best_i], pst=self.pst)
        if run_subset is not None:
            # only a subset was tested, so evaluate the full accepted
            # parameter ensemble
            failed_runs, self.obsensemble = self._calc_obs(
                self.parensemble)
            if failed_runs is not None:
                self.logger.warn("dropping failed realizations")
                self.parensemble.loc[failed_runs, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[failed_runs, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
            best_mean = self.current_phi_vec.mean()
            best_std = self.current_phi_vec.std()
        else:
            self.obsensemble = obsen_lam[best_i]
            # reindex parensemble in case failed runs
            self.parensemble = ParameterEnsemble.from_dataframe(
                df=self.parensemble.loc[self.obsensemble.index],
                pst=self.pst)
            self.current_phi_vec = phi_vecs[best_i]

        if self.drop_bad_reals is not None:
            drop_idx = np.argwhere(
                self.current_phi_vec > self.drop_bad_reals).flatten()
            run_ids = self.obsensemble.index.values
            drop_idx = run_ids[drop_idx]
            if len(drop_idx) > self.obsensemble.shape[0] - 3:
                raise Exception("dropped too many realizations as 'bad'")
            if len(drop_idx) > 0:
                self.logger.warn(
                    "{0} realizations dropped as 'bad' (indices :{1})".
                    format(len(drop_idx),
                           ','.join([str(d) for d in drop_idx])))
                self.parensemble.loc[drop_idx, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[drop_idx, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

                self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
                best_mean = self.current_phi_vec.mean()
                best_std = self.current_phi_vec.std()

        self._phi_report(self.phi_csv, self.current_phi_vec,
                         self.current_lambda * lambda_mults[best_i])
        self._phi_report(self.phi_act_csv,
                         self.obsensemble.phi_vector.values,
                         self.current_lambda * lambda_mults[best_i])
        self.logger.statement(
            " best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[best_i],
                   best_mean, best_std))
        # NOTE(review): current_actual_phi is not assigned anywhere in
        # this method - confirm it is maintained elsewhere
        self.logger.statement(" actual mean phi: {0:15.6G}".format(
            float(self.current_actual_phi.mean())))
        self.last_best_mean = best_mean
        self.last_best_std = best_std

        if update_lambda:
            # be aggressive
            self.current_lambda *= (lambda_mults[best_i] * 0.75)
            # but don't let lambda get too small
            self.current_lambda = max(self.current_lambda, 0.00001)
            self.logger.statement("updating lambda: {0:15.6G}".
                                  format(self.current_lambda))

    self.logger.statement("**************************\n")
    self.parensemble.to_csv(self.pst.filename + self.paren_prefix.
                            format(self.iter_num))
    self.obsensemble.to_csv(self.pst.filename + self.obsen_prefix.
                            format(self.iter_num))
    if self.raw_sweep_out is not None:
        self.raw_sweep_out.to_csv(self.pst.filename + "_raw{0}".
                                  format(self.iter_num))
    self.logger.log("iteration {0}".format(self.iter_num))
def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset",
               parensemble=None, obsensemble=None,
               restart_obsensemble=None):
    """(Re)initialize the process.

    Depending on arguments, draws or loads the initial parameter and
    observation ensembles, then either loads an already evaluated
    observation ensemble or evaluates the initial parameter ensemble.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  When existing ensembles are
        supplied, the number of rows of the parameter ensemble is used
        instead
    init_lambda : float
        the initial lambda to use.  If None, it is derived from the
        initial mean phi and the number of observation ensemble columns
    enforce_bounds : str
        passed to the ``enforce`` call on the drawn parameter ensemble
        and stored as ``self.enforce_bounds``
    parensemble : ParameterEnsemble or str
        a parameter ensemble or csv filename to use as the initial
        parameter ensemble.  Only used if obsensemble is also given
    obsensemble : ObservationEnsemble or str
        an observation ensemble or csv filename to use as the initial
        observation ensemble.  Only used if parensemble is also given
    restart_obsensemble : ObservationEnsemble or str
        passed to ``self._load_obs_ensemble``; when given, the initial
        parameter ensemble is not evaluated

    Raises
    ------
    Exception
        if parensemble/obsensemble is neither a filename nor an ensemble
        instance
    AssertionError
        if the supplied ensembles (or the restart ensemble) do not agree
        in shape/columns with each other
    """
    self.enforce_bounds = enforce_bounds
    self.total_runs = 0
    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")

        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter ensemble file itself (this used to test
            # the obsensemble argument by mistake)
            if not os.path.exists(parensemble):
                self.logger.lraise("can not find parensemble file: {0}".
                                   format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception("unrecognized arg type for parensemble, "
                            "should be filename or ParameterEnsemble"
                            ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise("can not find obsensemble file: {0}".
                                   format(obsensemble))
            # only the non-zero-weighted observation columns are kept
            df = pd.read_csv(obsensemble,
                             index_col=0).loc[:, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception("unrecognized arg type for obsensemble, "
                            "should be filename or ObservationEnsemble"
                            ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        # the supplied ensembles, not the num_reals argument, define how
        # many per-realization columns the phi report needs
        num_reals = self.parensemble.shape[0]
        self.logger.log("initializing with existing ensembles")

    else:
        self.logger.log(
            "initializing smoother with {0} realizations".format(
                num_reals))

        self.logger.log("initializing parensemble")
        self.parensemble_0 = ParameterEnsemble(self.pst)
        self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.logger.log("initializing parensemble")
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))
        # NOTE(review): third call with this phrase; if logger.log pairs
        # start/stop calls this one is left unmatched - confirm
        self.logger.log("initializing parensemble")

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = ObservationEnsemble(self.pst)
        self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")
        self.logger.log(
            "initializing smoother with {0} realizations".format(
                num_reals))

    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

    # initialize the phi report csv only now that the realization count
    # is known: fixed summary columns followed by one zero-padded,
    # 1-based column per realization.  The handle is kept open on the
    # instance for later reports.
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write(
        "iter_num,total_runs,lambda,min,max,mean,median,std,")
    self.phi_csv.write(','.join(["{0:010d}".format(i + 1)
                                 for i in range(num_reals)]))
    self.phi_csv.write('\n')

    if restart_obsensemble is not None:
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log(
            "loading restart_obsensemble {0}".format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        self.parensemble = self.parensemble.drop(failed_runs)
        self.obsensemble = self.obsensemble.drop(failed_runs)

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
    self._phi_report(self.current_phi_vec, 0.0)

    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()
    self.logger.statement("initial phi (mean, std): {0:15.6G},{1:15.6G}".
                          format(self.last_best_mean, self.last_best_std))

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver: power of ten at or below
        # mean phi / (2 * number of obsensemble columns)
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))

    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    # if using the approximate form of the algorithm, let
    # the parameter scaling matrix be the identity matrix
    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it
    if self.use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
    # both settings of use_approx_prior currently use a unit scaling
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True
def __init__(self, **kwargs):
    """Construct a MonteCarlo instance.

    Forwards all keyword arguments to the parent initializer and then
    attaches a ``ParameterEnsemble`` and an ``ObservationEnsemble`` built
    from ``self.pst``.

    Parameters
    ----------
    **kwargs : dict
        passed through unchanged to the parent class ``__init__``.

    Raises
    ------
    AssertionError
        if ``self.pst`` is None after the parent initializer has run.
    """
    super(MonteCarlo, self).__init__(**kwargs)

    # the ensembles below cannot be built without a control file
    assert self.pst is not None, \
        "monte carlo requires a pest control file"

    control_file = self.pst
    self.parensemble = ParameterEnsemble(pst=control_file)
    self.obsensemble = ObservationEnsemble(pst=control_file)
def __init__(self, **kwargs):
    """Initialize the Monte Carlo analysis.

    Runs the parent class initializer with ``kwargs`` and then instantiates
    the parameter and observation ensembles from ``self.pst``.

    Args:
        **kwargs: forwarded unchanged to the parent class initializer.

    Raises:
        AssertionError: if ``self.pst`` is ``None`` after the parent
            initializer has run.
    """
    super(MonteCarlo, self).__init__(**kwargs)
    # explicit raise (instead of an ``assert`` statement) so the check is
    # still enforced when python runs with -O; the exception type is kept
    if self.pst is None:
        raise AssertionError("monte carlo requires a pest control file")
    self.parensemble = ParameterEnsemble(pst=self.pst)
    self.obsensemble = ObservationEnsemble(pst=self.pst)
class EnsembleSmoother():
    """Prototype ensemble smoother driven by the external ``sweep`` program.

    The workflow is ``initialize()`` followed by ``update()``.  Model runs
    are made in the hard-coded ``smoother`` directory with the hard-coded
    control file name ``freyberg.pst``.
    """

    def __init__(self, pst, parcov=None, obscov=None):
        """Store the control file and the prior/noise covariance matrices.

        Args:
            pst (Pst): control file instance.
            parcov (Cov): parameter covariance matrix.  If None, it is built
                with ``Cov.from_parameter_data(pst)``.
            obscov (Cov): observation noise covariance matrix.  If None, it
                is built with ``Cov.from_observation_data(pst)``.

        Raises:
            AssertionError: if an argument is not of the expected type.
        """
        assert isinstance(pst, Pst)
        self.pst = pst
        if parcov is not None:
            assert isinstance(parcov, Cov)
        else:
            parcov = Cov.from_parameter_data(self.pst)
        if obscov is not None:
            assert isinstance(obscov, Cov)
        else:
            obscov = Cov.from_observation_data(pst)

        self.parcov = parcov
        self.obscov = obscov

        self.__initialized = False
        self.num_reals = 0
        self.half_parcov_diag = None
        self.half_obscov_diag = None
        self.delta_par_prior = None
        self.iter_num = 0

    def initialize(self, num_reals):
        """(Re)initialize the process by drawing new ensembles.

        Args:
            num_reals (int): number of realizations to draw for both the
                parameter and the observation ensemble.
        """
        self.num_reals = int(num_reals)
        self.parensemble = ParameterEnsemble(self.pst)
        self.parensemble.draw(cov=self.parcov, num_reals=num_reals)

        self.obsensemble_0 = ObservationEnsemble(self.pst)
        self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
        self.obsensemble = self.obsensemble_0.copy()

        # only the diagonal of parcov is used for the scaling matrix
        if self.parcov.isdiagonal:
            self.half_parcov_diag = self.parcov.inv.sqrt
        else:
            self.half_parcov_diag = Cov(x=np.diag(self.parcov.x),
                                        names=self.parcov.col_names,
                                        isdiagonal=True).inv.sqrt

        self.delta_par_prior = self._calc_delta_par()
        self.__initialized = True

    def _calc_delta(self, ensemble, scaling_matrix):
        """Return the scaled, transposed deviations of an ensemble.

        The column means are subtracted from the first ``self.num_reals``
        rows, the result is transposed and pre-multiplied by
        ``scaling_matrix`` and finally scaled by ``1/sqrt(num_reals - 1)``.
        """
        mean = np.array(ensemble.mean(axis=0))
        delta = ensemble.as_pyemu_matrix()
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = scaling_matrix * delta.T
        return delta * (1.0 / np.sqrt(float(self.num_reals - 1.0)))

    def _calc_delta_par(self):
        """Calc the scaled parameter ensemble differences from the mean."""
        return self._calc_delta(self.parensemble, self.half_parcov_diag)

    def _calc_delta_obs(self):
        """Calc the scaled observation ensemble differences from the mean."""
        return self._calc_delta(self.obsensemble, self.obscov.inv.sqrt)

    def _calc_obs(self):
        """Propagate the parameter ensemble forward with ``sweep``.

        Writes the parameter ensemble to ``smoother/sweep_in.csv``, runs
        ``sweep freyberg.pst`` from inside the ``smoother`` directory and
        replaces ``self.obsensemble`` with the columns of
        ``smoother/sweep_out.csv`` named in ``self.obscov.row_names``.
        """
        self.parensemble.to_csv(os.path.join("smoother", "sweep_in.csv"))

        start_dir = os.getcwd()
        os.chdir("smoother")
        try:
            print(os.listdir('.'))
            os.system("sweep freyberg.pst")
        finally:
            # always go back, even if the run is interrupted, so later
            # relative paths are not resolved against the wrong directory
            os.chdir(start_dir)

        obs = ObservationEnsemble.from_csv(
            os.path.join("smoother", 'sweep_out.csv'))
        obs.columns = [item.lower() for item in obs.columns]
        self.obsensemble = ObservationEnsemble.from_dataframe(
            df=obs.loc[:, self.obscov.row_names], pst=self.pst)
        # todo: modify sweep to be interactive...
        return

    @property
    def current_lambda(self):
        """Current lambda value; fixed at 10.0 in this prototype."""
        return 10.0

    def update(self):
        """Run the ensemble forward and apply one upgrade to the parameters.

        ``self.parensemble`` is updated in place with the computed upgrade.

        Raises:
            Exception: if ``initialize()`` has not been called.
            NotImplementedError: if ``self.iter_num`` is greater than zero.
        """
        if not self.__initialized:
            raise Exception("must call initialize() before update()")
        self._calc_obs()
        delta_obs = self._calc_delta_obs()
        u, s, v = delta_obs.pseudo_inv_components()

        # difference between simulated outputs and the noisy observations
        diff = self.obsensemble.as_pyemu_matrix() - \
            self.obsensemble_0.as_pyemu_matrix()
        x1 = u.T * self.obscov.inv.sqrt * diff.T
        # NOTE(review): presumably disables name-based alignment in the
        # following products - confirm against the Matrix class
        x1.autoalign = False
        x2 = (Cov.identity_like(s) + s**2).inv * x1
        x3 = v * s * x2

        upgrade_1 = (self.half_parcov_diag * self._calc_delta_par() *
                     x3).to_dataframe()
        upgrade_1.index.name = "parnme"
        print(upgrade_1)
        self.parensemble += upgrade_1.T
        print(self.parensemble)

        # NOTE(review): iter_num is never incremented in this class, and
        # this check only runs after the parameters were already modified
        if self.iter_num > 0:
            raise NotImplementedError()
        print(upgrade_1.shape)
class EnsembleSmoother():
    """Iterative ensemble smoother that evaluates ensembles with ``sweep``.

    The workflow is ``initialize()`` followed by one or more calls to
    ``update()``.  Parameter and observation ensembles are written to csv
    files named from the control file name plus ``paren_prefix`` /
    ``obsen_prefix``; phi statistics are appended to ``<pst>.iobj.csv``.
    """

    def __init__(self, pst, parcov=None, obscov=None, num_slaves=0,
                 use_approx=True, restart_iter=0, submit_file=None):
        """Store the control file, covariance matrices and run settings.

        Args:
            pst (Pst or str): control file instance, or a filename that is
                passed to ``Pst``.
            parcov (Cov): parameter covariance matrix.  If None, it is built
                with ``Cov.from_parameter_data(pst)``.
            obscov (Cov): observation noise covariance matrix.  If None, it
                is built with ``Cov.from_observation_data(pst)``.
            num_slaves (int): number of local slaves to start; zero runs
                ``sweep`` serially when no submit file is given.
            use_approx (bool): if False, ``update()`` adds the second,
                parameter-based upgrade term after the first iteration.
            restart_iter (int): if greater than zero, the ensemble csv
                files of that iteration must exist and the instance is
                flagged for restart (``initialize()`` does not support
                restarting yet).
            submit_file (str): HTCondor submit file.  If not None, the
                ensembles are evaluated through ``condor_submit``.

        Raises:
            AssertionError: on a wrong argument type or a missing restart
                file.
        """
        self.num_slaves = int(num_slaves)
        self.submit_file = submit_file
        self.use_approx = bool(use_approx)
        self.paren_prefix = ".parensemble.{0:04d}.csv"
        self.obsen_prefix = ".obsensemble.{0:04d}.csv"

        if isinstance(pst, str):
            pst = Pst(pst)
        assert isinstance(pst, Pst)
        self.pst = pst
        self.sweep_in_csv = pst.pestpp_options.get(
            "sweep_parameter_csv_file", "sweep_in.csv")
        self.sweep_out_csv = pst.pestpp_options.get(
            "sweep_output_csv_file", "sweep_out.csv")
        if parcov is not None:
            assert isinstance(parcov, Cov)
        else:
            parcov = Cov.from_parameter_data(self.pst)
        if obscov is not None:
            assert isinstance(obscov, Cov)
        else:
            obscov = Cov.from_observation_data(pst)

        self.parcov = parcov
        self.obscov = obscov
        self.restart = False

        if restart_iter > 0:
            self.restart_iter = restart_iter
            paren = self.pst.filename + self.paren_prefix.format(restart_iter)
            assert os.path.exists(paren),\
                "could not find restart par ensemble {0}".format(paren)
            obsen0 = self.pst.filename + self.obsen_prefix.format(0)
            assert os.path.exists(obsen0),\
                "could not find restart obs ensemble 0 {0}".format(obsen0)
            obsen = self.pst.filename + self.obsen_prefix.format(restart_iter)
            assert os.path.exists(obsen),\
                "could not find restart obs ensemble {0}".format(obsen)
            self.restart = True

        self.__initialized = False
        self.num_reals = 0
        self.half_parcov_diag = None
        self.half_obscov_diag = None
        self.delta_par_prior = None
        self.iter_num = 0

    def initialize(self, num_reals, init_lambda=None):
        """(Re)initialize the process.

        Opens the phi report csv, draws the initial parameter and
        observation ensembles, evaluates the initial parameter ensemble and
        sets the starting lambda.

        Args:
            num_reals (int): number of realizations; must be greater than 1.
            init_lambda (float): starting lambda.  If None, it is derived
                from the initial mean phi (following Chen and Oliver).

        Raises:
            AssertionError: if ``num_reals`` is not greater than 1.
            NotImplementedError: if the instance was created with
                ``restart_iter > 0``.
        """
        assert num_reals > 1
        # initialize the phi report csv; the handle stays open because
        # _phi_report() appends one row per accepted iteration
        self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
        self.phi_csv.write(
            "iter_num,total_runs,lambda,min,max,mean,median,std,")
        self.phi_csv.write(','.join(["{0:010d}".format(i + 1)
                                     for i in range(num_reals)]))
        self.phi_csv.write('\n')
        self.total_runs = 0
        # this matrix gets used a lot, so only calc once and store
        self.obscov_inv_sqrt = self.obscov.get(
            self.pst.nnz_obs_names).inv.sqrt
        if self.restart:
            print("restarting...ignoring num_reals")
            raise NotImplementedError()
            # the statements below are unreachable until restart support
            # is finished
            df = pd.read_csv(self.pst.filename +
                             self.paren_prefix.format(self.restart_iter))
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.parensemble = self.parensemble_0.copy()
            df = pd.read_csv(self.pst.filename + self.obsen_prefix.format(0))
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df.loc[:, self.pst.nnz_obs_names], pst=self.pst)
            # this matrix gets used a lot, so only calc once
            self.obs0_matrix = self.obsensemble_0.as_pyemu_matrix()
            df = pd.read_csv(self.pst.filename +
                             self.obsen_prefix.format(self.restart_iter))
            self.obsensemble = ObservationEnsemble.from_dataframe(
                df=df.loc[:, self.pst.nnz_obs_names], pst=self.pst)
            assert self.parensemble.shape[0] == self.obsensemble.shape[0]
            self.num_reals = self.parensemble.shape[0]
        else:
            self.num_reals = int(num_reals)
            self.parensemble_0 = ParameterEnsemble(self.pst)
            self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
            self.parensemble_0.enforce()
            self.parensemble = self.parensemble_0.copy()
            self.parensemble_0.to_csv(self.pst.filename +
                                      self.paren_prefix.format(0))
            self.obsensemble_0 = ObservationEnsemble(self.pst)
            self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)

            # save the base obsensemble
            self.obsensemble_0.to_csv(self.pst.filename +
                                      self.obsen_prefix.format(-1))
            self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

            # run the initial parameter ensemble
            self.obsensemble = self._calc_obs(self.parensemble)
            self.obsensemble.to_csv(self.pst.filename +
                                    self.obsen_prefix.format(0))

        self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
        self._phi_report(self.current_phi_vec, 0.0)
        self.last_best_mean = self.current_phi_vec.mean()
        self.last_best_std = self.current_phi_vec.std()
        if init_lambda is not None:
            self.current_lambda = float(init_lambda)
        else:
            # following chen and oliver
            x = self.last_best_mean / \
                (2.0 * float(self.obsensemble.shape[1]))
            self.current_lambda = 10.0**(np.floor(np.log10(x)))

        # jwhite - dec 5 2016 - using the actual parcov inv
        # for upgrades seems to be pushing parameters around
        # too much.  for now, just not using it, maybe
        # better choices of lambda will tame it.  The parameter scaling
        # is therefore the identity regardless of self.use_approx.
        self.half_parcov_diag = 1.0

        self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
        u, s, v = self.delta_par_prior.pseudo_inv_components()
        self.Am = u * s.inv

        self.__initialized = True

    def get_localizer(self):
        """Return a localizer ``Matrix`` filled with ones.

        Rows are the non-zero-weight observation names and columns are the
        adjustable parameter names of the control file.
        """
        onames = self.pst.nnz_obs_names
        pnames = self.pst.adj_par_names
        localizer = Matrix(x=np.ones((len(onames), len(pnames))),
                           row_names=onames, col_names=pnames)
        return localizer

    def _calc_delta_par(self, parensemble):
        """Calc the scaled parameter ensemble differences from the mean."""
        return self._calc_delta(parensemble, self.half_parcov_diag)

    def _calc_delta_obs(self, obsensemble):
        """Calc the scaled observation ensemble differences from the mean."""
        return self._calc_delta(obsensemble.nonzero, self.obscov.inv.sqrt)

    def _calc_delta(self, ensemble, scaling_matrix):
        """Calc the scaled ensemble differences from the mean.

        The column means are subtracted from the first ``self.num_reals``
        rows, the result is transposed, pre-multiplied by
        ``scaling_matrix`` and scaled by ``1/sqrt(num_reals - 1)``.
        """
        mean = np.array(ensemble.mean(axis=0))
        delta = ensemble.as_pyemu_matrix()
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = scaling_matrix * delta.T
        delta *= (1.0 / np.sqrt(float(self.num_reals - 1.0)))
        return delta

    def _calc_obs(self, parensemble):
        """Evaluate ``parensemble`` and return the observation ensemble.

        Uses HTCondor when a submit file was given, local ``sweep``
        otherwise.
        """
        if self.submit_file is None:
            return self._calc_obs_local(parensemble)
        return self._calc_obs_condor(parensemble)

    def _start_master(self, port):
        """Start ``sweep`` as a master in a background thread.

        Returns the started thread after a short pause that gives the
        master time to get up and running before slaves connect.
        """
        def master():
            os.system("sweep {0} /h :{1} >nul".format(self.pst.filename,
                                                      port))
        master_thread = threading.Thread(target=master)
        master_thread.start()
        time.sleep(1.5)
        return master_thread

    def _load_sweep_out(self):
        """Read the sweep output csv into an ``ObservationEnsemble``.

        Column names are lower-cased, ``self.total_runs`` is increased by
        the number of rows read and only the columns named in
        ``self.obscov.row_names`` are kept.
        """
        obs = pd.read_csv(self.sweep_out_csv)
        obs.columns = [item.lower() for item in obs.columns]
        self.total_runs += obs.shape[0]
        return ObservationEnsemble.from_dataframe(
            df=obs.loc[:, self.obscov.row_names], pst=self.pst)

    def _calc_obs_condor(self, parensemble):
        """Propagate the ensemble forward using sweep and HTCondor.

        Note that all existing condor jobs of the user are removed first
        (``condor_rm -all``).
        """
        parensemble.to_csv(self.sweep_in_csv)
        os.system("condor_rm -all")
        port = 4004
        master_thread = self._start_master(port)
        pyemu.utils.start_slaves("template", "sweep", self.pst.filename,
                                 self.num_slaves, slave_root='.', port=port)
        os.system("condor_submit {0}".format(self.submit_file))
        master_thread.join()
        return self._load_sweep_out()

    def _calc_obs_local(self, parensemble):
        """Propagate the ensemble forward using sweep on this machine."""
        parensemble.to_csv(self.sweep_in_csv)
        if self.num_slaves > 0:
            port = 4004
            master_thread = self._start_master(port)
            pyemu.utils.start_slaves("template", "sweep", self.pst.filename,
                                     self.num_slaves, slave_root='.',
                                     port=port)
            master_thread.join()
        else:
            os.system("sweep {0}".format(self.pst.filename))
        return self._load_sweep_out()

    def _calc_phi_vec(self, obsensemble):
        """Return one phi value per realization as a numpy array."""
        obs_diff = self._get_residual_matrix(obsensemble)
        # NOTE(review): the residuals are weighted by obscov_inv_sqrt only
        # once here - confirm this is the intended objective function
        weights = self.obscov_inv_sqrt.get(row_names=obs_diff.col_names,
                                           col_names=obs_diff.col_names)
        phi_vec = np.diagonal((obs_diff * weights * obs_diff.T).x)
        return phi_vec

    def _phi_report(self, phi_vec, cur_lam):
        """Append one row of phi statistics to the phi report csv.

        The row matches the header written by ``initialize()``: iteration
        number, total runs, lambda, min, max, mean, median and std,
        followed by the phi value of every realization.

        Args:
            phi_vec (numpy.ndarray): one phi value per realization.
            cur_lam (float): lambda value to report for this row.
        """
        assert phi_vec.shape[0] == self.num_reals
        # eight summary columns, each followed by a comma, so the
        # per-realization values start in their own column
        self.phi_csv.write("{0},{1},{2},{3},{4},{5},{6},{7},".format(
            self.iter_num, self.total_runs, cur_lam, phi_vec.min(),
            phi_vec.max(), phi_vec.mean(), np.median(phi_vec),
            phi_vec.std()))
        self.phi_csv.write(",".join(["{0:20.8}".format(phi)
                                     for phi in phi_vec]))
        self.phi_csv.write("\n")
        self.phi_csv.flush()

    def _get_residual_matrix(self, obsensemble):
        """Return simulated minus base (noisy) observation values.

        Only the non-zero-weight part of ``obsensemble`` is used; the base
        matrix is reduced to the same rows and columns.
        """
        obs_matrix = obsensemble.nonzero.as_pyemu_matrix()
        return obs_matrix - self.obs0_matrix.get(
            col_names=obs_matrix.col_names, row_names=obs_matrix.row_names)

    def update(self, lambda_mults=[1.0], localizer=None, run_subset=None):
        """Run one smoother iteration, testing one or more lambdas.

        For every lambda multiplier an upgraded parameter ensemble is
        computed and evaluated.  The candidate with the lowest mean phi is
        accepted if that mean is below 1.1 times the last accepted mean;
        otherwise lambda is increased.  The current ensembles are written
        to csv at the end of every call.

        Args:
            lambda_mults (list): multipliers applied to the current lambda
                (the default list is never modified).
            localizer (Matrix): optional localizing matrix applied to the
                upgrade with an element-wise product.
            run_subset (int): if not None, only this many randomly chosen
                realizations are evaluated while testing lambdas.

        Raises:
            Exception: if ``initialize()`` has not been called.
        """
        self.iter_num += 1
        if not self.__initialized:
            raise Exception("must call initialize() before update()")

        scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
        scaled_delta_par = self._calc_delta_par(self.parensemble)

        u, s, v = scaled_delta_obs.pseudo_inv_components()

        obs_diff = self._get_residual_matrix(self.obsensemble)

        if run_subset is not None:
            # NOTE(review): randint's upper bound is exclusive and draws
            # with replacement, so the last realization is never picked
            # and duplicates are possible - confirm this is intended
            subset_idx = ["{0:d}".format(i) for i in
                          np.random.randint(0, self.num_reals - 1,
                                            run_subset)]
            print("subset idxs: " + ','.join(subset_idx))

        paren_lam, obsen_lam = [], []
        for cur_lam_mult in lambda_mults:

            parensemble_cur_lam = self.parensemble.copy()

            cur_lam = self.current_lambda * cur_lam_mult

            scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
            scaled_ident += s**2
            scaled_ident = scaled_ident.inv

            # build up this matrix as a single element so we can apply
            # localization
            upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_delta_par) *\
                v * s * scaled_ident * u.T

            # apply localization
            if localizer is not None:
                upgrade_1.hadamard_product(localizer)

            # apply residual information
            upgrade_1 *= (self.obscov_inv_sqrt * obs_diff.T)

            upgrade_1 = upgrade_1.to_dataframe()
            upgrade_1.index.name = "parnme"
            upgrade_1 = upgrade_1.T
            upgrade_1.to_csv(self.pst.filename +
                             ".upgrade_1.{0:04d}.csv".format(self.iter_num))
            parensemble_cur_lam += upgrade_1

            # parameter-based upgrade portion
            if not self.use_approx and self.iter_num > 1:
                par_diff = (self.parensemble - self.parensemble_0).\
                    as_pyemu_matrix().T
                x4 = self.Am.T * self.half_parcov_diag * par_diff
                x5 = self.Am * x4
                x6 = scaled_delta_par.T * x5
                x7 = v * scaled_ident * v.T * x6
                upgrade_2 = -1.0 * (self.half_parcov_diag *
                                    scaled_delta_par * x7).to_dataframe()
                upgrade_2.index.name = "parnme"
                upgrade_2.T.to_csv(
                    self.pst.filename +
                    ".upgrade_2.{0:04d}.csv".format(self.iter_num))
                parensemble_cur_lam += upgrade_2.T
            parensemble_cur_lam.enforce()
            paren_lam.append(parensemble_cur_lam)
            if run_subset is not None:
                parensemble_subset = parensemble_cur_lam.loc[subset_idx, :]
                obsensemble_cur_lam = self._calc_obs(parensemble_subset)
            else:
                obsensemble_cur_lam = self._calc_obs(parensemble_cur_lam)
            obsen_lam.append(obsensemble_cur_lam)

        # here is where we need to select out the "best" lambda par and obs
        # ensembles
        print("\n**************************")
        print(str(datetime.now()))
        print("total runs:{0}".format(self.total_runs))
        print("iteration: {0}".format(self.iter_num))
        print("current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
              format(self.current_lambda,
                     self.last_best_mean, self.last_best_std))
        phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam]
        mean_std = [(pv.mean(), pv.std()) for pv in phi_vecs]
        update_pars = False
        update_lambda = False
        # accept a new best if its within 10%
        best_mean = self.last_best_mean * 1.1
        best_std = self.last_best_std * 1.1
        best_i = 0
        for i, (phi_mean, phi_std) in enumerate(mean_std):
            print(" tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
                  format(self.current_lambda * lambda_mults[i],
                         phi_mean, phi_std))
            if phi_mean < best_mean:
                update_pars = True
                best_mean = phi_mean
                best_i = i
                if phi_std < best_std:
                    update_lambda = True
                    best_std = phi_std

        if not update_pars:
            self.current_lambda *= max(lambda_mults) * 3.0
            self.current_lambda = min(self.current_lambda, 100000)
            print("not accepting iteration, increased lambda:{0}".
                  format(self.current_lambda))
        else:
            self.parensemble = paren_lam[best_i]
            if run_subset is not None:
                # only a subset was evaluated while testing lambdas, so
                # the accepted ensemble is run in full here
                self.obsensemble = self._calc_obs(self.parensemble)
                self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
                self._phi_report(self.current_phi_vec,
                                 self.current_lambda * lambda_mults[best_i])
                best_mean = self.current_phi_vec.mean()
                best_std = self.current_phi_vec.std()
            else:
                self.obsensemble = obsen_lam[best_i]
                self._phi_report(phi_vecs[best_i],
                                 self.current_lambda * lambda_mults[best_i])
                self.current_phi_vec = phi_vecs[best_i]

            print("\n" + " best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
                  format(self.current_lambda * lambda_mults[best_i],
                         best_mean, best_std))
            self.last_best_mean = best_mean
            self.last_best_std = best_std

            if update_lambda:
                # be aggressive - reduce the best lambda by a quarter
                self.current_lambda *= (lambda_mults[best_i] * 0.75)
                # but don't let lambda get too small
                self.current_lambda = max(self.current_lambda, 0.001)
                print("updating lambda: {0:15.6G}".
                      format(self.current_lambda))

        print("**************************\n")

        self.parensemble.to_csv(self.pst.filename +
                                self.paren_prefix.format(self.iter_num))
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(self.iter_num))
class EnsembleSmoother(): def __init__(self, pst, parcov=None, obscov=None, num_slaves=0, use_approx_prior=True, submit_file=None, verbose=False, port=4004, slave_dir="template"): self.logger = Logger(verbose) if verbose is not False: self.logger.echo = True self.num_slaves = int(num_slaves) if submit_file is not None: if not os.path.exists(submit_file): self.logger.lraise( "submit_file {0} not found".format(submit_file)) elif num_slaves > 0: if not os.path.exists(slave_dir): self.logger.lraise( "template dir {0} not found".format(slave_dir)) self.slave_dir = slave_dir self.submit_file = submit_file self.port = int(port) self.use_approx_prior = bool(use_approx_prior) self.paren_prefix = ".parensemble.{0:04d}.csv" self.obsen_prefix = ".obsensemble.{0:04d}.csv" if isinstance(pst, str): pst = Pst(pst) assert isinstance(pst, Pst) self.pst = pst self.sweep_in_csv = pst.pestpp_options.get("sweep_parameter_csv_file", "sweep_in.csv") self.sweep_out_csv = pst.pestpp_options.get("sweep_output_csv_file", "sweep_out.csv") if parcov is not None: assert isinstance(parcov, Cov) else: parcov = Cov.from_parameter_data(self.pst) if obscov is not None: assert isinstance(obscov, Cov) else: obscov = Cov.from_observation_data(pst) self.parcov = parcov self.obscov = obscov # if restart_iter > 0: # self.restart_iter = restart_iter # paren = self.pst.filename+self.paren_prefix.format(restart_iter) # assert os.path.exists(paren),\ # "could not find restart par ensemble {0}".format(paren) # obsen0 = self.pst.filename+self.obsen_prefix.format(0) # assert os.path.exists(obsen0),\ # "could not find restart obs ensemble 0 {0}".format(obsen0) # obsen = self.pst.filename+self.obsen_prefix.format(restart_iter) # assert os.path.exists(obsen),\ # "could not find restart obs ensemble {0}".format(obsen) # self.restart = True self.__initialized = False #self.num_reals = 0 self.half_parcov_diag = None self.half_obscov_diag = None self.delta_par_prior = None self.iter_num = 0 #self.enforce_bounds = None 
self.raw_sweep_out = None @property def current_phi(self): return pd.DataFrame(data={"phi":self._calc_phi_vec(self.obsensemble)},\ index=self.obsensemble.index) def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset", parensemble=None, obsensemble=None, restart_obsensemble=None): ''' (re)initialize the process ''' # initialize the phi report csv self.enforce_bounds = enforce_bounds self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w') self.phi_csv.write( "iter_num,total_runs,lambda,min,max,mean,median,std,") self.phi_csv.write(','.join(["{0:010d}".\ format(i+1) for i in range(num_reals)])) self.phi_csv.write('\n') self.total_runs = 0 # this matrix gets used a lot, so only calc once and store self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt if parensemble is not None and obsensemble is not None: self.logger.log("initializing with existing ensembles") if isinstance(parensemble, str): self.logger.log("loading parensemble from file") if not os.path.exists(obsensemble): self.logger.lraise("can not find parensemble file: {0}".\ format(parensemble)) df = pd.read_csv(parensemble, index_col=0) #df.index = [str(i) for i in df.index] self.parensemble_0 = ParameterEnsemble.from_dataframe( df=df, pst=self.pst) self.logger.log("loading parensemble from file") elif isinstance(parensemble, ParameterEnsemble): self.parensemble_0 = parensemble.copy() else: raise Exception("unrecognized arg type for parensemble, " +\ "should be filename or ParameterEnsemble" +\ ", not {0}".format(type(parensemble))) self.parensemble = self.parensemble_0.copy() if isinstance(obsensemble, str): self.logger.log("loading obsensemble from file") if not os.path.exists(obsensemble): self.logger.lraise("can not find obsensemble file: {0}".\ format(obsensemble)) df = pd.read_csv(obsensemble, index_col=0).loc[:, self.pst.nnz_obs_names] #df.index = [str(i) for i in df.index] self.obsensemble_0 = ObservationEnsemble.from_dataframe( df=df, pst=self.pst) 
self.logger.log("loading obsensemble from file") elif isinstance(obsensemble, ObservationEnsemble): self.obsensemble_0 = obsensemble.copy() else: raise Exception("unrecognized arg type for obsensemble, " +\ "should be filename or ObservationEnsemble" +\ ", not {0}".format(type(obsensemble))) assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0] #self.num_reals = self.parensemble_0.shape[0] self.logger.log("initializing with existing ensembles") else: self.logger.log( "initializing smoother with {0} realizations".format( num_reals)) #self.num_reals = int(num_reals) #assert self.num_reals > 1 self.logger.log("initializing parensemble") self.parensemble_0 = ParameterEnsemble(self.pst) self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals) self.parensemble_0.enforce(enforce_bounds=enforce_bounds) self.logger.log("initializing parensemble") self.parensemble = self.parensemble_0.copy() self.parensemble_0.to_csv(self.pst.filename +\ self.paren_prefix.format(0)) self.logger.log("initializing parensemble") self.logger.log("initializing obsensemble") self.obsensemble_0 = ObservationEnsemble(self.pst) self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals) #self.obsensemble = self.obsensemble_0.copy() # save the base obsensemble self.obsensemble_0.to_csv(self.pst.filename +\ self.obsen_prefix.format(-1)) self.logger.log("initializing obsensemble") self.logger.log( "initializing smoother with {0} realizations".format( num_reals)) self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix() self.enforce_bounds = enforce_bounds if restart_obsensemble is not None: self.logger.log( "loading restart_obsensemble {0}".format(restart_obsensemble)) failed_runs, self.obsensemble = self._load_obs_ensemble( restart_obsensemble) assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0] assert list(self.obsensemble.columns) == list( self.obsensemble_0.columns) self.logger.log( "loading restart_obsensemble {0}".format(restart_obsensemble)) else: # run the 
initial parameter ensemble self.logger.log("evaluating initial ensembles") failed_runs, self.obsensemble = self._calc_obs(self.parensemble) self.obsensemble.to_csv(self.pst.filename +\ self.obsen_prefix.format(0)) self.logger.log("evaluating initial ensembles") if failed_runs is not None: self.logger.warn("dropping failed realizations") #failed_runs_str = [str(f) for f in failed_runs] self.parensemble = self.parensemble.drop(failed_runs) self.obsensemble = self.obsensemble.drop(failed_runs) self.current_phi_vec = self._calc_phi_vec(self.obsensemble) self._phi_report(self.current_phi_vec, 0.0) self.last_best_mean = self.current_phi_vec.mean() self.last_best_std = self.current_phi_vec.std() self.logger.statement("initial phi (mean, std): {0:15.6G},{1:15.6G}".\ format(self.last_best_mean,self.last_best_std)) if init_lambda is not None: self.current_lambda = float(init_lambda) else: #following chen and oliver x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1])) self.current_lambda = 10.0**(np.floor(np.log10(x))) # if using the approximate form of the algorithm, let # the parameter scaling matrix be the identity matrix # jwhite - dec 5 2016 - using the actual parcov inv # for upgrades seems to be pushing parameters around # too much. 
for now, just not using it, maybe # better choices of lambda will tame it self.logger.statement("current lambda:{0:15.6g}".format( self.current_lambda)) if self.use_approx_prior: self.logger.statement("using approximate parcov in solution") self.half_parcov_diag = 1.0 else: #self.logger.statement("using full parcov in solution") # if self.parcov.isdiagonal: # self.half_parcov_diag = self.parcov.sqrt.inv # else: # self.half_parcov_diag = Cov(x=np.diag(self.parcov.x), # names=self.parcov.col_names, # isdiagonal=True).inv.sqrt self.half_parcov_diag = 1.0 self.delta_par_prior = self._calc_delta_par(self.parensemble_0) u, s, v = self.delta_par_prior.pseudo_inv_components() self.Am = u * s.inv self.__initialized = True def get_localizer(self): onames = self.pst.nnz_obs_names pnames = self.pst.adj_par_names localizer = Matrix(x=np.ones((len(onames), len(pnames))), row_names=onames, col_names=pnames) return localizer def _calc_delta_par(self, parensemble): ''' calc the scaled parameter ensemble differences from the mean ''' return self._calc_delta(parensemble, self.half_parcov_diag) def _calc_delta_obs(self, obsensemble): ''' calc the scaled observation ensemble differences from the mean ''' return self._calc_delta(obsensemble.nonzero, self.obscov.inv.sqrt) def _calc_delta(self, ensemble, scaling_matrix): ''' calc the scaled ensemble differences from the mean ''' mean = np.array(ensemble.mean(axis=0)) delta = ensemble.as_pyemu_matrix() for i in range(ensemble.shape[0]): delta.x[i, :] -= mean delta = scaling_matrix * delta.T delta *= (1.0 / np.sqrt(float(ensemble.shape[0] - 1.0))) return delta def _calc_obs(self, parensemble): self.logger.log("removing existing sweep in/out files") try: os.remove(self.sweep_in_csv) except Exception as e: self.logger.warn( "error removing existing sweep in file:{0}".format(str(e))) try: os.remove(self.sweep_out_csv) except Exception as e: self.logger.warn( "error removing existing sweep out file:{0}".format(str(e))) self.logger.log("removing 
existing sweep in/out files") if parensemble.isnull().values.any(): parensemble.to_csv("_nan.csv") self.logger.lraise( "_calc_obs() error: NaNs in parensemble (written to '_nan.csv')" ) if self.submit_file is None: self._calc_obs_local(parensemble) else: self._calc_obs_condor(parensemble) # make a copy of sweep out for restart purposes # sweep_out = str(self.iter_num)+"_raw_"+self.sweep_out_csv # if os.path.exists(sweep_out): # os.remove(sweep_out) # shutil.copy2(self.sweep_out_csv,sweep_out) self.logger.log("reading sweep out csv {0}".format(self.sweep_out_csv)) failed_runs, obs = self._load_obs_ensemble(self.sweep_out_csv) self.logger.log("reading sweep out csv {0}".format(self.sweep_out_csv)) self.total_runs += obs.shape[0] self.logger.statement("total runs:{0}".format(self.total_runs)) return failed_runs, obs def _load_obs_ensemble(self, filename): if not os.path.exists(filename): self.logger.lraise( "obsensemble file {0} does not exists".format(filename)) obs = pd.read_csv(filename) obs.columns = [item.lower() for item in obs.columns] self.raw_sweep_out = obs.copy( ) # save this for later to support restart assert "input_run_id" in obs.columns,\ "'input_run_id' col missing...need newer version of sweep" obs.index = obs.input_run_id failed_runs = None if 1 in obs.failed_flag.values: failed_runs = obs.loc[obs.failed_flag == 1].index.values self.logger.warn("{0} runs failed (indices: {1})".\ format(len(failed_runs),','.join([str(f) for f in failed_runs]))) obs = ObservationEnsemble.from_dataframe( df=obs.loc[:, self.obscov.row_names], pst=self.pst) if obs.isnull().values.any(): self.logger.lraise("_calc_obs() error: NaNs in obsensemble") return failed_runs, obs def _get_master_thread(self): master_stdout = "_master_stdout.dat" master_stderr = "_master_stderr.dat" def master(): try: #os.system("sweep {0} /h :{1} 1>{2} 2>{3}". \ # format(self.pst.filename, self.port, master_stdout, master_stderr)) pyemu.helpers.run("sweep {0} /h :{1} 1>{2} 2>{3}". 
\ format(self.pst.filename, self.port, master_stdout, master_stderr)) except Exception as e: self.logger.lraise("error starting condor master: {0}".format( str(e))) with open(master_stderr, 'r') as f: err_lines = f.readlines() if len(err_lines) > 0: self.logger.warn("master stderr lines: {0}".format(','.join( [l.strip() for l in err_lines]))) master_thread = threading.Thread(target=master) master_thread.start() time.sleep(2.0) return master_thread def _calc_obs_condor(self, parensemble): self.logger.log("evaluating ensemble of size {0} with htcondor".\ format(parensemble.shape[0])) parensemble.to_csv(self.sweep_in_csv) master_thread = self._get_master_thread() condor_temp_file = "_condor_submit_stdout.dat" condor_err_file = "_condor_submit_stderr.dat" self.logger.log("calling condor_submit with submit file {0}".format( self.submit_file)) try: os.system("condor_submit {0} 1>{1} 2>{2}".\ format(self.submit_file,condor_temp_file,condor_err_file)) except Exception as e: self.logger.lraise("error in condor_submit: {0}".format(str(e))) self.logger.log("calling condor_submit with submit file {0}".format( self.submit_file)) time.sleep( 2.0) #some time for condor to submit the job and echo to stdout condor_submit_string = "submitted to cluster" with open(condor_temp_file, 'r') as f: lines = f.readlines() self.logger.statement("condor_submit stdout: {0}".\ format(','.join([l.strip() for l in lines]))) with open(condor_err_file, 'r') as f: err_lines = f.readlines() if len(err_lines) > 0: self.logger.warn("stderr from condor_submit:{0}".\ format([l.strip() for l in err_lines])) cluster_number = None for line in lines: if condor_submit_string in line.lower(): cluster_number = int( float(line.split(condor_submit_string)[-1])) if cluster_number is None: self.logger.lraise("couldn't find cluster number...") self.logger.statement("condor cluster: {0}".format(cluster_number)) master_thread.join() self.logger.statement("condor master thread exited") self.logger.log( "calling 
condor_rm on cluster {0}".format(cluster_number)) os.system("condor_rm cluster {0}".format(cluster_number)) self.logger.log( "calling condor_rm on cluster {0}".format(cluster_number)) self.logger.log("evaluating ensemble of size {0} with htcondor".\ format(parensemble.shape[0])) def _calc_obs_local(self, parensemble): ''' propagate the ensemble forward using sweep. ''' self.logger.log("evaluating ensemble of size {0} locally with sweep".\ format(parensemble.shape[0])) parensemble.to_csv(self.sweep_in_csv) if self.num_slaves > 0: master_thread = self._get_master_thread() pyemu.utils.start_slaves(self.slave_dir, "sweep", self.pst.filename, self.num_slaves, slave_root='..', port=self.port) master_thread.join() else: os.system("sweep {0}".format(self.pst.filename)) self.logger.log("evaluating ensemble of size {0} locally with sweep".\ format(parensemble.shape[0])) def _calc_phi_vec(self, obsensemble): obs_diff = self._get_residual_matrix(obsensemble) #phi_vec = np.diagonal((obs_diff * self.obscov_inv_sqrt.get(row_names=obs_diff.col_names, # col_names=obs_diff.col_names) * obs_diff.T).x) q = np.diagonal( self.obscov_inv_sqrt.get(row_names=obs_diff.col_names, col_names=obs_diff.col_names).x) phi_vec = [] for i in range(obs_diff.shape[0]): o = obs_diff.x[i, :] phi_vec.append(((obs_diff.x[i, :] * q)**2).sum()) return np.array(phi_vec) def _phi_report(self, phi_vec, cur_lam): self.phi_csv.write("{0},{1},{2},{3},{4},{5},{6}".format( self.iter_num, self.total_runs, cur_lam, phi_vec.min(), phi_vec.max(), phi_vec.mean(), np.median(phi_vec), phi_vec.std())) self.phi_csv.write(",".join( ["{0:20.8}".format(phi) for phi in phi_vec])) self.phi_csv.write("\n") self.phi_csv.flush() def _get_residual_matrix(self, obsensemble): obs_matrix = obsensemble.nonzero.as_pyemu_matrix() return obs_matrix - self.obs0_matrix.get( col_names=obs_matrix.col_names, row_names=obs_matrix.row_names) def update(self, lambda_mults=[1.0], localizer=None, run_subset=None, use_approx=True): if run_subset is 
not None: if run_subset >= self.obsensemble.shape[0]: self.logger.warn("run_subset ({0}) >= num of active reals ({1})...ignoring ".\ format(run_subset,self.obsensemble.shape[0])) run_subset = None self.iter_num += 1 self.logger.log("iteration {0}".format(self.iter_num)) self.logger.statement("{0} active realizations".format( self.obsensemble.shape[0])) if self.obsensemble.shape[0] < 2: self.logger.lraise( "at least active 2 realizations (really like 300) are needed to update" ) if not self.__initialized: #raise Exception("must call initialize() before update()") self.logger.lraise("must call initialize() before update()") self.logger.log("calculate scaled delta obs") scaled_delta_obs = self._calc_delta_obs(self.obsensemble) self.logger.log("calculate scaled delta obs") self.logger.log("calculate scaled delta par") scaled_delta_par = self._calc_delta_par(self.parensemble) self.logger.log("calculate scaled delta par") self.logger.log("calculate pseudo inv comps") u, s, v = scaled_delta_obs.pseudo_inv_components() self.logger.log("calculate pseudo inv comps") self.logger.log("calculate obs diff matrix") obs_diff = self.obscov_inv_sqrt * self._get_residual_matrix( self.obsensemble).T self.logger.log("calculate obs diff matrix") # here is the math part...calculate upgrade matrices mean_lam, std_lam, paren_lam, obsen_lam = [], [], [], [] lam_vals = [] for ilam, cur_lam_mult in enumerate(lambda_mults): parensemble_cur_lam = self.parensemble.copy() #print(parensemble_cur_lam.isnull().values.any()) cur_lam = self.current_lambda * cur_lam_mult lam_vals.append(cur_lam) self.logger.log("calcs for lambda {0}".format(cur_lam_mult)) scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0) scaled_ident += s**2 scaled_ident = scaled_ident.inv # build up this matrix as a single element so we can apply # localization self.logger.log("building upgrade_1 matrix") upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_delta_par) *\ v * s * scaled_ident * u.T self.logger.log("building upgrade_1 
matrix") # apply localization if localizer is not None: self.logger.log("applying localization") upgrade_1.hadamard_product(localizer) self.logger.log("applying localization") # apply residual information self.logger.log("applying residuals") upgrade_1 *= obs_diff self.logger.log("applying residuals") self.logger.log("processing upgrade_1") upgrade_1 = upgrade_1.to_dataframe() upgrade_1.index.name = "parnme" upgrade_1 = upgrade_1.T upgrade_1.index = [int(i) for i in upgrade_1.index] upgrade_1.to_csv(self.pst.filename+".upgrade_1.{0:04d}.csv".\ format(self.iter_num)) if upgrade_1.isnull().values.any(): self.logger.lraise("NaNs in upgrade_1") self.logger.log("processing upgrade_1") #print(upgrade_1.isnull().values.any()) #print(parensemble_cur_lam.index) #print(upgrade_1.index) parensemble_cur_lam += upgrade_1 # parameter-based upgrade portion if not use_approx and self.iter_num > 1: self.logger.log("building upgrade_2 matrix") par_diff = (self.parensemble - self.parensemble_0.loc[self.parensemble.index,:]).\ as_pyemu_matrix().T x4 = self.Am.T * self.half_parcov_diag * par_diff x5 = self.Am * x4 x6 = scaled_delta_par.T * x5 x7 = v * scaled_ident * v.T * x6 upgrade_2 = -1.0 * (self.half_parcov_diag * scaled_delta_par * x7).to_dataframe() upgrade_2.index.name = "parnme" upgrade_2 = upgrade_2.T upgrade_2.to_csv(self.pst.filename+".upgrade_2.{0:04d}.csv".\ format(self.iter_num)) upgrade_2.index = [int(i) for i in upgrade_2.index] if upgrade_2.isnull().values.any(): self.logger.lraise("NaNs in upgrade_2") parensemble_cur_lam += upgrade_2 self.logger.log("building upgrade_2 matrix") parensemble_cur_lam.enforce(self.enforce_bounds) # this is for testing failed runs on upgrade testing # works with the 10par_xsec smoother test #parensemble_cur_lam.iloc[:,:] = -1000000.0 paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :])) self.logger.log("calcs for lambda {0}".format(cur_lam_mult)) # subset if needed # and combine lambda par ensembles into one par ensemble for 
evaluation if run_subset is not None and run_subset < self.parensemble.shape[0]: #subset_idx = ["{0:d}".format(i) for i in np.random.randint(0,self.parensemble.shape[0]-1,run_subset)] subset_idx = self.parensemble.iloc[:run_subset, :].index.values self.logger.statement("subset idxs: " + ','.join([str(s) for s in subset_idx])) paren_lam_subset = [pe.loc[subset_idx, :] for pe in paren_lam] paren_combine = pd.concat(paren_lam_subset, ignore_index=True) paren_lam_subset = None else: subset_idx = self.parensemble.index.values paren_combine = pd.concat(paren_lam, ignore_index=True) self.logger.log("evaluating ensembles for lambdas : {0}".\ format(','.join(["{0:8.3E}".format(l) for l in lam_vals]))) failed_runs, obsen_combine = self._calc_obs(paren_combine) #if failed_runs is not None: # obsen_combine.loc[failed_runs,:] = np.NaN self.logger.log("evaluating ensembles for lambdas : {0}".\ format(','.join(["{0:8.3E}".format(l) for l in lam_vals]))) paren_combine = None if failed_runs is not None and len( failed_runs) == obsen_combine.shape[0]: self.logger.lraise("all runs failed - cannot continue") # unpack lambda obs ensembles from combined obs ensemble nrun_per_lam = self.obsensemble.shape[0] if run_subset is not None: nrun_per_lam = run_subset obsen_lam = [] for i in range(len(lam_vals)): sidx = i * nrun_per_lam eidx = sidx + nrun_per_lam oe = ObservationEnsemble.from_dataframe( df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst) oe.index = subset_idx # check for failed runs in this set - drop failed runs from obs ensembles if failed_runs is not None: failed_runs_this = np.array( [f for f in failed_runs if f >= sidx and f < eidx]) - sidx if len(failed_runs_this) > 0: if len(failed_runs_this) == oe.shape[0]: self.logger.warn( "all runs failed for lambda {0}".format( lam_vals[i])) else: self.logger.warn("{0} run failed for lambda {1}".\ format(len(failed_runs_this),lam_vals[i])) oe.iloc[failed_runs_this, :] = np.NaN oe = oe.dropna() obsen_lam.append(oe) obsen_combine 
= None # here is where we need to select out the "best" lambda par and obs # ensembles self.logger.statement("\n**************************") self.logger.statement(str(datetime.now())) self.logger.statement("total runs:{0}".format(self.total_runs)) self.logger.statement("iteration: {0}".format(self.iter_num)) self.logger.statement("current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".\ format(self.current_lambda, self.last_best_mean,self.last_best_std)) phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam] mean_std = [(pv.mean(), pv.std()) for pv in phi_vecs] update_pars = False update_lambda = False # accept a new best if its within 10% best_mean = self.last_best_mean * 1.1 best_std = self.last_best_std * 1.1 best_i = 0 for i, (m, s) in enumerate(mean_std): self.logger.statement(" tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".\ format(self.current_lambda * lambda_mults[i],m,s)) if m < best_mean: update_pars = True best_mean = m best_i = i if s < best_std: update_lambda = True best_std = s if np.isnan(best_mean): self.logger.lraise("best mean = NaN") if np.isnan(best_std): self.logger.lraise("best std = NaN") if not update_pars: self.current_lambda *= max(lambda_mults) * 10.0 self.current_lambda = min(self.current_lambda, 100000) self.logger.statement("not accepting iteration, increased lambda:{0}".\ format(self.current_lambda)) else: self.parensemble = ParameterEnsemble.from_dataframe( df=paren_lam[best_i], pst=self.pst) if run_subset is not None: failed_runs, self.obsensemble = self._calc_obs( self.parensemble) if failed_runs is not None: self.logger.warn("dropping failed realizations") self.parensemble = self.parensemble.drop(failed_runs) self.obsensemble = self.obsensemble.drop(failed_runs) self.current_phi_vec = self._calc_phi_vec(self.obsensemble) self._phi_report(self.current_phi_vec, self.current_lambda * lambda_mults[best_i]) best_mean = self.current_phi_vec.mean() best_std = self.current_phi_vec.std() else: self.obsensemble = 
obsen_lam[best_i] # reindex parensemble in case failed runs self.parensemble = ParameterEnsemble.from_dataframe( df=self.parensemble.loc[self.obsensemble.index], pst=self.pst) self._phi_report(phi_vecs[best_i], self.current_lambda * lambda_mults[best_i]) self.current_phi_vec = phi_vecs[best_i] self.logger.statement(" best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".\ format(self.current_lambda*lambda_mults[best_i], best_mean,best_std)) self.last_best_mean = best_mean self.last_best_std = best_std if update_lambda: # be aggressive self.current_lambda *= (lambda_mults[best_i] * 0.75) # but don't let lambda get too small self.current_lambda = max(self.current_lambda, 0.001) self.logger.statement("updating lambda: {0:15.6G}".\ format(self.current_lambda )) self.logger.statement("**************************\n") self.parensemble.to_csv(self.pst.filename+self.paren_prefix.\ format(self.iter_num)) self.obsensemble.to_csv(self.pst.filename+self.obsen_prefix.\ format(self.iter_num)) if self.raw_sweep_out is not None: self.raw_sweep_out.to_csv(self.pst.filename+"_raw{0}".\ format(self.iter_num)) self.logger.log("iteration {0}".format(self.iter_num))
def draw(
        self,
        num_reals=1,
        par_file=None,
        obs=False,
        enforce_bounds=None,
        cov=None,
        how="gaussian",
):
    """Draw parameter realizations and, optionally, observation
    realizations, storing them on self.parensemble / self.obsensemble.

    Parameters
    ----------
    num_reals : int
        number of realizations to generate
    par_file : str
        optional parameter file; when not None it is handed to
        self.pst.parrep() before any drawing happens. Default is None
    obs : bool
        when truthy, self.obsensemble is replaced with the result of
        ObservationEnsemble.from_id_gaussian_draw(). Default is False
    enforce_bounds : str
        when not None, passed to self.parensemble.enforce() after the
        parameter draw (the original documentation lists 'reset', 'drop'
        or None as options). Default is None
    cov : Cov
        optional covariance used for the gaussian draw; falls back to
        self.parcov when None.  Cannot be combined with how="uniform"
    how : str
        distribution to draw from, case-insensitive and stripped of
        surrounding whitespace.  Must be "gaussian" or "uniform".
        Default is "gaussian"

    Raises
    ------
    AssertionError
        if `how` is not a recognized distribution, or `cov` is not a Cov
    Exception
        if `cov` is passed together with how="uniform"

    Example
    -------
    ``>>>import pyemu``

    ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

    ``>>>mc.draw(1000)``

    """
    if par_file is not None:
        self.pst.parrep(par_file)

    how = how.lower().strip()
    assert how in ["gaussian", "uniform"]

    if cov is None:
        # no user covariance: use the one carried by the analysis
        cov = self.parcov
    else:
        assert isinstance(cov, Cov)
        if how == "uniform":
            raise Exception("MonteCarlo.draw() error: 'how'='uniform'," +
                            " 'cov' arg cannot be passed")

    # the same message is logged before and after the parameter draw
    par_msg = "generating {0:d} parameter realizations".format(num_reals)
    self.log(par_msg)
    if how == "uniform":
        self.parensemble = ParameterEnsemble.from_uniform_draw(
            pst=self.pst, num_reals=num_reals)
    elif how == "gaussian":
        # bounds are deliberately not enforced inside the draw itself;
        # enforcement (if requested) happens just below
        self.parensemble = ParameterEnsemble.from_gaussian_draw(
            pst=self.pst,
            cov=cov,
            num_reals=num_reals,
            use_homegrown=True,
            enforce_bounds=False,
        )
    else:
        raise Exception(
            "MonteCarlo.draw(): unrecognized 'how' arg: {0}".format(how))

    if enforce_bounds is not None:
        self.parensemble.enforce(enforce_bounds)
    self.log(par_msg)

    if not obs:
        return

    obs_msg = "generating {0:d} observation realizations".format(num_reals)
    self.log(obs_msg)
    self.obsensemble = ObservationEnsemble.from_id_gaussian_draw(
        pst=self.pst, num_reals=num_reals)
    self.log(obs_msg)
class EnsembleSmoother():
    """Prototype ensemble smoother driven by a pest control file.

    The object holds a parameter ensemble and an observation ensemble,
    propagates the parameter ensemble forward with an external ``sweep``
    run and applies a single upgrade to the parameter ensemble.

    Parameters
    ----------
    pst : Pst
        control file instance
    parcov : Cov
        parameter covariance.  If None, it is built with
        Cov.from_parameter_data(pst)
    obscov : Cov
        observation noise covariance.  If None, it is built with
        Cov.from_observation_data(pst)

    """

    def __init__(self, pst, parcov=None, obscov=None):
        assert isinstance(pst, Pst)
        self.pst = pst
        if parcov is not None:
            assert isinstance(parcov, Cov)
        else:
            parcov = Cov.from_parameter_data(self.pst)
        if obscov is not None:
            assert isinstance(obscov, Cov)
        else:
            obscov = Cov.from_observation_data(pst)

        self.parcov = parcov
        self.obscov = obscov

        # update() refuses to run until initialize() flips this flag
        self.__initialized = False
        self.num_reals = 0
        self.half_parcov_diag = None
        # never assigned anywhere else in this class
        self.half_obscov_diag = None
        self.delta_par_prior = None
        self.iter_num = 0

    def initialize(self, num_reals):
        """(re)initialize the process: draw the parameter and observation
        ensembles and cache the scaling matrices.

        Parameters
        ----------
        num_reals : int
            number of realizations to draw

        """
        self.num_reals = int(num_reals)
        self.parensemble = ParameterEnsemble(self.pst)
        self.parensemble.draw(cov=self.parcov, num_reals=num_reals)

        self.obsensemble_0 = ObservationEnsemble(self.pst)
        self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
        self.obsensemble = self.obsensemble_0.copy()

        if self.parcov.isdiagonal:
            self.half_parcov_diag = self.parcov.inv.sqrt
        else:
            # only the diagonal of a full parcov is used for the scaling
            self.half_parcov_diag = Cov(x=np.diag(self.parcov.x),
                                        names=self.parcov.col_names,
                                        isdiagonal=True).inv.sqrt
        # the observation scaling (self.obscov.inv.sqrt) is not cached
        # here; it is evaluated where it is needed

        self.delta_par_prior = self._calc_delta_par()
        self.__initialized = True

    def _calc_delta_par(self):
        """calc the scaled parameter ensemble differences from the mean

        Returns
        -------
        the product self.half_parcov_diag * (parensemble - mean).T,
        scaled by 1 / sqrt(num_reals - 1)

        """
        mean = np.array(self.parensemble.mean(axis=0))
        delta = self.parensemble.as_pyemu_matrix()
        # only the first self.num_reals rows are centered
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = self.half_parcov_diag * delta.T
        return delta * (1.0 / np.sqrt(float(self.num_reals - 1.0)))

    def _calc_delta_obs(self):
        """calc the scaled observation ensemble differences from the mean

        Returns
        -------
        the product self.obscov.inv.sqrt * (obsensemble - mean).T,
        scaled by 1 / sqrt(num_reals - 1)

        """
        mean = np.array(self.obsensemble.mean(axis=0))
        delta = self.obsensemble.as_pyemu_matrix()
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = self.obscov.inv.sqrt * delta.T
        return delta * (1.0 / np.sqrt(float(self.num_reals - 1.0)))

    def _calc_obs(self):
        """propagate the ensemble forward...

        Writes the parameter ensemble to smoother/sweep_in.csv, runs
        ``sweep freyberg.pst`` from inside the ``smoother`` directory and
        loads smoother/sweep_out.csv into self.obsensemble (restricted to
        the row names of self.obscov).

        """
        self.parensemble.to_csv(os.path.join("smoother", "sweep_in.csv"))
        # remember where we started so the working directory is restored
        # even if something inside the run directory raises
        start_dir = os.getcwd()
        os.chdir("smoother")
        try:
            print(os.listdir('.'))
            # NOTE(review): the exit status of the sweep run is ignored, so
            # a failed run would be followed by reading whatever
            # sweep_out.csv is already on disk - confirm this is intended
            os.system("sweep freyberg.pst")
        finally:
            os.chdir(start_dir)
        obs = ObservationEnsemble.from_csv(
            os.path.join("smoother", 'sweep_out.csv'))
        obs.columns = [item.lower() for item in obs.columns]
        self.obsensemble = ObservationEnsemble.from_dataframe(
            df=obs.loc[:, self.obscov.row_names], pst=self.pst)
        # todo: modify sweep to be interactive...
        return

    @property
    def current_lambda(self):
        """the (currently hard-coded) lambda value"""
        return 10.0

    def update(self):
        """run the parameter ensemble forward and apply one upgrade to
        self.parensemble.

        Raises
        ------
        Exception
            if initialize() has not been called
        NotImplementedError
            if self.iter_num > 0 (raised after the upgrade was applied)

        """
        if not self.__initialized:
            raise Exception("must call initialize() before update()")
        self._calc_obs()
        delta_obs = self._calc_delta_obs()
        u, s, v = delta_obs.pseudo_inv_components()

        # difference between the simulated ensemble and the noisy
        # observation ensemble drawn in initialize()
        diff = self.obsensemble.as_pyemu_matrix() - \
            self.obsensemble_0.as_pyemu_matrix()
        x1 = u.T * self.obscov.inv.sqrt * diff.T
        x1.autoalign = False
        x2 = (Cov.identity_like(s) + s**2).inv * x1
        x3 = v * s * x2
        upgrade_1 = (self.half_parcov_diag * self._calc_delta_par() *
                     x3).to_dataframe()
        upgrade_1.index.name = "parnme"
        print(upgrade_1)
        self.parensemble += upgrade_1.T
        print(self.parensemble)
        if self.iter_num > 0:
            raise NotImplementedError()
        print(upgrade_1.shape)
class MonteCarlo(LinearAnalysis):
    """LinearAnalysis derived type for monte carlo analysis

    Note: requires a pest control file, which can be
    derived from a jco argument

    MonteCarlo.project_parensemble also requires a jacobian

    """

    def __init__(self, **kwargs):
        super(MonteCarlo, self).__init__(**kwargs)
        assert self.pst is not None, \
            "monte carlo requires a pest control file"
        # start with empty ensembles; draw() fills them
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

    @property
    def num_reals(self):
        """number of rows in the parameter ensemble"""
        return self.parensemble.shape[0]

    def get_nsing(self, epsilon=1.0e-4):
        """get the number of solution space dimensions given
        a ratio between the largest and smallest singular values

        Parameters:
        ----------
            epsilon: ratio threshold applied to the singular values of
                self.xtqx after normalizing by the largest one
        Returns:
        -------
            integer (or None): number of singular components above the
                epsilon ratio threshold.  If that number equals
                self.xtqx.shape[0], a warning is logged and None is
                returned
        """
        mx = self.xtqx.shape[0]
        # normalized singular values, sorted ascending so searchsorted
        # counts how many fall below epsilon
        normalized = np.sort((self.xtqx.s.x / self.xtqx.s.x.max())[:, 0])
        nsing = mx - np.searchsorted(normalized, epsilon)
        if nsing == mx:
            self.logger.warn("optimal nsing=npar")
            nsing = None
        return nsing

    def get_null_proj(self, nsing=None):
        """get a null-space projection matrix of XTQX

        Parameters:
        ----------
            nsing: optional number of singular components to use
                   if none, call self.get_nsing()
        Returns:
        -------
            Matrix instance : V2V2^T
        Raises:
        ------
            Exception if nsing is None and self.get_nsing() also
            returns None
        """
        if nsing is None:
            nsing = self.get_nsing()
        if nsing is None:
            raise Exception("nsing is None")
        print("using {0} singular components".format(nsing))
        msg = "forming null space projection matrix with " +\
              "{0} of {1} singular components".format(nsing,
                                                      self.jco.shape[1])
        self.log(msg)
        # columns of V beyond the first nsing
        v2 = self.xtqx.v[:, nsing:]
        v2_proj = (v2 * v2.T)
        self.log(msg)
        return v2_proj

    def draw(self, num_reals=1, par_file=None, obs=False,
             enforce_bounds=False, cov=None, how="gaussian"):
        """draw stochastic realizations of parameters and
           optionally observations

        Parameters:
        ----------
            num_reals (int): number of realization to generate
            par_file (str): parameter file passed to self.pst.parrep()
                before drawing
            obs (bool): also draw the observation ensemble using
                self.obscov
            enforce_bounds (bool): call self.parensemble.enforce()
                after the draw
            cov (Cov): covariance to draw with; self.parcov is used
                when None.  Cannot be passed with how="uniform"
            how (str): type of distribution.
                Must be in ["gaussian","uniform"]
        Returns:
            None
        Raises:
            AssertionError for an unrecognized 'how' or a 'cov' that is
            not a Cov; Exception if 'cov' is passed with how="uniform"
        """
        if par_file is not None:
            self.pst.parrep(par_file)
        how = how.lower().strip()
        assert how in ["gaussian", "uniform"]

        if cov is not None:
            assert isinstance(cov, Cov)
            if how == "uniform":
                raise Exception("MonteCarlo.draw() error: 'how'='uniform'," +\
                                " 'cov' arg cannot be passed")
        else:
            cov = self.parcov

        # fresh ensembles for every draw
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

        par_msg = "generating {0:d} parameter realizations".format(num_reals)
        self.log(par_msg)
        self.parensemble.draw(cov, num_reals=num_reals, how=how)
        if enforce_bounds:
            self.parensemble.enforce()
        self.log(par_msg)

        if obs:
            obs_msg = "generating {0:d} observation realizations".\
                format(num_reals)
            self.log(obs_msg)
            self.obsensemble.draw(self.obscov, num_reals=num_reals)
            self.log(obs_msg)

    def project_parensemble(self, par_file=None, nsing=None,
                            inplace=True):
        """perform the null-space projection operations for null-space
            monte carlo

        Parameters:
        ----------
            par_file: str
                an optional file of parameter values to use
            nsing: int
                number of singular values to in forming null subspace
                matrix
            inplace: bool
                overwrite the existing parameter ensemble with the
                projected values
        Returns:
        -------
            the value returned by self.parensemble.project(); per the
            original documentation this is a ParameterEnsemble instance
            if inplace is False, otherwise None
        """
        assert self.jco is not None, "MonteCarlo.project_parensemble() " +\
                                     "requires a jacobian attribute"
        if par_file is not None:
            assert os.path.exists(par_file), \
                "MonteCarlo.project_parensemble() error: " +\
                "par_file not found:" + par_file
            self.parensemble.pst.parrep(par_file)

        # project the ensemble
        self.log("projecting parameter ensemble")
        en = self.parensemble.project(self.get_null_proj(nsing),
                                      inplace=inplace, log=self.log)
        self.log("projecting parameter ensemble")
        return en

    def write_psts(self, prefix, existing_jco=None, noptmax=None):
        """write parameter and optionally observation realizations
            to pest control files

        Parameters:
        ----------
            prefix: str
                pest control file prefix; file i is written to
                prefix + "{i}.pst"
            existing_jco: str
                filename of an existing jacobian matrix to add to the
                pest++ options in the control file.  This is useful for
                NSMC since this jco can be used to get the first set of
                parameter upgrades for free!  Needs to be the path
                the jco file as seen from the location where pest++
                will be run
            noptmax: int
                value of NOPTMAX to set in new pest control files;
                left untouched when None
        Returns:
        -------
            None
        """
        self.log("writing realized pest control files")
        # get a copy of the pest control file
        pst = self.pst.get(par_names=self.pst.par_names,
                           obs_names=self.pst.obs_names)

        if noptmax is not None:
            pst.control_data.noptmax = noptmax

        if existing_jco is not None:
            pst.pestpp_options["BASE_JACOBIAN"] = existing_jco

        # set the indices
        pst.parameter_data.index = pst.parameter_data.parnme
        pst.observation_data.index = pst.observation_data.obsnme

        # parval1 is filled from the back-transformed ensemble when the
        # ensemble reports that it is transformed
        if self.parensemble.istransformed:
            par_en = self.parensemble._back_transform(inplace=False)
        else:
            par_en = self.parensemble

        for i in range(self.num_reals):
            pst_name = prefix + "{0:d}.pst".format(i)
            self.log("writing realized pest control file " + pst_name)
            pst.parameter_data.loc[par_en.columns, "parval1"] = \
                par_en.iloc[i, :].T

            # reset the regularization
            if pst.control_data.pestmode == "regularization":
                pst.zero_order_tikhonov(parbounds=True)

            # add the obs noise realization if needed
            if self.obsensemble.shape[0] == self.num_reals:
                pst.observation_data.loc[self.obsensemble.columns,
                                         "obsval"] = \
                    self.obsensemble.iloc[i, :].T

            # write
            pst.write(pst_name)
            self.log("writing realized pest control file " + pst_name)
        self.log("writing realized pest control files")
def update(self,
           lambda_mults=[1.0],
           localizer=None,
           run_subset=None,
           use_approx=True,
           calc_only=False):
    """update the iES one GLM cycle

    Parameters
    ----------
    lambda_mults : list
        a list of lambda multipliers to test.  Each lambda mult value
        will require evaluating (a subset of) the parameter ensemble.
        The list is only read, never modified.
    localizer : pyemu.Matrix
        a jacobian localizing matrix
    run_subset : int
        the number of realizations to test for each lambda_mult value.
        For example, if run_subset = 30 and num_reals=100, the first 30
        realizations will be run (in parallel) for each lambda_mult
        value.  Then the best lambda_mult is selected and the remaining
        70 realizations for that lambda_mult value are run (in
        parallel).  Ignored (with a warning) when it is >= the number of
        active realizations.
    use_approx : bool
        a flag to use the MLE or MAP upgrade solution.  True indicates
        use MLE solution
    calc_only : bool
        a flag to calculate the upgrade matrix only (not run the
        ensemble).  This is mostly for debugging and testing on travis.
        Default is False

    Returns
    -------
    None

    Notes
    -----
    Paired self.logger.log() calls with identical text bracket each
    stage (presumably start/finish markers of the project logger).
    Errors are reported through self.logger.lraise(), except for the
    'dropped too many realizations' case which raises Exception directly.

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    ``>>>es.update(lambda_mults=[0.1,1.0,10.0],run_subset=30)``

    """
    if run_subset is not None:
        if run_subset >= self.obsensemble.shape[0]:
            self.logger.warn("run_subset ({0}) >= num of active reals ({1})...ignoring ".\
                             format(run_subset, self.obsensemble.shape[0]))
            run_subset = None

    self.iter_num += 1
    # prefix for the optional matrix dumps written when self.save_mats
    mat_prefix = self.pst.filename.replace('.pst', '') + ".{0}".format(
        self.iter_num)
    self.logger.log("iteration {0}".format(self.iter_num))
    self.logger.statement("{0} active realizations".format(
        self.obsensemble.shape[0]))
    if self.obsensemble.shape[0] < 2:
        self.logger.lraise(
            "at least active 2 realizations (really like 300) are needed to update"
        )
    if not self._initialized:
        self.logger.lraise("must call initialize() before update()")

    self.logger.log("calculate scaled delta obs")
    scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
    self.logger.log("calculate scaled delta obs")

    self.logger.log("calculate scaled delta par")
    scaled_delta_par = self._calc_delta_par(self.parensemble)
    self.logger.log("calculate scaled delta par")

    self.logger.log("calculate pseudo inv comps")
    u, s, v = scaled_delta_obs.pseudo_inv_components(
        eigthresh=self.pst.svd_data.eigthresh)
    s.col_names = s.row_names
    self.logger.log("calculate pseudo inv comps")

    self.logger.log("calculate obs diff matrix")
    obs_diff = self.obscov_inv_sqrt * self.phi.get_residual_obs_matrix(
        self.obsensemble).T
    self.logger.log("calculate obs diff matrix")

    if self.save_mats:
        np.savetxt(mat_prefix + ".obs_diff.dat", scaled_delta_obs.x,
                   fmt="%15.6e")
        np.savetxt(mat_prefix + ".par_diff.dat", scaled_delta_par.x,
                   fmt="%15.6e")
        np.savetxt(mat_prefix + ".u.dat", u.x, fmt="%15.6e")
        np.savetxt(mat_prefix + ".s.dat", s.x, fmt="%15.6e")
        np.savetxt(mat_prefix + ".v.dat", v.x, fmt="%15.6e")

    # here is the math part...calculate upgrade matrices
    paren_lam = []   # one upgraded parameter DataFrame per lambda
    lam_vals = []    # the actual lambda value tested for each entry
    for cur_lam_mult in lambda_mults:
        parensemble_cur_lam = self.parensemble.copy()

        cur_lam = self.current_lambda * cur_lam_mult
        lam_vals.append(cur_lam)
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

        # ((cur_lam + 1) * I + s**2)^-1
        scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
        scaled_ident += s**2
        scaled_ident = scaled_ident.inv

        # build up this matrix as a single element so we can apply
        # localization
        self.logger.log("building upgrade_1 matrix")
        upgrade_1 = -1.0 * (self.parcov_inv_sqrt * scaled_delta_par) *\
            v * s * scaled_ident * u.T
        if self.save_mats:
            # written once per lambda to the same file name
            np.savetxt(mat_prefix + ".ivec.dat",
                       scaled_ident.x, fmt="%15.6e")
        self.logger.log("building upgrade_1 matrix")

        # apply localization
        if localizer is not None:
            self.logger.log("applying localization")
            # NOTE(review): the return value is discarded - if
            # hadamard_product() returns a new matrix rather than
            # working in place, localization has no effect here; confirm
            upgrade_1.hadamard_product(localizer)
            self.logger.log("applying localization")

        # apply residual information
        self.logger.log("applying residuals")
        upgrade_1 *= obs_diff
        self.logger.log("applying residuals")

        self.logger.log("processing upgrade_1")
        if self.save_mats:
            np.savetxt(mat_prefix + ".upgrade_1.dat", upgrade_1.T.x,
                       fmt="%15.6e")
        upgrade_1 = upgrade_1.to_dataframe()
        upgrade_1.index.name = "parnme"
        upgrade_1 = upgrade_1.T
        upgrade_1.index = [int(i) for i in upgrade_1.index]
        upgrade_1.to_csv(self.pst.filename+".upgrade_1.{0:04d}.csv".\
                         format(self.iter_num))
        if upgrade_1.isnull().values.any():
            self.logger.lraise("NaNs in upgrade_1")
        self.logger.log("processing upgrade_1")

        parensemble_cur_lam += upgrade_1

        # parameter-based upgrade portion
        if not use_approx and self.iter_num > 1:
            self.logger.log("building upgrade_2 matrix")
            par_diff = (self.parensemble - self.parensemble_0.loc[self.parensemble.index, :]).\
                as_pyemu_matrix().T
            x4 = self.Am.T * self.parcov_inv_sqrt * par_diff
            x5 = self.Am * x4
            x6 = scaled_delta_par.T * x5
            x7 = v * scaled_ident * v.T * x6
            ug2_mat = -1.0 * (self.parcov_inv_sqrt *
                              scaled_delta_par * x7)
            upgrade_2 = ug2_mat.to_dataframe()
            upgrade_2.index.name = "parnme"
            upgrade_2 = upgrade_2.T
            upgrade_2.to_csv(self.pst.filename+".upgrade_2.{0:04d}.csv".\
                             format(self.iter_num))
            upgrade_2.index = [int(i) for i in upgrade_2.index]

            if self.save_mats:
                np.savetxt(mat_prefix + ".scaled_par_resid.dat",
                           par_diff.x, fmt="%15.6e")
                np.savetxt(mat_prefix + ".x4.dat", x4.x, fmt="%15.6e")
                np.savetxt(mat_prefix + ".x5.dat", x5.x, fmt="%15.6e")
                np.savetxt(mat_prefix + ".x6.dat", x6.x, fmt="%15.6e")
                np.savetxt(mat_prefix + ".x7.dat", x7.x, fmt="%15.6e")
                np.savetxt(mat_prefix + ".upgrade_2.dat", ug2_mat.T.x,
                           fmt="%15.6e")

            if upgrade_2.isnull().values.any():
                self.logger.lraise("NaNs in upgrade_2")

            parensemble_cur_lam += upgrade_2
            self.logger.log("building upgrade_2 matrix")

        self.logger.log("enforcing bounds")
        parensemble_cur_lam.enforce(self.enforce_bounds)
        self.logger.log("enforcing bounds")

        self.logger.log("filling fixed parameters")
        # fill in fixed pars with initial values (log10 of parval1 for
        # the entries selected by the ensemble's log_indexer)
        fi = parensemble_cur_lam.fixed_indexer
        li = parensemble_cur_lam.log_indexer
        log_values = self.pst.parameter_data.loc[:, "parval1"].copy()
        log_values.loc[li] = log_values.loc[li].apply(np.log10)
        fixed_vals = log_values.loc[fi]
        for fname, fval in zip(fixed_vals.index, fixed_vals.values):
            parensemble_cur_lam.loc[:, fname] = fval
        self.logger.log("filling fixed parameters")

        # this is for testing failed runs on upgrade testing
        # works with the 10par_xsec smoother test
        #parensemble_cur_lam.iloc[:,:] = -1000000.0

        # some hackery - we lose track of the transform flag here, but
        # just know it is transformed.  Need to create dataframe here
        # because pd.concat doesn't like par ensembles later
        paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :]))
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

    if calc_only:
        return

    # subset if needed
    # and combine lambda par ensembles into one par ensemble for evaluation
    if run_subset is not None and run_subset < self.parensemble.shape[0]:
        # the subset is simply the first run_subset realizations
        subset_idx = self.parensemble.iloc[:run_subset, :].index.values
        self.logger.statement("subset idxs: " +
                              ','.join([str(sid) for sid in subset_idx]))

        # more tracking of transformed - just know it!  Creating
        # dataframes...
        paren_lam_subset = [pe.loc[subset_idx, :] for pe in paren_lam]
        paren_combine = pd.concat(paren_lam_subset, ignore_index=True)
        paren_lam_subset = None
    else:
        subset_idx = self.parensemble.index.values
        paren_combine = pd.concat(paren_lam, ignore_index=True)

    eval_msg = "evaluating ensembles for lambdas : {0}".\
        format(','.join(["{0:8.3E}".format(l) for l in lam_vals]))
    self.logger.log(eval_msg)
    # back to par ensemble and know it is transformed
    paren_combine = ParameterEnsemble.from_dataframe(df=paren_combine,
                                                     pst=self.pst,
                                                     istransformed=True)
    failed_runs, obsen_combine = self._calc_obs(paren_combine)
    self.logger.log(eval_msg)
    paren_combine = None

    if failed_runs is not None and len(
            failed_runs) == obsen_combine.shape[0]:
        self.logger.lraise("all runs failed - cannot continue")

    # unpack lambda obs ensembles from combined obs ensemble
    nrun_per_lam = self.obsensemble.shape[0]
    if run_subset is not None:
        nrun_per_lam = run_subset
    obsen_lam = []

    for i in range(len(lam_vals)):
        sidx = i * nrun_per_lam
        eidx = sidx + nrun_per_lam
        oe = ObservationEnsemble.from_dataframe(
            df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst)
        oe.index = subset_idx

        # check for failed runs in this set - drop failed runs from obs
        # ensembles
        if failed_runs is not None:
            # positions of the failed runs relative to this lambda's slice
            failed_runs_this = np.array(
                [f for f in failed_runs if f >= sidx and f < eidx]) - sidx
            if len(failed_runs_this) > 0:
                if len(failed_runs_this) == oe.shape[0]:
                    self.logger.warn(
                        "all runs failed for lambda {0}".format(
                            lam_vals[i]))
                else:
                    self.logger.warn("{0} run failed for lambda {1}".\
                                     format(len(failed_runs_this), lam_vals[i]))
                oe.iloc[failed_runs_this, :] = np.nan
                oe = oe.dropna()
                paren_lam[i].iloc[failed_runs_this, :] = np.nan
                # flag the rebuilt ensemble as transformed through the
                # constructor, as the other from_dataframe() calls in
                # this method do
                paren_lam[i] = ParameterEnsemble.from_dataframe(
                    df=paren_lam[i].dropna(), pst=self.pst,
                    istransformed=True)

        # don't drop bad reals here, instead, mask bad reals in the
        # lambda selection and drop later
        obsen_lam.append(oe)
    obsen_combine = None

    # here is where we need to select out the "best" lambda par and obs
    # ensembles
    self.logger.log("calc lambda phi vectors")
    phi_vecs = [
        self.phi.get_meas_and_regul_phi(oe, pe.loc[oe.index, :])
        for oe, pe in zip(obsen_lam, paren_lam)
    ]
    self.logger.log("calc lambda phi vectors")

    if self.drop_bad_reals is not None:
        for i, (meas_pv, regul_pv) in enumerate(phi_vecs):
            # mask realizations whose measurement phi exceeds the
            # threshold in both vectors (meas_pv is modified in place)
            regul_pv = regul_pv.copy()
            regul_pv[meas_pv > self.drop_bad_reals] = np.nan
            regul_pv = regul_pv[~np.isnan(regul_pv)]
            meas_pv[meas_pv > self.drop_bad_reals] = np.nan
            meas_pv = meas_pv[~np.isnan(meas_pv)]
            if len(meas_pv) == 0:
                self.logger.warn(
                    "all realizations for lambda {0} marked as 'bad'".
                    format(lam_vals[i]))
                # np.zeros_like() of an integer gives a 0-d array, so
                # these act as scalar penalty values (mean 1e30)
                meas_pv = np.zeros_like(obsen_lam[0].shape[0]) + 1.0e+30
                regul_pv = np.zeros_like(obsen_lam[0].shape[0]) + 1.0e+30
            phi_vecs[i] = (meas_pv, regul_pv)

    mean_std_meas = [(pv[0].mean(), pv[0].std()) for pv in phi_vecs]
    mean_std_regul = [(pv[1].mean(), pv[1].std()) for pv in phi_vecs]
    update_pars = False
    update_lambda = False
    self.logger.statement("**************************")
    self.logger.statement("lambda testing summary")
    self.logger.statement("total runs:{0}".format(self.total_runs))
    self.logger.statement("iteration: {0}".format(self.iter_num))
    self.logger.statement("current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}". \
                          format(self.current_lambda,
                                 self.last_best_mean, self.last_best_std))

    # accept a new best if its within 10%
    best_mean = self.last_best_mean * 1.1
    best_std = self.last_best_std * 1.1
    best_i = 0
    for i, ((mm, ms), (rm, rs)) in enumerate(zip(mean_std_meas,
                                                 mean_std_regul)):
        self.logger.statement(
            " tested lambda:{0:15.6G}, meas mean:{1:15.6G}, meas std:{2:15.6G}"
            .format(self.current_lambda * lambda_mults[i], mm, ms))
        self.logger.statement("{0:30s}regul mean:{1:15.6G}, regul std:{2:15.6G}".\
                              format(' ', rm, rs))
        # composite statistics: measurement + weighted regularization
        comp_mean = mm + (self.regul_factor * rm)
        comp_std = ms + (self.regul_factor * rs)
        if comp_mean < best_mean:
            update_pars = True
            best_mean = comp_mean
            best_i = i
            # lambda is only adjusted when the std also improved
            if comp_std < best_std:
                update_lambda = True
                best_std = comp_std
    if np.isnan(best_mean):
        self.logger.lraise("best mean = NaN")
    if np.isnan(best_std):
        self.logger.lraise("best std = NaN")

    if not update_pars:
        self.current_lambda *= max(lambda_mults) * 10.0
        self.current_lambda = min(self.current_lambda, 100000)
        self.logger.statement("not accepting iteration, increased lambda:{0}".\
                              format(self.current_lambda))
    else:
        # more transformation status hard coding - ugly
        self.parensemble = ParameterEnsemble.from_dataframe(
            df=paren_lam[best_i], pst=self.pst, istransformed=True)
        if run_subset is not None:
            # only a subset was tested: evaluate the full ensemble for
            # the selected lambda
            failed_runs, self.obsensemble = self._calc_obs(
                self.parensemble)
            if failed_runs is not None:
                self.logger.warn("dropping failed realizations")
                self.parensemble.loc[failed_runs, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[failed_runs, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

            self.phi.update()
            best_mean = self.phi.comp_phi.mean()
            best_std = self.phi.comp_phi.std()
        else:
            self.obsensemble = obsen_lam[best_i]
            # reindex parensemble in case failed runs
            self.parensemble = ParameterEnsemble.from_dataframe(
                df=self.parensemble.loc[self.obsensemble.index],
                pst=self.pst,
                istransformed=self.parensemble.istransformed)
            self.phi.update()

        if self.drop_bad_reals is not None:
            drop_idx = np.argwhere(
                self.phi.comp_phi > self.drop_bad_reals).flatten()
            # translate positions into realization labels
            run_ids = self.obsensemble.index.values
            drop_idx = run_ids[drop_idx]
            if len(drop_idx) > self.obsensemble.shape[0] - 3:
                raise Exception("dropped too many realizations as 'bad'")
            if len(drop_idx) > 0:
                self.logger.warn("{0} realizations dropped as 'bad' (indices :{1})". \
                                 format(len(drop_idx), ','.join([str(d) for d in drop_idx])))
                self.parensemble.loc[drop_idx, :] = np.nan
                self.parensemble = self.parensemble.dropna()
                self.obsensemble.loc[drop_idx, :] = np.nan
                self.obsensemble = self.obsensemble.dropna()

                self.phi.update()
                best_mean = self.phi.comp_phi.mean()
                best_std = self.phi.comp_phi.std()

        self.phi.report(cur_lam=self.current_lambda * lambda_mults[best_i])

        self.logger.statement(" best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".\
                              format(self.current_lambda*lambda_mults[best_i],
                                     best_mean, best_std))
        self.last_best_mean = best_mean
        self.last_best_std = best_std

    if update_lambda:
        # be aggressive
        self.current_lambda *= (lambda_mults[best_i] * 0.75)
        # but don't let lambda get too small
        self.current_lambda = max(self.current_lambda, 0.00001)
        self.logger.statement("updating lambda: {0:15.6G}".\
                              format(self.current_lambda))

    self.logger.statement("**************************\n")
    self.parensemble.to_csv(self.pst.filename+self.paren_prefix.\
                            format(self.iter_num))
    self.obsensemble.to_csv(self.pst.filename+self.obsen_prefix.\
                            format(self.iter_num))
    if self.raw_sweep_out is not None:
        self.raw_sweep_out.to_csv(self.pst.filename+"_sweepraw{0}.csv".\
                                  format(self.iter_num))
    self.logger.log("iteration {0}".format(self.iter_num))
def initialize(self, num_reals=1, init_lambda=None, enforce_bounds="reset",
               parensemble=None, obsensemble=None, restart_obsensemble=None,
               regul_factor=0.0, use_approx_prior=True,
               build_empirical_prior=False):
    """Initialize the iES process.

    Depending on arguments, draws or loads the initial parameter and
    observation ensembles and (unless a restart observation ensemble is
    supplied) runs the initial parameter ensemble.

    Parameters
    ----------
    num_reals : int
        the number of realizations to draw.  Ignored if parensemble and
        obsensemble are both not None
    init_lambda : float
        the initial lambda to use.  If None, it is derived from the initial
        mean composite phi and the number of observation ensemble columns.
        During subsequent updates, lambda is updated according to upgrade
        success
    enforce_bounds : str
        how to enforce parameter bound transgression.  options are
        reset, drop, or None
    parensemble : pyemu.ParameterEnsemble or str
        a parameter ensemble or filename to use as the initial
        parameter ensemble.  Only used if obsensemble is also not None
    obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as the initial
        observation ensemble.  Only used if parensemble is also not None
    restart_obsensemble : pyemu.ObservationEnsemble or str
        an observation ensemble or filename to use as an
        evaluated observation ensemble.  If not None, this will skip the
        initial parameter ensemble evaluation - user beware!
    regul_factor : float
        the regularization penalty fraction of the composite objective.
        The purist, MAP solution would be regul_factor = 1.0, yielding
        equal parts measurement and regularization in the composite
        objective function.  Default is 0.0, which means only seek to
        minimize the measurement objective function
    use_approx_prior : bool
        if True, the prior parameter covariance matrix is not used to
        scale the upgrade calculation (``parcov_inv_sqrt`` is set to 1.0).
        If False, the inverse square root of the full prior parameter
        covariance matrix is formed and stored.  Default is True
    build_empirical_prior : bool
        flag to build the prior parameter covariance matrix from an
        existing parensemble.  If True and no existing ensembles are
        supplied, an exception is raised

    Example
    -------
    ``>>>import pyemu``

    ``>>>es = pyemu.EnsembleSmoother(pst="pest.pst")``

    ``>>>es.initialize(num_reals=100)``

    """
    self.enforce_bounds = enforce_bounds
    self.regul_factor = float(regul_factor)
    self.total_runs = 0

    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    if parensemble is not None and obsensemble is not None:
        self.logger.log("initializing with existing ensembles")

        if isinstance(parensemble, str):
            self.logger.log("loading parensemble from file")
            # check the parameter ensemble file itself (not the obs file)
            if not os.path.exists(parensemble):
                self.logger.lraise("can not find parensemble file: {0}".
                                   format(parensemble))
            df = pd.read_csv(parensemble, index_col=0)
            self.parensemble_0 = ParameterEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading parensemble from file")
        elif isinstance(parensemble, ParameterEnsemble):
            self.parensemble_0 = parensemble.copy()
        else:
            raise Exception("unrecognized arg type for parensemble, " +
                            "should be filename or ParameterEnsemble" +
                            ", not {0}".format(type(parensemble)))
        self.parensemble = self.parensemble_0.copy()

        if isinstance(obsensemble, str):
            self.logger.log("loading obsensemble from file")
            if not os.path.exists(obsensemble):
                self.logger.lraise("can not find obsensemble file: {0}".
                                   format(obsensemble))
            # keep only the non-zero-weight observation columns
            df = pd.read_csv(obsensemble, index_col=0).loc[
                :, self.pst.nnz_obs_names]
            self.obsensemble_0 = ObservationEnsemble.from_dataframe(
                df=df, pst=self.pst)
            self.logger.log("loading obsensemble from file")
        elif isinstance(obsensemble, ObservationEnsemble):
            self.obsensemble_0 = obsensemble.copy()
        else:
            raise Exception("unrecognized arg type for obsensemble, " +
                            "should be filename or ObservationEnsemble" +
                            ", not {0}".format(type(obsensemble)))

        assert self.parensemble_0.shape[0] == self.obsensemble_0.shape[0]
        num_reals = self.parensemble.shape[0]
        self.logger.log("initializing with existing ensembles")

        if build_empirical_prior:
            # replace the prior parcov with the ensemble covariance
            self.reset_parcov(self.parensemble.covariance_matrix())
            if self.save_mats:
                self.parcov.to_binary(self.pst.filename + ".empcov.jcb")
    else:
        if build_empirical_prior:
            self.logger.lraise(
                "can't use build_emprirical_prior without parensemble...")
        self.logger.log("initializing smoother with {0} realizations".
                        format(num_reals))

        self.logger.log("initializing parensemble")
        self.parensemble_0 = pyemu.ParameterEnsemble.from_gaussian_draw(
            self.pst, self.parcov, num_reals=num_reals)
        self.parensemble_0.enforce(enforce_bounds=enforce_bounds)
        self.logger.log("initializing parensemble")
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv(self.pst.filename +
                                  self.paren_prefix.format(0))

        self.logger.log("initializing obsensemble")
        self.obsensemble_0 = pyemu.ObservationEnsemble.from_id_gaussian_draw(
            self.pst, num_reals=num_reals)
        # save the base obsensemble
        self.obsensemble_0.to_csv(self.pst.filename +
                                  self.obsen_prefix.format(-1))
        self.logger.log("initializing obsensemble")
        self.logger.log("initializing smoother with {0} realizations".
                        format(num_reals))

    # form the prior scaling only once, after any empirical parcov reset
    if use_approx_prior:
        self.logger.statement("using approximate parcov in solution")
        self.parcov_inv_sqrt = 1.0
    else:
        self.logger.statement("using full parcov in solution")
        # Chen and Oliver use a low rank approx here, but so far,
        # I haven't needed it - not using enough parameters yet
        self.logger.log("forming inverse sqrt parcov matrix")
        self.parcov_inv_sqrt = self.parcov.inv.sqrt
        self.logger.log("forming inverse sqrt parcov matrix")

    if restart_obsensemble is not None:
        self.logger.log("loading restart_obsensemble {0}".
                        format(restart_obsensemble))
        failed_runs, self.obsensemble = self._load_obs_ensemble(
            restart_obsensemble)
        assert self.obsensemble.shape[0] == self.obsensemble_0.shape[0]
        assert list(self.obsensemble.columns) == list(
            self.obsensemble_0.columns)
        self.logger.log("loading restart_obsensemble {0}".
                        format(restart_obsensemble))
    else:
        # run the initial parameter ensemble
        self.logger.log("evaluating initial ensembles")
        failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
        self.obsensemble.to_csv(self.pst.filename +
                                self.obsen_prefix.format(0))
        if self.raw_sweep_out is not None:
            self.raw_sweep_out.to_csv(self.pst.filename + "_sweepraw0.csv")
        self.logger.log("evaluating initial ensembles")

    if failed_runs is not None:
        self.logger.warn("dropping failed realizations")
        # mark failed rows as NaN, then drop them from both ensembles
        self.parensemble.loc[failed_runs, :] = np.nan
        self.parensemble = self.parensemble.dropna()
        self.obsensemble.loc[failed_runs, :] = np.nan
        self.obsensemble = self.obsensemble.dropna()

    if not self.parensemble.istransformed:
        self.parensemble._transform(inplace=True)
    if not self.parensemble_0.istransformed:
        self.parensemble_0._transform(inplace=True)

    self.phi = Phi(self)

    if self.drop_bad_reals is not None:
        # positions of realizations whose measurement phi exceeds the limit
        drop_idx = np.argwhere(
            self.phi.meas_phi > self.drop_bad_reals).flatten()
        run_ids = self.obsensemble.index.values
        drop_idx = run_ids[drop_idx]
        if len(drop_idx) == self.obsensemble.shape[0]:
            raise Exception("dropped all realizations as 'bad'")
        if len(drop_idx) > 0:
            self.logger.warn(
                "{0} realizations dropped as 'bad' (indices :{1})".
                format(len(drop_idx), ','.join([str(d) for d in drop_idx])))
            self.parensemble.loc[drop_idx, :] = np.nan
            self.parensemble = self.parensemble.dropna()
            self.obsensemble.loc[drop_idx, :] = np.nan
            self.obsensemble = self.obsensemble.dropna()
            self.phi.update()

    self.phi.report(cur_lam=0.0)
    self.last_best_mean = self.phi.comp_phi.mean()
    self.last_best_std = self.phi.comp_phi.std()

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))
    self.logger.statement("current lambda:{0:15.6g}".format(
        self.current_lambda))

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components(
        eigthresh=self.pst.svd_data.eigthresh)
    self.Am = u * s.inv
    if self.save_mats:
        np.savetxt(self.pst.filename.replace(".pst", '.') +
                   "0.prior_par_diff.dat", self.delta_par_prior.x,
                   fmt="%15.6e")
        np.savetxt(self.pst.filename.replace(".pst", '.') +
                   "0.am_u.dat", u.x, fmt="%15.6e")
        np.savetxt(self.pst.filename.replace(".pst", '.') +
                   "0.am_v.dat", v.x, fmt="%15.6e")
        np.savetxt(self.pst.filename.replace(".pst", '.') +
                   "0.am_s_inv.dat", s.inv.as_2d, fmt="%15.6e")
        np.savetxt(self.pst.filename.replace(".pst", '.') +
                   "0.am.dat", self.Am.x, fmt="%15.6e")

    self._initialized = True
class MonteCarlo(LinearAnalysis):
    """LinearAnalysis derived type for monte carlo analysis

    Note: requires a pest control file, which can be derived from a
    jco argument.  MonteCarlo.project_parensemble also requires a jacobian
    """

    def __init__(self, **kwargs):
        super(MonteCarlo, self).__init__(**kwargs)
        assert self.pst is not None, \
            "monte carlo requires a pest control file"
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

    @property
    def num_reals(self):
        """number of realizations (rows) in the parameter ensemble"""
        return self.parensemble.shape[0]

    def get_nsing(self, epsilon=1.0e-4):
        """ get the number of solution space dimensions given
        a ratio between the largest and smallest singular values

        Parameters:
            epsilon: ratio
        Returns : integer (or None)
            number of singular components above the epsilon ratio threshold.
            If every component is above the threshold, a warning is
            logged and None is returned
        """
        mx = self.xtqx.shape[0]
        # singular values normalized by the largest, sorted ascending so
        # searchsorted counts how many fall below epsilon
        nsing = mx - np.searchsorted(
            np.sort((self.xtqx.s.x / self.xtqx.s.x.max())[:, 0]), epsilon)
        if nsing == mx:
            self.logger.warn("optimal nsing=npar")
            nsing = None
        return nsing

    def get_null_proj(self, nsing=None):
        """ get a null-space projection matrix of XTQX

        Parameters:
        ----------
            nsing: optional number of singular components to use
                   if none, call self.get_nsing()
        Returns:
        -------
            Matrix instance : V2V2^T
        Raises:
        ------
            Exception if nsing is None and self.get_nsing() also
            returns None
        """
        if nsing is None:
            nsing = self.get_nsing()
        if nsing is None:
            raise Exception("nsing is None")
        print("using {0} singular components".format(nsing))
        self.log("forming null space projection matrix with " +
                 "{0} of {1} singular components".format(
                     nsing, self.jco.shape[1]))
        v2_proj = (self.xtqx.v[:, nsing:] * self.xtqx.v[:, nsing:].T)
        self.log("forming null space projection matrix with " +
                 "{0} of {1} singular components".format(
                     nsing, self.jco.shape[1]))
        return v2_proj

    def draw(self, num_reals=1, par_file=None, obs=False,
             enforce_bounds=False, cov=None, how="gaussian"):
        """draw stochastic realizations of parameters and
           optionally observations

        Parameters:
        ----------
            num_reals (int): number of realization to generate

            par_file (str): parameter file to use as mean values

            obs (bool): add a realization of measurement noise to obs

            enforce_bounds (bool): enforce parameter bounds in control file

            cov (Cov): covariance to draw parameters from.  If None,
                self.parcov is used.  Cannot be passed with how="uniform"

            how (str): type of distribution.  Must be in ["gaussian","uniform"]
        Returns:
            None
        Raises:
            Exception if cov is passed and how is "uniform"
        """
        if par_file is not None:
            self.pst.parrep(par_file)
        how = how.lower().strip()
        assert how in ["gaussian", "uniform"]

        if cov is not None:
            assert isinstance(cov, Cov)
            if how == "uniform":
                raise Exception("MonteCarlo.draw() error: 'how'='uniform'," +
                                " 'cov' arg cannot be passed")
        else:
            cov = self.parcov

        # start from fresh, empty ensembles on every draw
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

        self.log("generating {0:d} parameter realizations".format(num_reals))
        self.parensemble.draw(cov, num_reals=num_reals, how=how)
        if enforce_bounds:
            self.parensemble.enforce()
        self.log("generating {0:d} parameter realizations".format(num_reals))

        if obs:
            self.log("generating {0:d} observation realizations".format(
                num_reals))
            self.obsensemble.draw(self.obscov, num_reals=num_reals)
            self.log("generating {0:d} observation realizations".format(
                num_reals))

    def project_parensemble(self, par_file=None, nsing=None, inplace=True):
        """ perform the null-space projection operations for null-space
        monte carlo

        Parameters:
            par_file: str
                an optional file of parameter values to use
            nsing: int
                number of singular values to in forming null subspace matrix
            inplace: bool
                overwrite the existing parameter ensemble with the
                projected values
        Returns:
        -------
            if inplace is False, ParameterEnsemble instance, otherwise None
        """
        assert self.jco is not None, \
            "MonteCarlo.project_parensemble() requires a jacobian attribute"
        if par_file is not None:
            assert os.path.exists(par_file), \
                "MonteCarlo.project_parensemble() error: " + \
                "par_file not found: " + par_file
            self.parensemble.pst.parrep(par_file)

        # project the ensemble
        self.log("projecting parameter ensemble")
        en = self.parensemble.project(self.get_null_proj(nsing),
                                      inplace=inplace, log=self.log)
        self.log("projecting parameter ensemble")
        return en

    def write_psts(self, prefix, existing_jco=None, noptmax=None):
        """ write parameter and optionally observation realizations
            to pest control files

        Parameters:
        ----------
            prefix: str
                pest control file prefix
            existing_jco: str
                filename of an existing jacobian matrix to add to the
                pest++ options in the control file.  This is useful for
                NSMC since this jco can be used to get the first set of
                parameter upgrades for free!  Needs to be the path the jco
                file as seen from the location where pest++ will be run
            noptmax: int
                value of NOPTMAX to set in new pest control files
        Returns:
        -------
            None
        """
        self.log("writing realized pest control files")
        # get a copy of the pest control file
        pst = self.pst.get(par_names=self.pst.par_names,
                           obs_names=self.pst.obs_names)

        if noptmax is not None:
            pst.control_data.noptmax = noptmax
        if existing_jco is not None:
            pst.pestpp_options["BASE_JACOBIAN"] = existing_jco

        # set the indices
        pst.parameter_data.index = pst.parameter_data.parnme
        pst.observation_data.index = pst.observation_data.obsnme

        # control files hold untransformed parameter values
        if self.parensemble.istransformed:
            par_en = self.parensemble._back_transform(inplace=False)
        else:
            par_en = self.parensemble

        for i in range(self.num_reals):
            pst_name = prefix + "{0:d}.pst".format(i)
            self.log("writing realized pest control file " + pst_name)
            pst.parameter_data.loc[par_en.columns, "parval1"] = \
                par_en.iloc[i, :].T

            # reset the regularization
            if pst.control_data.pestmode == "regularization":
                pst.zero_order_tikhonov(parbounds=True)

            # add the obs noise realization if needed
            if self.obsensemble.shape[0] == self.num_reals:
                pst.observation_data.loc[self.obsensemble.columns,
                                         "obsval"] = \
                    self.obsensemble.iloc[i, :].T

            # write
            pst.write(pst_name)
            self.log("writing realized pest control file " + pst_name)
        self.log("writing realized pest control files")
def update(self, lambda_mults=[1.0], localizer=None, run_subset=None,
           use_approx=True):
    """Run one smoother iteration.

    For every lambda multiplier an upgraded parameter ensemble is built,
    all candidates are evaluated in a single call to ``self._calc_obs``,
    and the candidate with the lowest mean phi is accepted if it beats
    110% of the previous best mean.  Lambda is then adjusted and the
    current ensembles are written to csv.

    Parameters
    ----------
    lambda_mults : list of float
        multipliers applied to ``self.current_lambda``; one candidate
        parameter ensemble is tested per multiplier.  The default list is
        only read, never modified
    localizer : matrix-like
        if not None, passed to ``upgrade_1.hadamard_product()``
    run_subset : int
        if not None, only the first ``run_subset`` realizations are
        evaluated for each lambda; the accepted ensemble is then
        re-evaluated in full.  Ignored (with a warning) if it is >= the
        number of active realizations
    use_approx : bool
        if False, and after the first iteration, the parameter-based
        ``upgrade_2`` term is added to the upgrade.  Default is True

    """
    # fail fast, before reading ensemble state or bumping the iteration
    # counter
    if not self.__initialized:
        self.logger.lraise("must call initialize() before update()")

    if run_subset is not None:
        if run_subset >= self.obsensemble.shape[0]:
            self.logger.warn(
                "run_subset ({0}) >= num of active reals ({1})...ignoring ".
                format(run_subset, self.obsensemble.shape[0]))
            run_subset = None

    self.iter_num += 1
    self.logger.log("iteration {0}".format(self.iter_num))
    self.logger.statement("{0} active realizations".format(
        self.obsensemble.shape[0]))
    if self.obsensemble.shape[0] < 2:
        self.logger.lraise(
            "at least active 2 realizations (really like 300) are needed to update")

    self.logger.log("calculate scaled delta obs")
    scaled_delta_obs = self._calc_delta_obs(self.obsensemble)
    self.logger.log("calculate scaled delta obs")

    self.logger.log("calculate scaled delta par")
    scaled_delta_par = self._calc_delta_par(self.parensemble)
    self.logger.log("calculate scaled delta par")

    self.logger.log("calculate pseudo inv comps")
    u, s, v = scaled_delta_obs.pseudo_inv_components()
    self.logger.log("calculate pseudo inv comps")

    self.logger.log("calculate obs diff matrix")
    obs_diff = self.obscov_inv_sqrt * self._get_residual_matrix(
        self.obsensemble).T
    self.logger.log("calculate obs diff matrix")

    # here is the math part...calculate upgrade matrices
    paren_lam = []
    lam_vals = []
    for cur_lam_mult in lambda_mults:
        parensemble_cur_lam = self.parensemble.copy()
        cur_lam = self.current_lambda * cur_lam_mult
        lam_vals.append(cur_lam)
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

        scaled_ident = Cov.identity_like(s) * (cur_lam + 1.0)
        scaled_ident += s**2
        scaled_ident = scaled_ident.inv

        # build up this matrix as a single element so we can apply
        # localization
        self.logger.log("building upgrade_1 matrix")
        upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_delta_par) * \
            v * s * scaled_ident * u.T
        self.logger.log("building upgrade_1 matrix")

        # apply localization
        if localizer is not None:
            self.logger.log("applying localization")
            # NOTE(review): the return value is discarded - confirm that
            # hadamard_product() modifies upgrade_1 in place
            upgrade_1.hadamard_product(localizer)
            self.logger.log("applying localization")

        # apply residual information
        self.logger.log("applying residuals")
        upgrade_1 *= obs_diff
        self.logger.log("applying residuals")

        self.logger.log("processing upgrade_1")
        upgrade_1 = upgrade_1.to_dataframe()
        upgrade_1.index.name = "parnme"
        upgrade_1 = upgrade_1.T
        upgrade_1.index = [int(i) for i in upgrade_1.index]
        upgrade_1.to_csv(self.pst.filename + ".upgrade_1.{0:04d}.csv".
                         format(self.iter_num))
        if upgrade_1.isnull().values.any():
            self.logger.lraise("NaNs in upgrade_1")
        self.logger.log("processing upgrade_1")

        parensemble_cur_lam += upgrade_1

        # parameter-based upgrade portion
        if not use_approx and self.iter_num > 1:
            self.logger.log("building upgrade_2 matrix")
            par_diff = (self.parensemble -
                        self.parensemble_0.loc[self.parensemble.index, :]).\
                as_pyemu_matrix().T
            x4 = self.Am.T * self.half_parcov_diag * par_diff
            x5 = self.Am * x4
            x6 = scaled_delta_par.T * x5
            x7 = v * scaled_ident * v.T * x6
            upgrade_2 = -1.0 * (self.half_parcov_diag *
                                scaled_delta_par * x7).to_dataframe()
            upgrade_2.index.name = "parnme"
            upgrade_2 = upgrade_2.T
            upgrade_2.to_csv(self.pst.filename + ".upgrade_2.{0:04d}.csv".
                             format(self.iter_num))
            upgrade_2.index = [int(i) for i in upgrade_2.index]
            if upgrade_2.isnull().values.any():
                self.logger.lraise("NaNs in upgrade_2")
            parensemble_cur_lam += upgrade_2
            self.logger.log("building upgrade_2 matrix")

        parensemble_cur_lam.enforce(self.enforce_bounds)
        paren_lam.append(pd.DataFrame(parensemble_cur_lam.loc[:, :]))
        self.logger.log("calcs for lambda {0}".format(cur_lam_mult))

    # subset if needed
    # and combine lambda par ensembles into one par ensemble for evaluation
    if run_subset is not None and run_subset < self.parensemble.shape[0]:
        subset_idx = self.parensemble.iloc[:run_subset, :].index.values
        self.logger.statement("subset idxs: " +
                              ','.join([str(idx) for idx in subset_idx]))
        paren_lam_subset = [pe.loc[subset_idx, :] for pe in paren_lam]
        paren_combine = pd.concat(paren_lam_subset, ignore_index=True)
        paren_lam_subset = None
    else:
        subset_idx = self.parensemble.index.values
        paren_combine = pd.concat(paren_lam, ignore_index=True)

    self.logger.log("evaluating ensembles for lambdas : {0}".
                    format(','.join(["{0:8.3E}".format(l)
                                     for l in lam_vals])))
    failed_runs, obsen_combine = self._calc_obs(paren_combine)
    self.logger.log("evaluating ensembles for lambdas : {0}".
                    format(','.join(["{0:8.3E}".format(l)
                                     for l in lam_vals])))
    paren_combine = None

    if failed_runs is not None and \
            len(failed_runs) == obsen_combine.shape[0]:
        self.logger.lraise("all runs failed - cannot continue")

    # unpack lambda obs ensembles from combined obs ensemble
    nrun_per_lam = self.obsensemble.shape[0]
    if run_subset is not None:
        nrun_per_lam = run_subset
    obsen_lam = []
    for i in range(len(lam_vals)):
        sidx = i * nrun_per_lam
        eidx = sidx + nrun_per_lam
        oe = ObservationEnsemble.from_dataframe(
            df=obsen_combine.iloc[sidx:eidx, :].copy(), pst=self.pst)
        oe.index = subset_idx
        # check for failed runs in this set - drop failed runs from obs
        # ensembles
        if failed_runs is not None:
            # positions of the failed runs relative to this lambda's slice
            failed_runs_this = np.array(
                [f for f in failed_runs if sidx <= f < eidx]) - sidx
            if len(failed_runs_this) > 0:
                if len(failed_runs_this) == oe.shape[0]:
                    self.logger.warn("all runs failed for lambda {0}".
                                     format(lam_vals[i]))
                else:
                    self.logger.warn("{0} run failed for lambda {1}".
                                     format(len(failed_runs_this),
                                            lam_vals[i]))
                oe.iloc[failed_runs_this, :] = np.nan
                oe = oe.dropna()
        obsen_lam.append(oe)
    obsen_combine = None

    # here is where we need to select out the "best" lambda par and obs
    # ensembles
    self.logger.statement("\n**************************")
    self.logger.statement(str(datetime.now()))
    self.logger.statement("total runs:{0}".format(self.total_runs))
    self.logger.statement("iteration: {0}".format(self.iter_num))
    self.logger.statement(
        "current lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
        format(self.current_lambda, self.last_best_mean,
               self.last_best_std))
    phi_vecs = [self._calc_phi_vec(obsen) for obsen in obsen_lam]
    mean_std = [(pv.mean(), pv.std()) for pv in phi_vecs]
    update_pars = False
    update_lambda = False
    # accept a new best if its within 10%
    best_mean = self.last_best_mean * 1.1
    best_std = self.last_best_std * 1.1
    best_i = 0
    for i, (m, s) in enumerate(mean_std):
        self.logger.statement(
            " tested lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[i], m, s))
        if m < best_mean:
            update_pars = True
            best_mean = m
            best_i = i
            if s < best_std:
                update_lambda = True
                best_std = s
    if np.isnan(best_mean):
        self.logger.lraise("best mean = NaN")
    if np.isnan(best_std):
        self.logger.lraise("best std = NaN")

    if not update_pars:
        # nothing improved: back off by raising lambda, capped at 1e5
        self.current_lambda *= max(lambda_mults) * 10.0
        self.current_lambda = min(self.current_lambda, 100000)
        self.logger.statement(
            "not accepting iteration, increased lambda:{0}".
            format(self.current_lambda))
    else:
        self.parensemble = ParameterEnsemble.from_dataframe(
            df=paren_lam[best_i], pst=self.pst)
        if run_subset is not None:
            # only a subset was tested, so evaluate the full accepted
            # ensemble
            failed_runs, self.obsensemble = self._calc_obs(self.parensemble)
            if failed_runs is not None:
                self.logger.warn("dropping failed realizations")
                self.parensemble = self.parensemble.drop(failed_runs)
                self.obsensemble = self.obsensemble.drop(failed_runs)
            self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
            self._phi_report(self.current_phi_vec,
                             self.current_lambda * lambda_mults[best_i])
            best_mean = self.current_phi_vec.mean()
            best_std = self.current_phi_vec.std()
        else:
            self.obsensemble = obsen_lam[best_i]
            # reindex parensemble in case failed runs
            self.parensemble = ParameterEnsemble.from_dataframe(
                df=self.parensemble.loc[self.obsensemble.index],
                pst=self.pst)
            self._phi_report(phi_vecs[best_i],
                             self.current_lambda * lambda_mults[best_i])
            self.current_phi_vec = phi_vecs[best_i]

        self.logger.statement(
            " best lambda:{0:15.6G}, mean:{1:15.6G}, std:{2:15.6G}".
            format(self.current_lambda * lambda_mults[best_i],
                   best_mean, best_std))
        self.last_best_mean = best_mean
        self.last_best_std = best_std

    if update_lambda:
        # be aggressive
        self.current_lambda *= (lambda_mults[best_i] * 0.75)
        # but don't let lambda get too small
        self.current_lambda = max(self.current_lambda, 0.001)
        self.logger.statement("updating lambda: {0:15.6G}".
                              format(self.current_lambda))

    self.logger.statement("**************************\n")
    self.parensemble.to_csv(self.pst.filename + self.paren_prefix.
                            format(self.iter_num))
    self.obsensemble.to_csv(self.pst.filename + self.obsen_prefix.
                            format(self.iter_num))
    if self.raw_sweep_out is not None:
        self.raw_sweep_out.to_csv(self.pst.filename + "_raw{0}".
                                  format(self.iter_num))
    self.logger.log("iteration {0}".format(self.iter_num))
class MonteCarlo(LinearAnalysis):
    """LinearAnalysis derived type for monte carlo analysis

    Parameters
    ----------
    **kwargs : dict
        dictionary of keyword arguments.  See pyemu.LinearAnalysis for
        complete definitions

    Attributes
    ----------
    parensemble : pyemu.ParameterEnsemble
        pyemu object derived from a pandas dataframe, the ensemble
        of parameters from the PEST control file with associated
        starting value and bounds.  Object also exposes methods
        relevant to the dataframe and parameters-- see documentation.
    obsensemble : pyemu.ObservationEnsemble
        pyemu object derived from a pandas dataframe, the ensemble
        of observations from the PEST control file with associated
        starting weights.  Object also exposes methods
        relevant to the dataframe and observations-- see documentation.

    Returns
    -------
    MonteCarlo
        pyEMU MonteCarlo object

    Example
    -------
    ``>>>import pyemu``

    ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

    """

    def __init__(self, **kwargs):
        warnings.warn(
            "pyemu.MonteCarlo class is deprecated.  " +
            "Please use the ensemble classes directly",
            PyemuWarning,
        )
        super(MonteCarlo, self).__init__(**kwargs)
        assert self.pst is not None, "monte carlo requires a pest control file"
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

    @property
    def num_reals(self):
        """ get the number of realizations in the parameter ensemble

        Returns
        -------
        num_real : int

        """
        return self.parensemble.shape[0]

    def get_nsing(self, epsilon=1.0e-4):
        """ get the number of solution space dimensions given
        a ratio between the largest and smallest singular values

        Parameters
        ----------
        epsilon: float
            singular value ratio

        Returns
        -------
        nsing : int or None
            number of singular components above the epsilon ratio threshold

        Note
        -----
        If every singular component is above the threshold, a warning is
        logged and None is returned

        """
        mx = self.xtqx.shape[0]
        # singular values normalized by the largest, sorted ascending so
        # searchsorted counts how many fall below epsilon
        nsing = mx - np.searchsorted(
            np.sort((self.xtqx.s.x / self.xtqx.s.x.max())[:, 0]), epsilon)
        if nsing == mx:
            self.logger.warn("optimal nsing=npar")
            nsing = None
        return nsing

    def get_null_proj(self, nsing=None):
        """ get a null-space projection matrix of XTQX

        Parameters
        ----------
        nsing: int
            optional number of singular components to use
            If None, then nsing is determined from
            call to MonteCarlo.get_nsing()

        Returns
        -------
        v2_proj : pyemu.Matrix
            the null-space projection matrix (V2V2^T)

        Raises
        ------
        Exception
            if nsing is None and MonteCarlo.get_nsing() also returns None

        """
        if nsing is None:
            nsing = self.get_nsing()
        if nsing is None:
            raise Exception("nsing is None")
        print("using {0} singular components".format(nsing))
        self.log(
            "forming null space projection matrix with " +
            "{0} of {1} singular components".format(nsing, self.jco.shape[1]))

        v2_proj = self.xtqx.v[:, nsing:] * self.xtqx.v[:, nsing:].T
        self.log(
            "forming null space projection matrix with " +
            "{0} of {1} singular components".format(nsing, self.jco.shape[1]))

        return v2_proj

    def draw(
        self,
        num_reals=1,
        par_file=None,
        obs=False,
        enforce_bounds=None,
        cov=None,
        how="gaussian",
    ):
        """draw stochastic realizations of parameters and
           optionally observations, filling MonteCarlo.parensemble and
           optionally MonteCarlo.obsensemble.

        Parameters
        ----------
        num_reals : int
            number of realization to generate
        par_file : str
            parameter file to use as mean values.  If None,
            use MonteCarlo.pst.parameter_data.parval1.
            Default is None
        obs : bool
            add a realization of measurement noise to observation values,
            forming MonteCarlo.obsensemble.  Default is False
        enforce_bounds : str
            enforce parameter bounds based on control file information.
            options are 'reset', 'drop' or None.  Default is None
        cov : Cov
            covariance to draw gaussian realizations from.  If None,
            MonteCarlo.parcov is used.  Cannot be passed with
            how="uniform"
        how : str
            type of distribution to draw from.  Must be in
            ["gaussian","uniform"].  Default is "gaussian".

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

        ``>>>mc.draw(1000)``

        """
        if par_file is not None:
            self.pst.parrep(par_file)
        how = how.lower().strip()
        assert how in ["gaussian", "uniform"]

        if cov is not None:
            assert isinstance(cov, Cov)
            if how == "uniform":
                raise Exception("MonteCarlo.draw() error: 'how'='uniform'," +
                                " 'cov' arg cannot be passed")
        else:
            cov = self.parcov

        self.log("generating {0:d} parameter realizations".format(num_reals))

        if how == "gaussian":
            # bounds are enforced below, after the draw
            self.parensemble = ParameterEnsemble.from_gaussian_draw(
                pst=self.pst,
                cov=cov,
                num_reals=num_reals,
                use_homegrown=True,
                enforce_bounds=False,
            )
        elif how == "uniform":
            self.parensemble = ParameterEnsemble.from_uniform_draw(
                pst=self.pst, num_reals=num_reals)
        else:
            # still reached when asserts are stripped (python -O)
            raise Exception(
                "MonteCarlo.draw(): unrecognized 'how' arg: {0}".format(how))

        if enforce_bounds is not None:
            self.parensemble.enforce(enforce_bounds)
        self.log("generating {0:d} parameter realizations".format(num_reals))

        if obs:
            self.log(
                "generating {0:d} observation realizations".format(num_reals))
            self.obsensemble = ObservationEnsemble.from_id_gaussian_draw(
                pst=self.pst, num_reals=num_reals)
            self.log(
                "generating {0:d} observation realizations".format(num_reals))

    def project_parensemble(self, par_file=None, nsing=None, inplace=True,
                            enforce_bounds="reset"):
        """ perform the null-space projection operations for null-space
        monte carlo

        Parameters
        ----------
        par_file: str
            an optional file of parameter values to use
        nsing: int
            number of singular values to in forming null subspace matrix
        inplace: bool
            overwrite the existing parameter ensemble with the
            projected values
        enforce_bounds: str
            how to enforce parameter bounds.  can be None, 'reset', or
            'drop'.  Default is 'reset'.
            NOTE(review): this argument is accepted but not used by this
            method - confirm whether it should be forwarded to the
            projection call

        Returns
        -------
        par_en : pyemu.ParameterEnsemble
            if inplace is False, otherwise None

        Note
        ----
        to use this method, the MonteCarlo instance must have been
        constructed with the ``jco`` argument.

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(jco="pest.jcb")``

        ``>>>mc.draw(1000)``

        ``>>>mc.project_parensemble(par_file="final.par",nsing=100)``

        """
        assert self.jco is not None, (
            "MonteCarlo.project_parensemble() requires a jacobian attribute")
        if par_file is not None:
            assert os.path.exists(par_file), (
                "MonteCarlo.project_parensemble() error: " +
                "par_file not found: " + par_file)
            self.parensemble.pst.parrep(par_file)

        # project the ensemble
        self.log("projecting parameter ensemble")
        en = self.parensemble.project(self.get_null_proj(nsing),
                                      inplace=inplace, log=self.log)
        self.log("projecting parameter ensemble")
        return en

    def write_psts(self, prefix, existing_jco=None, noptmax=None):
        """ write parameter and optionally observation realizations
            to a series of pest control files

        Parameters
        ----------
        prefix: str
            pest control file prefix
        existing_jco: str
            filename of an existing jacobian matrix to add to the
            pest++ options in the control file.  This is useful for
            NSMC since this jco can be used to get the first set of
            parameter upgrades for free!  Needs to be the path the jco
            file as seen from the location where pest++ will be run
        noptmax: int
            value of NOPTMAX to set in new pest control files

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(jco="pest.jcb")``

        ``>>>mc.draw(1000, obs=True)``

        ``>>>mc.write_psts("mc_", existing_jco="pest.jcb", noptmax=1)``

        """
        self.log("writing realized pest control files")
        # get a copy of the pest control file
        pst = self.pst.get(par_names=self.pst.par_names,
                           obs_names=self.pst.obs_names)

        if noptmax is not None:
            pst.control_data.noptmax = noptmax
        if existing_jco is not None:
            pst.pestpp_options["BASE_JACOBIAN"] = existing_jco

        # set the indices
        pst.parameter_data.index = pst.parameter_data.parnme
        pst.observation_data.index = pst.observation_data.obsnme

        # control files hold untransformed parameter values
        if self.parensemble.istransformed:
            par_en = self.parensemble._back_transform(inplace=False)
        else:
            par_en = self.parensemble

        for i in range(self.num_reals):
            pst_name = prefix + "{0:d}.pst".format(i)
            self.log("writing realized pest control file " + pst_name)
            pst.parameter_data.loc[par_en.columns, "parval1"] = \
                par_en.iloc[i, :].T

            # add the obs noise realization if needed
            if self.obsensemble.shape[0] == self.num_reals:
                pst.observation_data.loc[self.obsensemble.columns,
                                         "obsval"] = \
                    self.obsensemble.iloc[i, :].T

            # write
            pst.write(pst_name)
            self.log("writing realized pest control file " + pst_name)
        self.log("writing realized pest control files")
class MonteCarlo(LinearAnalysis):
    """LinearAnalysis derived type for monte carlo analysis

    Note
    ----
    requires a pest control file, which can be derived from a jco
    argument.  MonteCarlo.project_parensemble() also requires a jacobian.
    """

    def __init__(self, **kwargs):
        """construct the base LinearAnalysis and attach the parameter and
        observation ensembles

        Parameters
        ----------
        **kwargs : dict
            keyword arguments passed straight through to LinearAnalysis
        """
        super(MonteCarlo, self).__init__(**kwargs)
        assert self.pst is not None, \
            "monte carlo requires a pest control file"
        # ensembles are tied to the control file here; draw() is what
        # later calls .draw() on them
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

    @property
    def num_reals(self):
        """number of rows (realizations) in the parameter ensemble"""
        return self.parensemble.shape[0]

    def get_nsing(self, epsilon=1.0e-6):
        """ get the number of solution space dimensions given
        a machine floating point precision (epsilon)

        Parameters
        ----------
        epsilon : float
            threshold applied to the singular values of XTQX after they
            have been divided by the largest singular value

        Returns
        -------
        nsing : int
            number of singular components whose normalized singular
            value is at or above the epsilon threshold
        """
        # searchsorted on the ascending, normalized spectrum counts the
        # values strictly below epsilon; the remainder is the solution space
        nsing = self.xtqx.shape[0] - np.searchsorted(
            np.sort((self.xtqx.s.x / self.xtqx.s.x.max())[:, 0]), epsilon)
        return nsing

    def get_null_proj(self, nsing=None):
        """ get a null-space projection matrix of XTQX

        Parameters
        ----------
        nsing : int
            optional number of singular components to use.
            If None, self.get_nsing() is called

        Returns
        -------
        v2_proj : Matrix instance
            V2V2^T
        """
        if nsing is None:
            nsing = self.get_nsing()
        # columns of V from nsing onward span the null space
        v2_proj = (self.xtqx.v[:, nsing:] * self.xtqx.v[:, nsing:].T)
        return v2_proj

    def draw(self, num_reals=1, par_file=None, obs=False,
             enforce_bounds=False, cov=None):
        """draw stochastic realizations of parameters and
           optionally observations

        Parameters
        ----------
        num_reals : int
            number of realization to generate
        par_file : str
            parameter file to use as mean values
        obs : bool
            add a realization of measurement noise to obs
        enforce_bounds : bool
            enforce parameter bounds in control file
        cov : Cov
            covariance matrix to draw parameters from.  If None,
            self.parcov is used

        Returns
        -------
        None
        """
        if par_file is not None:
            self.pst.parrep(par_file)
        if cov is not None:
            assert isinstance(cov, Cov)
        else:
            cov = self.parcov

        # self.log is called with the same message before and after
        # each step
        self.log("generating {0:d} parameter realizations".format(num_reals))
        self.parensemble.draw(cov, num_reals=num_reals)
        if enforce_bounds:
            self.parensemble.enforce()
        self.log("generating {0:d} parameter realizations".format(num_reals))

        if obs:
            self.log("generating {0:d} observation realizations".format(num_reals))
            self.obsensemble.draw(self.obscov, num_reals=num_reals)
            self.log("generating {0:d} observation realizations".format(num_reals))

    def project_parensemble(self, par_file=None, nsing=None, inplace=True):
        """ perform the null-space projection operations for null-space monte carlo

        Parameters
        ----------
        par_file : str
            an optional file of parameter values to use
        nsing : int
            number of singular values to in forming null subspace matrix
        inplace : bool
            overwrite the existing parameter ensemble with the
            projected values

        Returns
        -------
        en : ParameterEnsemble
            if inplace is False, ParameterEnsemble instance, otherwise None
        """
        assert self.jco is not None, \
            "MonteCarlo.project_parensemble() requires a jacobian attribute"
        if par_file is not None:
            assert os.path.exists(par_file), \
                "MonteCarlo.project_parensemble() error: par_file not found: " + \
                par_file
            self.parensemble.pst.parrep(par_file)

        # project the ensemble
        self.log("projecting parameter ensemble")
        en = self.parensemble.project(self.get_null_proj(nsing),
                                      inplace=inplace, log=self.log)
        self.log("projecting parameter ensemble")
        return en

    def write_psts(self, prefix):
        """ write parameter and optionally observation realizations
            to pest control files

        Parameters
        ----------
        prefix : str
            pest control file prefix; realization ``i`` is written to
            ``prefix + "<i>.pst"``

        Returns
        -------
        None
        """
        self.log("writing realized pest control files")
        # get a copy of the pest control file
        pst = self.pst.get(par_names=self.pst.par_names,
                           obs_names=self.pst.obs_names)

        # set the indices so the .loc assignments below are by name
        pst.parameter_data.index = pst.parameter_data.parnme
        pst.observation_data.index = pst.observation_data.obsnme

        if self.parensemble.islog:
            par_en = self.parensemble._back_transform(inplace=False)
        else:
            par_en = self.parensemble

        for i in range(self.num_reals):
            pst_name = prefix + "{0:d}.pst".format(i)
            self.log("writing realized pest control file " + pst_name)
            pst.parameter_data.loc[par_en.columns, "parval1"] = par_en.iloc[i, :].T
            # observation values are only replaced when the observation
            # ensemble has one row per parameter realization
            if self.obsensemble.shape[0] == self.num_reals:
                pst.observation_data.loc[self.obsensemble.columns, "obsval"] = \
                    self.obsensemble.iloc[i, :].T
            pst.write(pst_name)
            self.log("writing realized pest control file " + pst_name)
        self.log("writing realized pest control files")
def draw(self, num_reals=1, par_file=None, obs=False,
         enforce_bounds=None, cov=None, how="gaussian"):
    """draw stochastic realizations of parameters and
       optionally observations, filling MonteCarlo.parensemble and
       optionally MonteCarlo.obsensemble.

    Parameters
    ----------
    num_reals : int
        number of realization to generate
    par_file : str
        parameter file to use as mean values. If None,
        use MonteCarlo.pst.parameter_data.parval1.
        Default is None
    obs : bool
        add a realization of measurement noise to observation values,
        forming MonteCarlo.obsensemble. Default is False
    enforce_bounds : str
        enforce parameter bounds based on control file information.
        options are 'reset', 'drop' or None.  Default is None
    cov : Cov
        covariance matrix for the gaussian draw.  If None,
        MonteCarlo.parcov is used.  Cannot be combined with
        how="uniform"
    how : str
        type of distribution to draw from. Must be in ["gaussian","uniform"]
        default is "gaussian".

    Example
    -------
    ``>>>import pyemu``

    ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

    ``>>>mc.draw(1000)``

    """
    if par_file is not None:
        self.pst.parrep(par_file)

    how = how.lower().strip()
    assert how in ["gaussian", "uniform"]

    if cov is None:
        cov = self.parcov
    else:
        assert isinstance(cov, Cov)
        if how == "uniform":
            raise Exception("MonteCarlo.draw() error: 'how'='uniform',"
                            " 'cov' arg cannot be passed")

    # the same message is logged before and after the parameter draw
    par_msg = "generating {0:d} parameter realizations".format(num_reals)
    self.log(par_msg)
    if how == "uniform":
        drawn = ParameterEnsemble.from_uniform_draw(pst=self.pst,
                                                    num_reals=num_reals)
    elif how == "gaussian":
        drawn = ParameterEnsemble.from_gaussian_draw(pst=self.pst, cov=cov,
                                                     num_reals=num_reals,
                                                     use_homegrown=True)
    else:
        # still guards the dispatch if the assert above is stripped (-O)
        raise Exception("MonteCarlo.draw(): unrecognized 'how' arg: {0}".format(how))
    self.parensemble = drawn
    if enforce_bounds is not None:
        self.parensemble.enforce(enforce_bounds)
    self.log(par_msg)

    if not obs:
        return

    obs_msg = "generating {0:d} observation realizations".format(num_reals)
    self.log(obs_msg)
    self.obsensemble = ObservationEnsemble.from_id_gaussian_draw(
        pst=self.pst, num_reals=num_reals)
    self.log(obs_msg)
class MonteCarlo(LinearAnalysis):
    """LinearAnalysis derived type for monte carlo analysis

    Parameters
    ----------
    **kwargs : dict
        dictionary of keyword arguments.  See pyemu.LinearAnalysis for
        complete definitions

    Attributes
    ----------
    parensemble : pyemu.ParameterEnsemble
    obsensemble : pyemu.ObservationEnsemble

    Returns
    -------
    MonteCarlo : MonteCarlo

    Example
    -------
    ``>>>import pyemu``

    ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

    """

    def __init__(self, **kwargs):
        super(MonteCarlo, self).__init__(**kwargs)
        assert self.pst is not None, \
            "monte carlo requires a pest control file"
        self.parensemble = ParameterEnsemble(pst=self.pst)
        self.obsensemble = ObservationEnsemble(pst=self.pst)

    @property
    def num_reals(self):
        """ get the number of realizations in the parameter ensemble

        Returns
        -------
        num_real : int

        """
        return self.parensemble.shape[0]

    def get_nsing(self, epsilon=1.0e-4):
        """ get the number of solution space dimensions given
        a ratio between the largest and smallest singular values

        Parameters
        ----------
        epsilon : float
            singular value ratio

        Returns
        -------
        nsing : int or None
            number of singular components above the epsilon ratio threshold

        Note
        -----
        If nsing equals the dimension of XTQX, a warning is logged and
        None is returned

        """
        mx = self.xtqx.shape[0]
        # searchsorted on the ascending, normalized spectrum counts the
        # values strictly below epsilon
        nsing = mx - np.searchsorted(
            np.sort((self.xtqx.s.x / self.xtqx.s.x.max())[:, 0]), epsilon)
        if nsing == mx:
            self.logger.warn("optimal nsing=npar")
            nsing = None
        return nsing

    def get_null_proj(self, nsing=None):
        """ get a null-space projection matrix of XTQX

        Parameters
        ----------
        nsing : int
            optional number of singular components to use
            If None, then nsing is determined from
            call to MonteCarlo.get_nsing()

        Returns
        -------
        v2_proj : pyemu.Matrix
            the null-space projection matrix (V2V2^T)

        Raises
        ------
        Exception
            if nsing is None and MonteCarlo.get_nsing() also returns None

        """
        if nsing is None:
            nsing = self.get_nsing()
        if nsing is None:
            raise Exception("nsing is None")
        print("using {0} singular components".format(nsing))
        self.log("forming null space projection matrix with " +
                 "{0} of {1} singular components".format(nsing, self.jco.shape[1]))

        # columns of V from nsing onward span the null space
        v2_proj = (self.xtqx.v[:, nsing:] * self.xtqx.v[:, nsing:].T)
        self.log("forming null space projection matrix with " +
                 "{0} of {1} singular components".format(nsing, self.jco.shape[1]))

        return v2_proj

    def draw(self, num_reals=1, par_file=None, obs=False,
             enforce_bounds=None, cov=None, how="gaussian"):
        """draw stochastic realizations of parameters and
           optionally observations, filling MonteCarlo.parensemble and
           optionally MonteCarlo.obsensemble.

        Parameters
        ----------
        num_reals : int
            number of realization to generate
        par_file : str
            parameter file to use as mean values. If None,
            use MonteCarlo.pst.parameter_data.parval1.
            Default is None
        obs : bool
            add a realization of measurement noise to observation values,
            forming MonteCarlo.obsensemble. Default is False
        enforce_bounds : str
            enforce parameter bounds based on control file information.
            options are 'reset', 'drop' or None.  Default is None
        cov : Cov
            covariance matrix for the gaussian draw.  If None,
            MonteCarlo.parcov is used.  Cannot be combined with
            how="uniform"
        how : str
            type of distribution to draw from. Must be in ["gaussian","uniform"]
            default is "gaussian".

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(pst="pest.pst")``

        ``>>>mc.draw(1000)``

        """
        if par_file is not None:
            self.pst.parrep(par_file)
        how = how.lower().strip()
        assert how in ["gaussian", "uniform"]

        if cov is not None:
            assert isinstance(cov, Cov)
            if how == "uniform":
                raise Exception("MonteCarlo.draw() error: 'how'='uniform'," +
                                " 'cov' arg cannot be passed")
        else:
            cov = self.parcov

        self.log("generating {0:d} parameter realizations".format(num_reals))

        if how == "gaussian":
            self.parensemble = ParameterEnsemble.from_gaussian_draw(
                pst=self.pst, cov=cov, num_reals=num_reals, use_homegrown=True)
        elif how == "uniform":
            self.parensemble = ParameterEnsemble.from_uniform_draw(
                pst=self.pst, num_reals=num_reals)
        else:
            # still guards the dispatch if the assert above is stripped (-O)
            raise Exception("MonteCarlo.draw(): unrecognized 'how' arg: {0}".format(how))

        if enforce_bounds is not None:
            self.parensemble.enforce(enforce_bounds)
        self.log("generating {0:d} parameter realizations".format(num_reals))

        if obs:
            self.log("generating {0:d} observation realizations".format(num_reals))
            self.obsensemble = ObservationEnsemble.from_id_gaussian_draw(
                pst=self.pst, num_reals=num_reals)
            self.log("generating {0:d} observation realizations".format(num_reals))

    def project_parensemble(self, par_file=None, nsing=None,
                            inplace=True, enforce_bounds='reset'):
        """ perform the null-space projection operations for null-space monte carlo

        Parameters
        ----------
        par_file : str
            an optional file of parameter values to use
        nsing : int
            number of singular values to in forming null subspace matrix
        inplace : bool
            overwrite the existing parameter ensemble with the
            projected values
        enforce_bounds : str
            how to enforce parameter bounds.  can be None, 'reset', or 'drop'.
            Default is 'reset'.
            NOTE(review): this argument is accepted but not used anywhere
            in this method body -- confirm whether it should be forwarded
            to ParameterEnsemble.project()

        Returns
        -------
        par_en : pyemu.ParameterEnsemble
            if inplace is False, otherwise None

        Note
        ----
        to use this method, the MonteCarlo instance must have been constructed
        with the ``jco`` argument.

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(jco="pest.jcb")``

        ``>>>mc.draw(1000)``

        ``>>>mc.project_parensemble(par_file="final.par",nsing=100)``

        """
        assert self.jco is not None, \
            "MonteCarlo.project_parensemble() requires a jacobian attribute"
        if par_file is not None:
            assert os.path.exists(par_file), \
                "MonteCarlo.project_parensemble() error: par_file not found: " + \
                par_file
            self.parensemble.pst.parrep(par_file)

        # project the ensemble
        self.log("projecting parameter ensemble")
        en = self.parensemble.project(self.get_null_proj(nsing),
                                      inplace=inplace, log=self.log)
        self.log("projecting parameter ensemble")
        return en

    def write_psts(self, prefix, existing_jco=None, noptmax=None):
        """ write parameter and optionally observation realizations
            to a series of pest control files

        Parameters
        ----------
        prefix : str
            pest control file prefix

        existing_jco : str
            filename of an existing jacobian matrix to add to the
            pest++ options in the control file.  This is useful for
            NSMC since this jco can be used to get the first set of
            parameter upgrades for free!  Needs to be the path the jco
            file as seen from the location where pest++ will be run

        noptmax : int
            value of NOPTMAX to set in new pest control files

        Example
        -------
        ``>>>import pyemu``

        ``>>>mc = pyemu.MonteCarlo(jco="pest.jcb")``

        ``>>>mc.draw(1000, obs=True)``

        ``>>>mc.write_psts("mc_", existing_jco="pest.jcb", noptmax=1)``

        """
        self.log("writing realized pest control files")
        # get a copy of the pest control file
        pst = self.pst.get(par_names=self.pst.par_names,
                           obs_names=self.pst.obs_names)

        if noptmax is not None:
            pst.control_data.noptmax = noptmax

        if existing_jco is not None:
            pst.pestpp_options["BASE_JACOBIAN"] = existing_jco

        # set the indices so the .loc assignments below are by name
        pst.parameter_data.index = pst.parameter_data.parnme
        pst.observation_data.index = pst.observation_data.obsnme

        if self.parensemble.istransformed:
            par_en = self.parensemble._back_transform(inplace=False)
        else:
            par_en = self.parensemble

        for i in range(self.num_reals):
            pst_name = prefix + "{0:d}.pst".format(i)
            self.log("writing realized pest control file " + pst_name)
            pst.parameter_data.loc[par_en.columns, "parval1"] = par_en.iloc[i, :].T

            # add the obs noise realization if needed; only done when the
            # observation ensemble has one row per parameter realization
            if self.obsensemble.shape[0] == self.num_reals:
                pst.observation_data.loc[self.obsensemble.columns, "obsval"] = \
                    self.obsensemble.iloc[i, :].T

            # write
            pst.write(pst_name)
            self.log("writing realized pest control file " + pst_name)
        self.log("writing realized pest control files")
def initialize(self, num_reals, init_lambda=None):
    """(re)initialize the process: draw the prior ensembles, run the
    initial parameter ensemble and set the starting lambda

    Parameters
    ----------
    num_reals : int
        number of realizations to draw.  Must be greater than 1
    init_lambda : float
        starting lambda.  If None, it is derived from the mean phi of
        the initial ensemble (following chen and oliver)

    Raises
    ------
    NotImplementedError
        if self.restart is set; restarting is not implemented

    """
    assert num_reals > 1

    if self.restart:
        # bail out before the phi report is opened in 'w' mode so an
        # unsupported restart cannot truncate the report of a previous run
        # TODO: restart needs to reload the ensembles written below
        # (paren_prefix / obsen_prefix csv files).  Note the base
        # observation ensemble is saved with index -1, not 0.
        print("restarting...ignoring num_reals")
        raise NotImplementedError()

    # initialize the phi report csv; the handle is kept on self
    self.phi_csv = open(self.pst.filename + ".iobj.csv", 'w')
    self.phi_csv.write("iter_num,total_runs,lambda,min,max,mean,median,std,")
    self.phi_csv.write(','.join(["{0:010d}".format(i + 1)
                                 for i in range(num_reals)]))
    self.phi_csv.write('\n')
    self.total_runs = 0

    # this matrix gets used a lot, so only calc once and store
    self.obscov_inv_sqrt = self.obscov.get(self.pst.nnz_obs_names).inv.sqrt

    self.num_reals = int(num_reals)

    # prior parameter ensemble
    self.parensemble_0 = ParameterEnsemble(self.pst)
    self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
    self.parensemble_0.enforce()
    self.parensemble = self.parensemble_0.copy()
    self.parensemble_0.to_csv(self.pst.filename +
                              self.paren_prefix.format(0))

    # observation noise ensemble
    self.obsensemble_0 = ObservationEnsemble(self.pst)
    self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)

    # save the base obsensemble
    self.obsensemble_0.to_csv(self.pst.filename +
                              self.obsen_prefix.format(-1))
    self.obs0_matrix = self.obsensemble_0.nonzero.as_pyemu_matrix()

    # run the initial parameter ensemble
    self.obsensemble = self._calc_obs(self.parensemble)
    self.obsensemble.to_csv(self.pst.filename +
                            self.obsen_prefix.format(0))

    self.current_phi_vec = self._calc_phi_vec(self.obsensemble)
    self._phi_report(self.current_phi_vec, 0.0)
    self.last_best_mean = self.current_phi_vec.mean()
    self.last_best_std = self.current_phi_vec.std()

    if init_lambda is not None:
        self.current_lambda = float(init_lambda)
    else:
        # following chen and oliver
        x = self.last_best_mean / (2.0 * float(self.obsensemble.shape[1]))
        self.current_lambda = 10.0**(np.floor(np.log10(x)))

    # jwhite - dec 5 2016 - using the actual parcov inv
    # for upgrades seems to be pushing parameters around
    # too much.  for now, just not using it, maybe
    # better choices of lambda will tame it.  The parameter scaling is
    # therefore 1.0 whether or not the approximate form is used.
    self.half_parcov_diag = 1.0

    self.delta_par_prior = self._calc_delta_par(self.parensemble_0)
    u, s, v = self.delta_par_prior.pseudo_inv_components()
    self.Am = u * s.inv

    self.__initialized = True
class EnsembleSmoother():
    """ensemble smoother driven by an external ``sweep`` run

    Parameters
    ----------
    pst : str or Pst
        control file name or Pst instance
    parcov : Cov
        parameter covariance matrix.  If None, it is built with
        Cov.from_parameter_data()
    obscov : Cov
        observation noise covariance matrix.  If None, it is built with
        Cov.from_observation_data()
    """

    def __init__(self, pst, parcov=None, obscov=None):
        if isinstance(pst, str):
            pst = Pst(pst)
        assert isinstance(pst, Pst)
        self.pst = pst

        if parcov is not None:
            assert isinstance(parcov, Cov)
        else:
            parcov = Cov.from_parameter_data(self.pst)
        if obscov is not None:
            assert isinstance(obscov, Cov)
        else:
            obscov = Cov.from_observation_data(pst)
        self.parcov = parcov
        self.obscov = obscov

        self.__initialized = False
        self.num_reals = 0
        self.half_parcov_diag = None
        self.half_obscov_diag = None
        self.delta_par_prior = None
        self.iter_num = 0

    def initialize(self, num_reals):
        """(re)initialize the process

        Draws the initial parameter and observation ensembles, writes
        them to "parensemble.0.csv" and "obsensemble.0.csv", and stores
        the scaled prior parameter differences.

        Parameters
        ----------
        num_reals : int
            number of realizations to draw
        """
        self.num_reals = int(num_reals)

        self.parensemble_0 = ParameterEnsemble(self.pst)
        self.parensemble_0.draw(cov=self.parcov, num_reals=num_reals)
        self.parensemble = self.parensemble_0.copy()
        self.parensemble_0.to_csv("parensemble.0.csv")

        self.obsensemble_0 = ObservationEnsemble(self.pst)
        self.obsensemble_0.draw(cov=self.obscov, num_reals=num_reals)
        self.obsensemble = self.obsensemble_0.copy()
        self.obsensemble_0.to_csv("obsensemble.0.csv")

        # only the diagonal of parcov is used for the parameter scaling
        if self.parcov.isdiagonal:
            self.half_parcov_diag = self.parcov.inv.sqrt
        else:
            self.half_parcov_diag = Cov(x=np.diag(self.parcov.x),
                                        names=self.parcov.col_names,
                                        isdiagonal=True).inv.sqrt

        self.delta_par_prior = self._calc_delta_par()
        u, s, v = self.delta_par_prior.pseudo_inv_components()
        self.Am = u * s.inv

        self.__initialized = True

    def _calc_delta_par(self):
        """calc the scaled parameter ensemble differences from the mean

        Returns
        -------
        the transposed mean-removed ensemble, pre-multiplied by
        half_parcov_diag and scaled by 1/sqrt(num_reals - 1)
        """
        mean = np.array(self.parensemble.mean(axis=0))
        delta = self.parensemble.as_pyemu_matrix()
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = self.half_parcov_diag * delta.T
        return delta * (1.0 / np.sqrt(float(self.num_reals - 1.0)))

    def _calc_delta_obs(self):
        """calc the scaled observation ensemble differences from the mean

        Returns
        -------
        the transposed mean-removed ensemble, pre-multiplied by
        obscov.inv.sqrt and scaled by 1/sqrt(num_reals - 1)
        """
        mean = np.array(self.obsensemble.mean(axis=0))
        delta = self.obsensemble.as_pyemu_matrix()
        for i in range(self.num_reals):
            delta.x[i, :] -= mean
        delta = self.obscov.inv.sqrt * delta.T
        return delta * (1.0 / np.sqrt(float(self.num_reals - 1.0)))

    def _calc_obs(self):
        """propagate the ensemble forward...

        Writes the current parameter ensemble to "sweep_in.csv", runs
        ``sweep`` on the control file and loads "sweep_out.csv" into
        self.obsensemble.

        Raises
        ------
        Exception
            if the sweep command returns a non-zero status
        """
        self.parensemble.to_csv("sweep_in.csv")
        retcode = os.system("sweep {0}".format(self.pst.filename))
        if retcode != 0:
            # without this check a failed run would fall through and
            # read whatever sweep_out.csv is already on disk
            raise Exception("sweep run failed with return code {0}".format(retcode))

        obs = ObservationEnsemble.from_csv("sweep_out.csv")
        obs.columns = [item.lower() for item in obs.columns]
        self.obsensemble = ObservationEnsemble.from_dataframe(
            df=obs.loc[:, self.obscov.row_names], pst=self.pst)
        # todo: modify sweep to be interactive...
        return

    @property
    def current_lambda(self):
        """the lambda used in update(); fixed at 1.0"""
        return 1.0

    def update(self):
        """run the current parameter ensemble and apply one upgrade to it

        Writes the observation ensemble, the upgrade(s) and the updated
        parameter ensemble to csv files tagged with the iteration number.

        Raises
        ------
        Exception
            if initialize() has not been called
        """
        if not self.__initialized:
            raise Exception("must call initialize() before update()")
        self._calc_obs()
        self.iter_num += 1
        self.obsensemble.to_csv("obsensemble.{0}.csv".format(self.iter_num))

        delta_obs = self._calc_delta_obs()
        u, s, v = delta_obs.pseudo_inv_components()

        scaled_par_diff = self._calc_delta_par()
        scaled_obs_diff = self.obsensemble.as_pyemu_matrix() - \
            self.obsensemble_0.as_pyemu_matrix()
        scaled_ident = (self.current_lambda * Cov.identity_like(s) + s**2).inv

        x1 = u.T * self.obscov.inv.sqrt * scaled_obs_diff.T
        x1.autoalign = False
        x2 = scaled_ident * x1
        x3 = v * s * x2
        upgrade_1 = -1.0 * (self.half_parcov_diag * scaled_par_diff *
                            x3).to_dataframe()
        upgrade_1.index.name = "parnme"
        upgrade_1.T.to_csv("upgrade_1.{0}.csv".format(self.iter_num))
        self.parensemble += upgrade_1.T

        # the second upgrade term uses the difference from the initial
        # parameter ensemble and is only applied after the first iteration
        if self.iter_num > 1:
            par_diff = (self.parensemble - self.parensemble_0).\
                as_pyemu_matrix().T
            x4 = self.Am.T * self.half_parcov_diag * par_diff
            x5 = self.Am * x4
            x6 = scaled_par_diff.T * x5
            x7 = v * scaled_ident * v.T * x6
            upgrade_2 = -1.0 * (self.half_parcov_diag *
                                scaled_par_diff * x7).to_dataframe()
            upgrade_2.index.name = "parnme"
            upgrade_2.T.to_csv("upgrade_2.{0}.csv".format(self.iter_num))
            self.parensemble += upgrade_2.T

        self.parensemble.to_csv("parensemble.{0}.csv".format(self.iter_num))