def summary_table_memory( obs, noisemodel, sedgrid, keys=None, gridbackend="memory", threshold=-10, save_every_npts=None, lnp_npts=None, resume=False, max_nbins=100, stats_outname=None, pdf1d_outname=None, pdf2d_outname=None, pdf2d_param_list=None, grid_info_dict=None, lnp_outname=None, use_full_cov_matrix=True, surveyname="PHAT", extraInfo=False, do_not_normalize=False, ): """ Do the fitting in memory Parameters ---------- obs : Observation object instance observation catalog noisemodel : beast noisemodel instance noise model data sedgrid : str or grid.SEDgrid instance model grid keys : str or list of str if str - name of the quantity or expression to evaluate from the grid table if list - list of quantities or expresions gridbackend : str or grid.GridBackend backend to use to load the grid if necessary (memory, cache, hdf) (see beast.core.grid) save_every_npts : integer set to save the files below (if set) every n stars a requirement for recovering from partially complete runs resume : bool set to designate this run is resuming a partially complete run use_full_cov_matrix : bool set to use the full covariance matrix if it is present in the noise model file max_nbins : int (default=100) maxiumum number of bins to use for the 1D likelihood calculations stats_outname : str set to output the stats file into a FITS file with extensions pdf1d_outname : str set to output the 1D PDFs into a FITS file with extensions pdf2d_outname : str set to output the 2D PDFs into a FITS file with extensions pdf2d_param_list : list of strings or None set to the parameters for which to make the 2D PDFs grid_info_dict : dict Set to override the mins/maxes of the 1dpdfs, and the number of unique values. lnp_outname : str set to output the sparse likelihoods into a (usually HDF5) file threshold : float value above which to use/save for the lnps (defines the sparse likelihood) lnp_npts : int set to a number to output a random sampling of the lnp points above the threshold. otherwise, the full sparse likelihood is output surveyname : str name of survey [default = 'PHAT'] extraInfo : bool set to get extra information, such as IAU name, brick, field, etc. do_not_normalize : bool Do not normalize the prior weights before applying them. This should have no effect on the final outcome when using only a single grid, but is essential when using the subgridding approach. Returns ------- N/A """ if type(sedgrid) == str: g0 = grid.SEDGrid(sedgrid, backend=gridbackend) else: g0 = sedgrid if keys is None: keys = list(g0.keys()) # make sure keys are real keys skip_keys = "osl keep weight grid_weight prior_weight fullgrid_idx stage specgrid_indx".split( ) keys = [k for k in keys if k not in skip_keys] for key in keys: if not (key in list(g0.keys())): raise KeyError('Key "{0}" not recognized'.format(key)) # make sure there are 2D PDF params if needed if (pdf2d_outname is not None) and (pdf2d_param_list is None): raise KeyError("pdf2d_param_list cannot be None if saving 2D PDFs") # generate an IAU complient name for each source and add other inform res = IAU_names_and_extra_info(obs, surveyname=surveyname, extraInfo=False) Q_all_memory( res, obs, g0, noisemodel, keys, p=[16.0, 50.0, 84.0], resume=resume, threshold=threshold, save_every_npts=save_every_npts, lnp_npts=lnp_npts, max_nbins=max_nbins, stats_outname=stats_outname, pdf1d_outname=pdf1d_outname, pdf2d_outname=pdf2d_outname, pdf2d_param_list=pdf2d_param_list, grid_info_dict=grid_info_dict, lnp_outname=lnp_outname, use_full_cov_matrix=use_full_cov_matrix, do_not_normalize=do_not_normalize, )
def Q_all_memory( prev_result, obs, sedgrid, obsmodel, qnames_in, p=[16.0, 50.0, 84.0], gridbackend="cache", max_nbins=100, stats_outname=None, pdf1d_outname=None, pdf2d_outname=None, pdf2d_param_list=None, grid_info_dict=None, lnp_outname=None, lnp_npts=None, save_every_npts=None, threshold=-40, resume=False, use_full_cov_matrix=True, do_not_normalize=False, ): """ Fit each star, calculate various fit statistics, and output them to files. All done in one function for speed and ability to resume partially completed runs. Parameters ---------- prev_result : dict previous results to include in the output summary table usually basic data on each source obs : Observation object instance observation catalog sedgrid : str or grid.SEDgrid instance model grid obsmodel : beast noisemodel instance noise model data qnames : list names of quantities p : array-like list of percentile values gridbackend : str or grid.GridBackend backend to use to load the grid if necessary (memory, cache, hdf) (see beast.core.grid) max_nbins : int (default=100) maxiumum number of bins to use for the 1D likelihood calculations save_every_npts : int set to save the files below (if set) every n stars a requirement for recovering from partially complete runs resume : bool set to designate this run is resuming a partially complete run use_full_cov_matrix : bool set to use the full covariance matrix if it is present in the noise model file stats_outname : str set to output the stats file into a FITS file with extensions pdf1d_outname : str set to output the 1D PDFs into a FITS file with extensions pdf2d_outname : str set to output the 2D PDFs into a FITS file with extensions pdf2d_param_list : list of strs or None set to the parameters for which to make the 2D PDFs grid_info_dict : dict Set to override the mins/maxes of the 1dpdfs, and the number of unique values lnp_outname : str set to output the sparse likelihoods into a (usually HDF5) file threshold : float value above which to use/save for the lnps (defines the sparse likelihood) lnp_npts : int set to a number to output a random sampling of the lnp points above the threshold. Otherwise, the full sparse likelihood is output. do_not_normalize: bool Do not normalize the prior weights before applying them. This should have no effect on the final outcome when using only a single grid, but is essential when using the subgridding approach. Returns ------- N/A """ if type(sedgrid) == str: g0 = grid.SEDGrid(sedgrid, backend=gridbackend) else: g0 = sedgrid # remove weights that are less than zero (g0_indxs, ) = np.where(g0["weight"] > 0.0) g0_weights = np.log(g0["weight"][g0_indxs]) if not do_not_normalize: # this variable used on the next line, so is used regardless of what flake8 says g0_weights_sum = np.log(g0["weight"][g0_indxs].sum()) # noqa: E302 g0_weights = numexpr.evaluate("g0_weights - g0_weights_sum") if len(g0["weight"]) != len(g0_indxs): print("some zero weight models exist") print("orig/g0_indxs", len(g0["weight"]), len(g0_indxs)) # get the model SEDs if hasattr(g0.seds, "read"): _seds = g0.seds.read() else: _seds = g0.seds # links to errors and biases ast_error = obsmodel["error"] ast_bias = obsmodel["bias"] # if the ast file includes the full covariance matrices, make links full_cov_mat = False if (use_full_cov_matrix & ("q_norm" in obsmodel.keys()) & ("icov_diag" in obsmodel.keys()) & ("icov_offdiag" in obsmodel.keys())): full_cov_mat = True ast_q_norm = obsmodel["q_norm"] ast_icov_diag = obsmodel["icov_diag"] two_ast_icov_offdiag = 2.0 * obsmodel["icov_offdiag"] else: ast_ivar = 1.0 / np.asfortranarray(ast_error)**2 if full_cov_mat: print("using full covariance matrix") else: print("not using full covariance matrix") # number of observed SEDs to fit nobs = len(obs) # augment the qnames to include the *full* model SED # by this it means the physical model flux plus the noise model bias term qnames = qnames_in filters = sedgrid.filters for i, cfilter in enumerate(filters): qnames.append("symlog" + cfilter + "_wd_bias") # create the full model fluxes for later use # save as symmetric log, since the fluxes can be negative model_seds_with_bias = np.asfortranarray(_seds + ast_bias) # full_model_flux = np.sign(logtempseds) * np.log10(1 + np.abs(logtempseds * math.log(10))) full_model_flux = (np.sign(model_seds_with_bias) * np.log1p(np.abs(model_seds_with_bias * math.log(10))) / math.log(10)) # setup the arrays to temp store the results n_qnames = len(qnames) n_pers = len(p) best_vals = np.zeros((nobs, n_qnames)) exp_vals = np.zeros((nobs, n_qnames)) per_vals = np.zeros((nobs, n_qnames, n_pers)) chi2_vals = np.zeros(nobs) chi2_indx = np.zeros(nobs) lnp_vals = np.zeros(nobs) lnp_indx = np.zeros(nobs) best_specgrid_indx = np.zeros(nobs) total_log_norm = np.zeros(nobs) # variable to save the lnp files save_lnp_vals = [] # setup the mapping for the 1D PDFs fast_pdf1d_objs = [] save_pdf1d_vals = [] # make 1D PDF objects for qname in qnames: # get bin properties qname_vals, nbins, logspacing, minval, maxval = setup_param_bins( qname, max_nbins, g0, full_model_flux, filters, grid_info_dict) # generate the fast 1d pdf mapping _tpdf1d = pdf1d(qname_vals, nbins, logspacing=logspacing, minval=minval, maxval=maxval) fast_pdf1d_objs.append(_tpdf1d) # setup the arrays to save the 1d PDFs save_pdf1d_vals.append(np.zeros((nobs + 1, nbins))) save_pdf1d_vals[-1][-1, :] = _tpdf1d.bin_vals # if chosen, make 2D PDFs if pdf2d_outname is not None: # setup the 2D PDFs _pdf2d_params = [ qname for qname in qnames if qname in pdf2d_param_list and len(np.unique(g0[qname])) > 1 ] _n_params = len(_pdf2d_params) pdf2d_qname_pairs = [ _pdf2d_params[i] + "+" + _pdf2d_params[j] for i in range(_n_params) for j in range(i + 1, _n_params) ] fast_pdf2d_objs = [] save_pdf2d_vals = [] # make 2D PDF objects for qname_pair in pdf2d_qname_pairs: qname_1, qname_2 = qname_pair.split("+") # get bin properties ( qname_vals_p1, nbins_p1, logspacing_p1, minval_p1, maxval_p1, ) = setup_param_bins(qname_1, max_nbins, g0, full_model_flux, filters, grid_info_dict) ( qname_vals_p2, nbins_p2, logspacing_p2, minval_p2, maxval_p2, ) = setup_param_bins(qname_2, max_nbins, g0, full_model_flux, filters, grid_info_dict) # make 2D PDF _tpdf2d = pdf2d( qname_vals_p1, qname_vals_p2, nbins_p1, nbins_p2, logspacing_p1=logspacing_p1, logspacing_p2=logspacing_p2, minval_p1=minval_p1, maxval_p1=maxval_p1, minval_p2=minval_p2, maxval_p2=maxval_p2, ) fast_pdf2d_objs.append(_tpdf2d) # arrays for the PDFs and bins save_pdf2d_vals.append(np.zeros((nobs + 2, nbins_p1, nbins_p2))) save_pdf2d_vals[-1][-2, :, :] = np.tile(_tpdf2d.bin_vals_p1, (nbins_p2, 1)).T save_pdf2d_vals[-1][-1, :, :] = np.tile(_tpdf2d.bin_vals_p2, (nbins_p1, 1)) # if this is a resume job, read in the already computed stats and # fill the variables # also - find the start position for the resumed run if resume: stats_table = Table.read(stats_outname) for k, qname in enumerate(qnames): best_vals[:, k] = stats_table["{0:s}_Best".format(qname)] exp_vals[:, k] = stats_table["{0:s}_Exp".format(qname)] for i, pval in enumerate(p): per_vals[:, k, i] = stats_table["{0:s}_p{1:d}".format( qname, int(pval))] chi2_vals = stats_table["chi2min"] chi2_indx = stats_table["chi2min_indx"] lnp_vals = stats_table["Pmax"] lnp_indx = stats_table["Pmax_indx"] best_specgrid_indx = stats_table["specgrid_indx"] (indxs, ) = np.where(stats_table["Pmax"] != 0.0) start_pos = max(indxs) + 1 print("resuming run with start indx = " + str(start_pos) + " out of " + str(len(stats_table["Pmax"]))) # read in the already computed 1D PDFs if pdf1d_outname is not None: print("restoring the already computed 1D PDFs from " + pdf1d_outname) with fits.open(pdf1d_outname) as hdulist: for k in range(len(qnames)): save_pdf1d_vals[k] = hdulist[k + 1].data # read in the already computed 2D PDFs if pdf2d_outname is not None: print("restoring the already computed 2D PDFs from " + pdf2d_outname) with fits.open(pdf2d_outname) as hdulist: for k in range(len(pdf2d_qname_pairs)): save_pdf2d_vals[k] = hdulist[k + 1].data else: start_pos = 0 # setup a new lnp file if lnp_outname is not None: outfile = tables.open_file(lnp_outname, "w") # Save wavelengths in root, remember #n_stars = root._v_nchildren -1 outfile.create_array(outfile.root, "grid_waves", g0.lamb[:]) filters = obs.getFilters() outfile.create_array(outfile.root, "obs_filters", filters[:]) outfile.close() # loop over the objects and get all the requested quantities g0_specgrid_indx = g0["specgrid_indx"] _p = np.asarray(p, dtype=float) it = tqdm( islice(obs.enumobs(), int(start_pos), None), total=len(obs) - start_pos, desc="Calculating Lnp/Stats", ) for e, obj in it: # calculate the full nD posterior (sed) = obj cur_mask = sed == 0 # need an alternate way to generate the mask as zeros can be # valid values in the observed SED (KDG 29 Jan 2016) # currently, set mask to False always cur_mask[:] = False if full_cov_mat: (lnp, chi2) = N_covar_logLikelihood( sed, model_seds_with_bias, ast_q_norm, ast_icov_diag, two_ast_icov_offdiag, lnp_threshold=abs(threshold), ) else: (lnp, chi2) = N_logLikelihood_NM( sed, model_seds_with_bias, ast_ivar, mask=cur_mask, lnp_threshold=abs(threshold), ) lnp = lnp[g0_indxs] chi2 = chi2[g0_indxs] # lnp = numexpr.evaluate('lnp + g0_weights') lnp += g0_weights # multiply by the prior weights (sum in log space) (indx, ) = np.where((lnp - max(lnp[np.isfinite(lnp)])) > threshold) # now generate the sparse likelihood (remove later if this works # by updating code below) # checked if changing to the full likelihood speeds things up # - the answer is no # and is likely related to the switch here to the sparse # likelihood for the weight calculation lnps = lnp[indx] chi2s = chi2[indx] # log_norm = np.log(getNorm_lnP(lnps)) # if not np.isfinite(log_norm): # log_norm = lnps.max() log_norm = lnps.max() weights = np.exp(lnps - log_norm) # normalize the weights make sure they sum to one # needed for np.random.choice weight_sum = np.sum(weights) weights /= weight_sum # save the current set of lnps if lnp_outname is not None: if lnp_npts is not None: if lnp_npts < len(indx): rindx = np.random.choice(indx, size=lnp_npts, replace=False) if lnp_npts >= len(indx): rindx = indx else: rindx = indx save_lnp_vals.append([ e, np.array(g0_indxs[rindx], dtype=np.int64), np.array(lnp[rindx], dtype=np.float32), np.array(chi2[rindx], dtype=np.float32), np.array([sed]).T, ]) # To merge the stats for different subgrids, we need the total # weight of a grid, which is sum(exp(lnps)). Since sum(exp(lnps # - log_norm - log(weight_sum))) = 1, the relative weight of # each subgrid will be exp(log_norm + log(weight_sum)). # Therefore, we also store the following quantity: total_log_norm[e] = log_norm + np.log(weight_sum) # index to the full model grid for the best fit values best_full_indx = g0_indxs[indx[weights.argmax()]] # index to the spectral grid best_specgrid_indx[e] = g0_specgrid_indx[best_full_indx] # goodness of fit quantities chi2_vals[e] = chi2s.min() chi2_indx[e] = g0_indxs[indx[chi2s.argmin()]] lnp_vals[e] = lnps.max() lnp_indx[e] = best_full_indx # calculate quantities for individual parameters: # best value, expectation value, 1D PDF, percentiles for k, qname in enumerate(qnames): if "_bias" in qname: fname = (qname.replace("_wd_bias", "")).replace("symlog", "") q = full_model_flux[:, filters.index(fname)] else: q = g0[qname] # best value best_vals[e, k] = q[best_full_indx] # expectation value exp_vals[e, k] = expectation(q[g0_indxs[indx]], weights=weights) # percentile values pdf1d_bins, pdf1d_vals = fast_pdf1d_objs[k].gen1d( g0_indxs[indx], weights) save_pdf1d_vals[k][e, :] = pdf1d_vals if pdf1d_vals.max() > 0: # remove normalization to allow for post processing with # different distance runs (needed for the SMIDGE-SMC) # pdf1d_vals /= pdf1d_vals.max() per_vals[e, k, :] = percentile(pdf1d_bins, _p, weights=pdf1d_vals) else: per_vals[e, k, :] = [0.0, 0.0, 0.0] # calculate 2D PDFs for the subset of parameter pairs if pdf2d_outname is not None: for k in range(len(pdf2d_qname_pairs)): save_pdf2d_vals[k][e, :, :] = fast_pdf2d_objs[k].gen2d( g0_indxs[indx], weights) # incremental save (useful if job dies early to recover most # of the computations) if save_every_npts is not None: if (e > 0) & (e % save_every_npts == 0): # save the 1D PDFs if pdf1d_outname is not None: save_pdf1d(pdf1d_outname, save_pdf1d_vals, qnames) # save the 2D PDFs if pdf2d_outname is not None: save_pdf2d(pdf2d_outname, save_pdf2d_vals, pdf2d_qname_pairs) # save the stats/catalog if stats_outname is not None: save_stats( stats_outname, prev_result, best_vals, exp_vals, per_vals, chi2_vals, chi2_indx, lnp_vals, lnp_indx, best_specgrid_indx, total_log_norm, qnames, p, ) # save the lnps if lnp_outname is not None: save_lnp(lnp_outname, save_lnp_vals) save_lnp_vals = [] # do the final save of everything (or the last set for the lnp values) # save the 1D PDFs if pdf1d_outname is not None: save_pdf1d(pdf1d_outname, save_pdf1d_vals, qnames) # save the 2D PDFs if pdf2d_outname is not None: save_pdf2d(pdf2d_outname, save_pdf2d_vals, pdf2d_qname_pairs) # save the stats/catalog if stats_outname is not None: save_stats( stats_outname, prev_result, best_vals, exp_vals, per_vals, chi2_vals, chi2_indx, lnp_vals, lnp_indx, best_specgrid_indx, total_log_norm, qnames, p, ) # save the lnps if lnp_outname is not None: save_lnp(lnp_outname, save_lnp_vals)