def get_inference_data(self, data, eight_schools_params):
    return convert_to_inference_data(
        data.obj,
        group="posterior",
        coords={"school": np.arange(eight_schools_params["J"])},
        dims={"theta": ["school"], "eta": ["school"]},
    )
def _convert_pyjags_samples_dict_to_arviz_inference_data(
        samples: tp.Dict[str, np.ndarray]) -> az.InferenceData:
    """
    Convert a PyJAGS samples dictionary to an ArviZ InferenceData object.

    Takes a Python dictionary of samples generated by the sample method of a
    model instance and returns an ArviZ InferenceData object.

    Parameters
    ----------
    samples: a dictionary mapping variable names to NumPy arrays with shape
             (parameter_dimension, chain_length, number_of_chains)

    Returns
    -------
    An ArviZ InferenceData object
    """
    # PyJAGS returns a dictionary of NumPy arrays with shape
    # (parameter_dimension, chain_length, number_of_chains),
    # but ArviZ expects samples with shape
    # (number_of_chains, chain_length, parameter_dimension).
    return az.convert_to_inference_data(
        _convert_pyjags_samples_dictionary_to_arviz_samples_dictionary(
            samples))
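# Hedged sketch (an assumption, not the project's actual helper): the call above
# relies on _convert_pyjags_samples_dictionary_to_arviz_samples_dictionary to move
# axes from (parameter_dimension, chain_length, number_of_chains) to the
# (number_of_chains, chain_length, parameter_dimension) layout ArviZ expects.
# A minimal version of that transposition could look like this:
import typing as tp

import numpy as np


def _transpose_pyjags_samples_sketch(
        samples: tp.Dict[str, np.ndarray]) -> tp.Dict[str, np.ndarray]:
    # swap the parameter and chain axes; the draw axis stays in the middle
    return {name: np.swapaxes(values, 0, 2)
            for name, values in samples.items()}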
def plot_trace(trace: Dict[str, ndarray], show: bool = False) -> Figure:
    """Use `Arviz` to plot a trace of the variable parameters, alongside
    a histogram of their distribution.

    Parameters
    ----------
    trace: dict of str and numpy.ndarray
        The parameter trace with shape=(n_steps, n_variable_parameters)
    show: bool
        If true, the plot will be shown.

    Returns
    -------
    matplotlib.pyplot.Figure
        The plotted figure.
    """
    data = arviz.convert_to_inference_data(trace)

    axes = arviz.plot_trace(data)
    figure = axes[0][0].figure

    if show:
        figure.show()

    return figure
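# Hedged usage sketch for the plot_trace helper above; the parameter names
# ("theta", "sigma"), the number of steps, and the 1-D per-parameter arrays
# are illustrative assumptions, not taken from the original project.
import numpy as np

trace = {
    "theta": np.random.randn(1000),          # one value per sampling step
    "sigma": np.abs(np.random.randn(1000)),  # a positive scale parameter
}
figure = plot_trace(trace, show=False)
figure.savefig("trace.png")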
def plot_trace(self, trace, show=False):
    """Use `Arviz` to plot a trace of the trainable parameters, alongside
    a histogram of their distribution.

    Parameters
    ----------
    trace: numpy.ndarray
        The parameter trace with shape=(n_steps, n_trainable_parameters+1)
    show: bool
        If true, the plot will be shown.

    Returns
    -------
    matplotlib.pyplot.Figure
        The plotted figure.
    """
    trace_dict = {}

    for index, label in enumerate(self._prior_labels):
        trace_dict[label] = trace[:, index + 1]

    data = arviz.convert_to_inference_data(trace_dict)

    axes = arviz.plot_trace(data)
    figure = axes[0][0].figure

    if show:
        figure.show()

    return figure
def to_arviz(name, chain):
    import arviz as az

    # promote a 1-D chain of draws to shape (draws, 1)
    if len(chain.shape) == 1:
        chain = chain.reshape(-1, 1)

    # add a leading chain axis so the array is (chain, draw, dim)
    return az.convert_to_inference_data({name: chain[np.newaxis, :, :]})
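# Hedged usage sketch for to_arviz above: a 1-D chain of draws for a single
# scalar parameter ("mu" is an illustrative name) is promoted to the
# (chain, draw, dim) layout before conversion.
import numpy as np

chain = np.random.randn(2000)        # draws from a single chain
idata = to_arviz("mu", chain)
print(idata.posterior["mu"].shape)   # -> (1, 2000, 1)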
def get_inference_data(self): return convert_to_inference_data( self.obj, group="posterior", coords={"school": np.arange(self.data["J"])}, dims={"theta": ["school"], "theta_tilde": ["school"]}, )
def get_inference_data(self, data):
    return convert_to_inference_data(
        data.obj,
        group="posterior",
        coords={"school": np.arange(8)},
        dims={"theta": ["school"], "eta": ["school"]},
    )
def test_id_conversion_args(self):
    stored = load_arviz_data("centered_eight")
    IVIES = [
        "Yale", "Harvard", "MIT", "Princeton", "Cornell", "Dartmouth",
        "Columbia", "Brown"
    ]
    # test dictionary argument...
    # I reverse engineered a dictionary out of the centered_eight
    # data. That's what this block of code does.
    d = stored.posterior.to_dict()
    d = d["data_vars"]
    test_dict = {}  # type: Dict[str, np.ndarray]
    for var_name in d:
        data = d[var_name]["data"]
        # this is a list of chains that is a list of samples...
        chain_arrs = []
        for chain in data:  # list of samples
            chain_arrs.append(np.array(chain))
        data_arr = np.stack(chain_arrs)
        test_dict[var_name] = data_arr

    inference_data = convert_to_inference_data(
        test_dict, dims={"theta": ["Ivies"]}, coords={"Ivies": IVIES})

    assert isinstance(inference_data, InferenceData)
    assert set(
        inference_data.posterior.coords["Ivies"].values) == set(IVIES)
    assert inference_data.posterior["theta"].dims == ("chain", "draw", "Ivies")
def effective_sample_size(self):
    try:
        arviz_samples = az.convert_to_inference_data(self.samples)
        ess = az.ess(arviz_samples)
    except ModuleNotFoundError:
        print("Summary relies on arviz and arviz is not installed")
        ess = None
    return ess
def get_inference_data(self):
    return convert_to_inference_data(
        self.obj,
        group='posterior',
        coords={'school': np.arange(self.data['J'])},
        dims={
            'theta': ['school'],
            'theta_tilde': ['school']
        },
    )
def _create_inference_data(self, chains):
    if len(chains) > 1:
        data = {
            name: np.stack([c[name] for c in chains])
            for name in chains[0]._names
        }
    else:
        data = {name: chains[0][name][None] for name in chains[0]._names}
    return az.convert_to_inference_data(data).posterior
def test_nd_to_inference_data(self):
    shape = (1, 2, 3, 4, 5)
    inference_data = convert_to_inference_data(np.random.randn(*shape),
                                               group="foo")
    assert hasattr(inference_data, "foo")
    assert len(inference_data.foo.data_vars) == 1
    var_name = list(inference_data.foo.data_vars)[0]

    assert len(inference_data.foo.coords) == len(shape)
    assert inference_data.foo.chain.shape == shape[:1]
    assert inference_data.foo.draw.shape == shape[1:2]
    assert inference_data.foo[var_name].shape == shape
def test_more_chains_than_draws(self):
    shape = (10, 4)
    with pytest.warns(SyntaxWarning):
        inference_data = convert_to_inference_data(np.random.randn(*shape),
                                                   group="foo")
    assert hasattr(inference_data, "foo")
    assert len(inference_data.foo.data_vars) == 1
    var_name = list(inference_data.foo.data_vars)[0]

    assert len(inference_data.foo.coords) == len(shape)
    assert inference_data.foo.chain.shape == shape[:1]
    assert inference_data.foo.draw.shape == shape[1:2]
    assert inference_data.foo[var_name].shape == shape
def test_nd_to_inference_data(self):
    shape = (1, 2, 3, 4, 5)
    inference_data = convert_to_inference_data(np.random.randn(*shape),
                                               group="prior")
    assert hasattr(inference_data, "prior")
    assert len(inference_data.prior.data_vars) == 1
    var_name = list(inference_data.prior.data_vars)[0]

    assert len(inference_data.prior.coords) == len(shape)
    assert inference_data.prior.chain.shape == shape[:1]
    assert inference_data.prior.draw.shape == shape[1:2]
    assert inference_data.prior[var_name].shape == shape
    assert repr(inference_data).startswith("Inference data with groups")
def test_nd_to_inference_data(self):
    shape = (1, 2, 3, 4, 5)
    inference_data = convert_to_inference_data(
        xr.DataArray(np.random.randn(*shape),
                     dims=("chain", "draw", "dim_0", "dim_1", "dim_2")),
        group="prior",
    )
    var_name = list(inference_data.prior.data_vars)[0]

    assert hasattr(inference_data, "prior")
    assert len(inference_data.prior.data_vars) == 1
    assert inference_data.prior.chain.shape == shape[:1]
    assert inference_data.prior.draw.shape == shape[1:2]
    assert inference_data.prior[var_name].shape == shape
def neff_det_check_plot(c):
    fit = az.convert_to_inference_data(c)

    az.plot_density(fit, var_names=['neff_det'], credible_interval=0.99)
    xlabel(r'$N_\mathrm{eff}$')
    ylabel(r'$p\left( N_\mathrm{eff} \right)$')

    nobs = c.posterior['m1s'].shape[2]
    axvline(4 * nobs)

    nemin = percentile(c.posterior['neff_det'], 2.5)
    title(r'Two-sigma lower $N_\mathrm{{eff}}$ is factor {:.2f} above limit'.
          format(nemin / (4 * nobs)))
def test_monte_carlo_format(self):
    # Note that this file is empty; we are using it as a
    # placeholder for the az InferenceData object.
    filepath = self.get_data_path('monte-carlo-samples.az')
    size = 100
    dataset = az.convert_to_inference_data(np.random.randn(size))
    dataset.to_netcdf(filepath)

    temp_dir = self.temp_dir.name
    shutil.copy(filepath, os.path.join(temp_dir, 'monte-carlo-samples.az'))

    format = MonteCarloTensorDirectoryFormat(temp_dir, mode='r')
    format.validate()
def test_id_conversion_idempotent(self):
    stored = load_arviz_data("centered_eight")
    inference_data = convert_to_inference_data(stored)
    assert isinstance(inference_data, InferenceData)
    assert set(inference_data.observed_data.obs.coords["school"].values) == {
        "Hotchkiss",
        "Mt. Hermon",
        "Choate",
        "Deerfield",
        "Phillips Andover",
        "St. Paul's",
        "Lawrenceville",
        "Phillips Exeter",
    }
    assert inference_data.posterior["theta"].dims == ("chain", "draw", "school")
def posterior(self):
    """
    To drop a number of burn-in draws, you need to index using `xarray`
    syntax. In this case, it looks like:

    ```helper.posterior.sel(draw=slice(3000, None))```

    to keep draws 3000 to the end. Unfortunately, negative indices such as
    -1000 do not appear to work with this syntax.

    Returns
    -------
    arviz.InferenceData
        The stored chain converted to an InferenceData object.
    """
    return arviz.convert_to_inference_data(self.chain)
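# Hedged usage sketch of the burn-in slicing described in the docstring above;
# "helper" stands in for an instance of the surrounding class, and the cut
# point of 3000 draws is just the docstring's example value.
idata = helper.posterior
trimmed = idata.sel(draw=slice(3000, None))  # keep draw 3000 onward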
def _get_posterior_samples_from_matrix_as_inferencedata(
        self, matrix: torch.Tensor) -> az.InferenceData:
    np_samples_list = self._get_posterior_samples_from_matrix_as_numpy(
        matrix)
    dictdata = {}
    for param_name in self.model.parameters.keys():
        # First dim should be "chain" for conversion to InferenceData.
        dictdata[param_name] = np.stack(
            [sample[param_name] for sample in np_samples_list]
        )[np.newaxis, ...]
    infdata = az.convert_to_inference_data(dictdata)
    infdata.posterior.attrs["inference_library"] = "smallx"
    infdata.posterior.attrs["inference_library_version"] = "0.0"
    return infdata
def traceplot(c):
    fit = az.convert_to_inference_data(c)

    lines = (('H0', {}, true_params['H0']),
             ('Om', {}, true_params['Om']),
             ('w0', {}, true_params['w']),
             ('R0_30', {}, true_params['R0_30']),
             ('MMin', {}, true_params['MMin']),
             ('MMax', {}, true_params['MMax']),
             ('smooth_min', {}, true_params['smooth_min']),
             ('smooth_max', {}, true_params['smooth_max']),
             ('alpha', {}, true_params['alpha']),
             ('beta', {}, true_params['beta']),
             ('gamma', {}, true_params['gamma']))

    az.plot_trace(fit,
                  var_names=[
                      'H0', 'Om', 'w0', 'R0_30', 'MMax', 'smooth_max',
                      'alpha', 'beta', 'gamma'
                  ],
                  lines=lines)
def _sample_model(model, data, refresh=100, **kwargs):
    print()
    print("Data")
    print("----")
    for k, v in data.items():
        if numpy.shape(v) == ():
            print(' %-10s: %s' % (k, v))
        else:
            print(' %-10s: shape %s [%s ... %s]' % (
                k, numpy.shape(v), numpy.min(v), numpy.max(v)))
    print()
    print("sampling from model ...")
    fit = model.sampling(data=data, refresh=refresh, **kwargs)
    print("processing results ...")
    print(fit)
    print("checking results ...")
    check_all_diagnostics(
        fit,
        max_treedepth=kwargs.get('control', {}).get('max_treedepth', 10),
        quiet=False)
    return arviz.convert_to_inference_data(fit)
def test_convert_to_dataset_bad(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    filename = str(tmpdir.join("test_file.nc"))
    first.to_netcdf(filename)
    with pytest.raises(ValueError):
        convert_to_dataset(filename, group="bar")
def test_convert_to_inference_data_bad():
    with pytest.raises(ValueError):
        convert_to_inference_data(1)
def test_convert_to_inference_data_from_file(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    filename = str(tmpdir.join("test_file.nc"))
    first.to_netcdf(filename)
    second = convert_to_inference_data(filename)
    assert first.prior.equals(second.prior)
def test_convert_to_inference_data_idempotent():
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    second = convert_to_inference_data(first)
    assert first.prior is second.prior
def main():
    parser = argparse.ArgumentParser(
        description='Train PMF on CSV-formatted count matrix')
    parser.add_argument(
        '-f', '--csv-file', nargs='?', type=str,
        help="Enter the CSV file"
    )
    parser.add_argument(
        '-e', '--epoch', nargs='?', type=int, default=300,
        help='Enter Epoch value: Default: 300'
    )
    parser.add_argument(
        '-d', '--dimension', nargs='?', type=int, default=2,
        help='Enter embedding dimension. Default: 2'
    )
    parser.add_argument(
        '-b', '--batch-size', nargs='?', type=int, default=5000,
        help='Enter batch size. Default: 5000'
    )
    parser.add_argument(
        '-lr', '--learning-rate', nargs='?', type=float, default=0.01,
        help='Enter float. Default: 0.01'
    )
    parser.add_argument(
        '-c', '--clip-value', nargs='?', type=float, default=3.,
        help='Gradient clip value. Default: 3.0'
    )
    parser.add_argument(
        '-lt', '--log-transform', help='Log-transform?',
        action='store_true'
    )
    parser.add_argument(
        '-rn', '--row-normalize', help='Row normalize based on counts?',
        action='store_true'
    )

    args = parser.parse_args(sys.argv[1:])
    if args.csv_file is None:
        sys.exit("You need to specify a csv file")
    elif not os.path.exists(args.csv_file):
        sys.exit("File doesn't exist")
    else:
        _FILENAME = args.csv_file
        _BATCH_SIZE = args.batch_size
        _LOG_TRANSFORM = args.log_transform
        _EPOCH_NUMBER = args.epoch
        _DIMENSION = args.dimension
        _LEARNING_RATE = args.learning_rate
        _ROW_NORMALIZE = args.row_normalize
        _CLIP_VALUE = args.clip_value

    with open(_FILENAME) as f:
        csv_file = csv.reader(f)
        columns = len(next(csv_file))

    csv_data0 = tf.data.experimental.CsvDataset(
        _FILENAME, [tf.float64] * columns)
    csv_data0 = csv_data0.enumerate()
    csv_data = csv_data0.map(
        lambda j, *x: {
            'indices': j,
            'counts': tf.squeeze(tf.stack(x, axis=-1))
        })

    # Grab a batch to compute statistics
    colsums = []
    batch_sizes = []
    N = 0
    for batch in iter(csv_data.batch(_BATCH_SIZE, drop_remainder=False)):
        colsums += [tf.reduce_sum(batch['counts'], axis=0, keepdims=True)]
        N += batch['counts'].shape[0]

    colsums = tf.add_n(colsums)
    colmeans = colsums / N
    rowmean = tf.reduce_sum(colmeans)

    if _ROW_NORMALIZE:
        csv_data = csv_data0.map(
            lambda j, *x: {
                'indices': j,
                'counts': tf.squeeze(tf.stack(x, axis=-1)),
                'normalization': tf.reduce_max([
                    tf.reduce_sum(x), 1.]) / rowmean
            })

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=True)
    csv_data_batched = csv_data_batched.prefetch(
        tf.data.experimental.AUTOTUNE)

    factor = PoissonMatrixFactorization(
        csv_data_batched, latent_dim=_DIMENSION, strategy=None,
        scale_columns=True, log_transform=_LOG_TRANSFORM,
        column_norms=colmeans,
        u_tau_scale=1.0 / np.sqrt(columns * N),
        dtype=tf.float64)

    factor.calibrate_advi(
        num_epochs=_EPOCH_NUMBER, rel_tol=1e-4,
        clip_value=_CLIP_VALUE,
        learning_rate=_LEARNING_RATE)

    print("Saving the encoding matrix")
    filename = f"{_FILENAME}_{_DIMENSION}D_encoding"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    with open(filename, "w") as f:
        writer = csv.writer(f)
        encoding = factor.encoding_matrix().numpy().T
        for row in range(encoding.shape[0]):
            writer.writerow(encoding[row, :])

    print("Saving the trained model object")
    filename = f"{_FILENAME}_{_DIMENSION}D_model"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pkl"
    factor.save(filename)

    print("Saving figure with the encodings")
    fig, ax = plt.subplots(1, 2, figsize=(14, 8))
    D = factor.feature_dim
    pcm = ax[0].imshow(
        factor.encoding_matrix().numpy()[::-1, :],
        vmin=0, cmap="Blues")
    ax[0].set_yticks(np.arange(factor.feature_dim))
    ax[0].set_yticklabels(np.arange(factor.feature_dim))
    ax[0].set_ylabel("item")
    ax[0].set_xlabel("factor dimension")
    ax[0].set_xticks(np.arange(_DIMENSION))
    ax[0].set_xticklabels(np.arange(_DIMENSION))

    surrogate_samples = factor.surrogate_distribution.sample(250)
    if 's' in surrogate_samples.keys():
        weights = surrogate_samples['s'] / \
            tf.reduce_sum(surrogate_samples['s'], -2, keepdims=True)
        intercept_data = az.convert_to_inference_data(
            {
                r"": (
                    tf.squeeze(surrogate_samples['w'])
                    * weights[:, -1, :]
                    * factor.eta_i
                ).numpy().T})
    else:
        intercept_data = az.convert_to_inference_data(
            {
                r"": (
                    tf.squeeze(surrogate_samples['w'])
                    * factor.eta_i).numpy().T})

    fig.colorbar(pcm, ax=ax[0], orientation="vertical")
    az.plot_forest(intercept_data, ax=ax[1])
    ax[1].set_xlabel("background rate")
    ax[1].set_ylim((-0.014, .466))
    ax[1].set_title("65% and 95% CI")
    ax[1].axvline(1.0, linestyle='dashed', color="black")
    filename = f"{_FILENAME}_{_DIMENSION}D_encoding_"
    filename += f"lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pdf"
    plt.savefig(
        filename,
        bbox_inches='tight')

    print("Generating representations")
    filename = f"{_FILENAME}_{_DIMENSION}D_representation"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=False)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for record in iter(csv_data_batched):
            # the dataset records store the count vectors under 'counts'
            z = factor.encode(tf.cast(record['counts'], factor.dtype)).numpy()
            if _ROW_NORMALIZE:
                z *= (record['normalization'].numpy())[:, np.newaxis]
            ind = record['indices'].numpy()
            for row in range(z.shape[0]):
                writer.writerow(np.concatenate([[ind[row]], z[row, :]]))
def test_convert_to_inference_data_from_file(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group='foo')
    filename = str(tmpdir.join('test_file.nc'))
    first.to_netcdf(filename)
    second = convert_to_inference_data(filename)
    assert first.foo.equals(second.foo)
def test_convert_to_inference_data_idempotent():
    first = convert_to_inference_data(np.random.randn(100), group='foo')
    second = convert_to_inference_data(first)
    assert first.foo is second.foo
expts = ("Oid_1ngmL", "O1_1ngmL", "O2_1ngmL") data_uv5, data_rep = srep.utils.condense_data(expts) # load in the pickled samples pklfile = open(f"{repo_rootdir}/data/mcmc_samples/1ngmL_sampler.pkl", 'rb') sampler = dill.load(pklfile) pklfile.close() n_dim = np.shape(sampler.get_chain())[-1] # remember these are log_10 of actual params!! var_labels = ("k_burst", "b", "kR_on", "koff_Oid", "koff_O1", "koff_O2") #%% emcee_output = az.convert_to_inference_data( sampler, var_names=var_labels ) bokeh.io.show(bebi103.viz.corner(emcee_output, plot_ecdf=True)) #%% # ppc plots plotting_draws = 75 ppc_uv5 = srep.models.post_pred_bursty_rep( sampler, n_pred=sum(data_uv5[1]), n_post=plotting_draws, kon_ind='nbinom', koff_ind='nbinom' ) ppc_rep = [] for i in range(len(expts)): ppc_rep.append( srep.models.post_pred_bursty_rep( sampler, n_pred=sum(data_rep[i][1]), n_post=plotting_draws, kon_ind=2, koff_ind=3+i))