Example #1
    def get_inference_data(self, data, eight_schools_params):
        return convert_to_inference_data(
            data.obj,
            group="posterior",
            coords={"school": np.arange(eight_schools_params["J"])},
            dims={"theta": ["school"], "eta": ["school"]},
        )
Example #2
def _convert_pyjags_samples_dict_to_arviz_inference_data(
        samples: tp.Dict[str, np.ndarray]) -> az.InferenceData:
    """
    Converts a PyJAGS samples dictionary to an ArviZ inference data object.

    Takes a Python dictionary of samples generated by the sample method of a
    model instance and returns an ArviZ inference data object.

    Parameters
    ----------
    samples: a dictionary mapping variable names to NumPy arrays with shape
             (parameter_dimension, chain_length, number_of_chains)

    Returns
    -------
    An ArviZ inference data object
    """
    # pyjags returns a dictionary of NumPy arrays with shape
    #         (parameter_dimension, chain_length, number_of_chains)
    # but arviz expects samples with shape
    #         (number_of_chains, chain_length, parameter_dimension)

    return az.convert_to_inference_data(
        _convert_pyjags_samples_dictionary_to_arviz_samples_dictionary(
            samples))
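The transposition helper called above is not part of the snippet. Below is a minimal sketch of what it would have to do, assuming it simply reorders each array from the PyJAGS layout to the (chain, draw, *shape) layout ArviZ expects; the function body here is an assumption inferred from the shape comment, not the library's actual implementation.

import typing as tp

import numpy as np


def _convert_pyjags_samples_dictionary_to_arviz_samples_dictionary(
        samples: tp.Dict[str, np.ndarray]) -> tp.Dict[str, np.ndarray]:
    # Hypothetical sketch: reorder each array from
    # (parameter_dimension, chain_length, number_of_chains) to
    # (number_of_chains, chain_length, parameter_dimension).
    return {name: np.swapaxes(array, 0, 2) for name, array in samples.items()}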
Example #3
def plot_trace(trace: Dict[str, ndarray], show: bool = False) -> Figure:
    """Use `Arviz` to plot a trace of the variable parameters,
    alongside a histogram of their distribution.

    Parameters
    ----------
    trace: dict of str and numpy.ndarray
        The parameter trace with shape=(n_steps, n_variable_parameters)
    show: bool
        If true, the plot will be shown.

    Returns
    -------
    matplotlib.pyplot.Figure
        The plotted figure.
    """

    data = arviz.convert_to_inference_data(trace)

    axes = arviz.plot_trace(data)
    figure = axes[0][0].figure

    if show:
        figure.show()

    return figure
Example #4
    def plot_trace(self, trace, show=False):
        """Use `Arviz` to plot a trace of the trainable parameters,
        alongside a histogram of their distribution.

        Parameters
        ----------
        trace: numpy.ndarray
            The parameter trace with shape=(n_steps, n_trainable_parameters+1)
        show: bool
            If true, the plot will be shown.

        Returns
        -------
        matplotlib.pyplot.Figure
            The plotted figure.
        """

        trace_dict = {}

        for index, label in enumerate(self._prior_labels):
            trace_dict[label] = trace[:, index + 1]

        data = arviz.convert_to_inference_data(trace_dict)

        axes = arviz.plot_trace(data)
        figure = axes[0][0].figure

        if show:
            figure.show()

        return figure
Example #5
    def to_arviz(name, chain):
        import arviz as az

        if len(chain.shape) == 1:
            chain = chain.reshape(-1, 1)

        return az.convert_to_inference_data({name: chain[np.newaxis, :, :]})
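A brief usage sketch of the helper above, assuming `to_arviz` is reachable as a plain function and the input is a hypothetical 1-D array of 1000 draws of a scalar parameter named "mu":

import numpy as np

chain = np.random.randn(1000)          # 1-D: one chain of 1000 draws
idata = to_arviz("mu", chain)          # reshaped to (1000, 1), then a leading
                                       # chain axis is added -> (1, 1000, 1)
print(idata.posterior["mu"].shape)     # (1, 1000, 1)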
Example #6
    def get_inference_data(self):
        return convert_to_inference_data(
            self.obj,
            group="posterior",
            coords={"school": np.arange(self.data["J"])},
            dims={"theta": ["school"], "theta_tilde": ["school"]},
        )
Example #7
    def get_inference_data(self, data):
        return convert_to_inference_data(
            data.obj,
            group="posterior",
            coords={"school": np.arange(8)},
            dims={"theta": ["school"], "eta": ["school"]},
        )
Example #8
    def test_id_conversion_args(self):
        stored = load_arviz_data("centered_eight")
        IVIES = [
            "Yale", "Harvard", "MIT", "Princeton", "Cornell", "Dartmouth",
            "Columbia", "Brown"
        ]
        # test the dictionary argument by reverse-engineering a dictionary of
        # per-variable sample arrays out of the centered_eight data.
        d = stored.posterior.to_dict()
        d = d["data_vars"]
        test_dict = {}  # type: Dict[str, np.ndarray]
        for var_name in d:
            data = d[var_name]["data"]
            # `data` is a list of chains; each chain is a list of samples
            chain_arrs = []
            for chain in data:  # list of samples
                chain_arrs.append(np.array(chain))
            data_arr = np.stack(chain_arrs)
            test_dict[var_name] = data_arr

        inference_data = convert_to_inference_data(test_dict,
                                                   dims={"theta": ["Ivies"]},
                                                   coords={"Ivies": IVIES})

        assert isinstance(inference_data, InferenceData)
        assert set(
            inference_data.posterior.coords["Ivies"].values) == set(IVIES)
        assert inference_data.posterior["theta"].dims == ("chain", "draw",
                                                          "Ivies")
Example #9
    def effective_sample_size(self):
        try:
            arviz_samples = az.convert_to_inference_data(self.samples)
            ess = az.ess(arviz_samples)
        except ModuleNotFoundError:
            print("Summary relies on arviz and arviz is not installed")
            ess = None
        return ess
Example #10
    def get_inference_data(self):
        return convert_to_inference_data(
            self.obj,
            group='posterior',
            coords={'school': np.arange(self.data['J'])},
            dims={
                'theta': ['school'],
                'theta_tilde': ['school']
            },
        )
Example #11
    def _create_inference_data(self, chains):
        if len(chains) > 1:
            data = {
                name: np.stack([c[name] for c in chains])
                for name in chains[0]._names
            }
        else:
            data = {name: chains[0][name][None] for name in chains[0]._names}

        return az.convert_to_inference_data(data).posterior
Example #12
    def test_nd_to_inference_data(self):
        shape = (1, 2, 3, 4, 5)
        inference_data = convert_to_inference_data(np.random.randn(*shape), group="foo")
        assert hasattr(inference_data, "foo")
        assert len(inference_data.foo.data_vars) == 1
        var_name = list(inference_data.foo.data_vars)[0]

        assert len(inference_data.foo.coords) == len(shape)
        assert inference_data.foo.chain.shape == shape[:1]
        assert inference_data.foo.draw.shape == shape[1:2]
        assert inference_data.foo[var_name].shape == shape
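These shape tests rely on ArviZ's convention for bare NumPy arrays: the first axis is read as "chain", the second as "draw", and any remaining axes become auto-named dimensions. A minimal sketch under that assumption (the shape below is chosen only for illustration):

import numpy as np
import arviz as az

# (2 chains, 500 draws, 3 parameters) lands in the default "posterior" group.
idata = az.convert_to_inference_data(np.random.randn(2, 500, 3))
print(idata.posterior)  # one auto-named variable with dims (chain, draw, ...)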
Example #13
    def test_more_chains_than_draws(self):
        shape = (10, 4)
        with pytest.warns(SyntaxWarning):
            inference_data = convert_to_inference_data(np.random.randn(*shape), group="foo")
        assert hasattr(inference_data, "foo")
        assert len(inference_data.foo.data_vars) == 1
        var_name = list(inference_data.foo.data_vars)[0]

        assert len(inference_data.foo.coords) == len(shape)
        assert inference_data.foo.chain.shape == shape[:1]
        assert inference_data.foo.draw.shape == shape[1:2]
        assert inference_data.foo[var_name].shape == shape
Example #14
    def test_nd_to_inference_data(self):
        shape = (1, 2, 3, 4, 5)
        inference_data = convert_to_inference_data(np.random.randn(*shape), group="prior")
        assert hasattr(inference_data, "prior")
        assert len(inference_data.prior.data_vars) == 1
        var_name = list(inference_data.prior.data_vars)[0]

        assert len(inference_data.prior.coords) == len(shape)
        assert inference_data.prior.chain.shape == shape[:1]
        assert inference_data.prior.draw.shape == shape[1:2]
        assert inference_data.prior[var_name].shape == shape
        assert repr(inference_data).startswith("Inference data with groups")
Example #15
    def test_nd_to_inference_data(self):
        shape = (1, 2, 3, 4, 5)
        inference_data = convert_to_inference_data(
            xr.DataArray(np.random.randn(*shape),
                         dims=("chain", "draw", "dim_0", "dim_1", "dim_2")),
            group="prior",
        )
        var_name = list(inference_data.prior.data_vars)[0]

        assert hasattr(inference_data, "prior")
        assert len(inference_data.prior.data_vars) == 1
        assert inference_data.prior.chain.shape == shape[:1]
        assert inference_data.prior.draw.shape == shape[1:2]
        assert inference_data.prior[var_name].shape == shape
Example #16
def neff_det_check_plot(c):
    fit = az.convert_to_inference_data(c)

    az.plot_density(fit, var_names=['neff_det'], credible_interval=0.99)

    xlabel(r'$N_\mathrm{eff}$')
    ylabel(r'$p\left( N_\mathrm{eff} \right)$')

    nobs = c.posterior['m1s'].shape[2]
    axvline(4 * nobs)

    nemin = percentile(c.posterior['neff_det'], 2.5)
    title(r'Two-sigma lower $N_\mathrm{{eff}}$ is factor {:.2f} above limit'.
          format(nemin / (4 * nobs)))
Example #17
    def test_monte_carlo_format(self):
        # Note that this file is empty; we use it as a placeholder for the
        # az InferenceData object.
        filepath = self.get_data_path('monte-carlo-samples.az')

        size = 100
        dataset = az.convert_to_inference_data(np.random.randn(size))
        dataset.to_netcdf(filepath)

        temp_dir = self.temp_dir.name
        shutil.copy(filepath,
                    os.path.join(temp_dir, 'monte-carlo-samples.az'))
        format = MonteCarloTensorDirectoryFormat(temp_dir, mode='r')
        format.validate()
Example #18
    def test_id_conversion_idempotent(self):
        stored = load_arviz_data("centered_eight")
        inference_data = convert_to_inference_data(stored)
        assert isinstance(inference_data, InferenceData)
        assert set(inference_data.observed_data.obs.coords["school"].values) == {
            "Hotchkiss",
            "Mt. Hermon",
            "Choate",
            "Deerfield",
            "Phillips Andover",
            "St. Paul's",
            "Lawrenceville",
            "Phillips Exeter",
        }
        assert inference_data.posterior["theta"].dims == ("chain", "draw", "school")
Example #19
    def posterior(self):
        """
        To extract a number of burn-ins, you need to index using `xarray`
        syntax. In this case, it'll look like:
        
        ```helper.posterior.sel(draw=slice(3000,None))```
        
        To get draws 3000 to the end. Unfortunately, -1000 does not appear
        to work for this syntax.

        Returns
        -------
        [type]
            [description]
        """
        return arviz.convert_to_inference_data(self.chain)
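Following the docstring above, a short usage sketch of dropping burn-in draws (assuming `helper` is an instance of the surrounding class and `posterior` is exposed as a property; the 3000-draw cutoff is just the docstring's example value):

idata = helper.posterior                     # arviz.InferenceData
trimmed = idata.sel(draw=slice(3000, None))  # keep draws 3000 through the end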
Example #20
    def _get_posterior_samples_from_matrix_as_inferencedata(
            self, matrix: torch.Tensor) -> az.InferenceData:
        np_samples_list = self._get_posterior_samples_from_matrix_as_numpy(
            matrix)
        dictdata = {}

        for param_name in self.model.parameters.keys():
            dictdata[param_name] = np.stack(
                [sample[param_name] for sample in np_samples_list]
            )[np.newaxis,
              ...]  # First dim should be "chain" for conversion to InferenceData.

        infdata = az.convert_to_inference_data(dictdata)
        infdata.posterior.attrs["inference_library"] = "smallx"
        infdata.posterior.attrs["inference_library_version"] = "0.0"

        return infdata
Example #21
def traceplot(c):
    fit = az.convert_to_inference_data(c)

    lines = (('H0', {}, true_params['H0']), ('Om', {}, true_params['Om']),
             ('w0', {}, true_params['w']), ('R0_30', {}, true_params['R0_30']),
             ('MMin', {}, true_params['MMin']),
             ('MMax', {}, true_params['MMax']), ('smooth_min', {},
                                                 true_params['smooth_min']),
             ('smooth_max', {},
              true_params['smooth_max']), ('alpha', {}, true_params['alpha']),
             ('beta', {}, true_params['beta']), ('gamma', {},
                                                 true_params['gamma']))

    az.plot_trace(fit,
                  var_names=[
                      'H0', 'Om', 'w0', 'R0_30', 'MMax', 'smooth_max', 'alpha',
                      'beta', 'gamma'
                  ],
                  lines=lines)
Example #22
def _sample_model(model, data, refresh=100, **kwargs):
    print()
    print("Data")
    print("----")
    for k, v in data.items():
        if numpy.shape(v) == ():
            print('  %-10s: %s' % (k, v))
        else:
            print('  %-10s: shape %s [%s ... %s]' %
                  (k, numpy.shape(v), numpy.min(v), numpy.max(v)))

    print()
    print("sampling from model ...")
    fit = model.sampling(data=data, refresh=refresh, **kwargs)
    print("processing results ...")
    print(fit)
    print("checking results ...")
    check_all_diagnostics(fit,
                          max_treedepth=kwargs.get('control', {}).get(
                              'max_treedepth', 10),
                          quiet=False)
    return arviz.convert_to_inference_data(fit)
Example #23
def test_convert_to_dataset_bad(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    filename = str(tmpdir.join("test_file.nc"))
    first.to_netcdf(filename)
    with pytest.raises(ValueError):
        convert_to_dataset(filename, group="bar")
Example #24
def test_convert_to_inference_data_bad():
    with pytest.raises(ValueError):
        convert_to_inference_data(1)
Example #25
def test_convert_to_inference_data_from_file(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    filename = str(tmpdir.join("test_file.nc"))
    first.to_netcdf(filename)
    second = convert_to_inference_data(filename)
    assert first.prior.equals(second.prior)
Example #26
def test_convert_to_inference_data_idempotent():
    first = convert_to_inference_data(np.random.randn(100), group="prior")
    second = convert_to_inference_data(first)
    assert first.prior is second.prior
Example #27
def main():
    parser = argparse.ArgumentParser(
        description='Train PMF on CSV-formatted count matrix')
    parser.add_argument(
        '-f', '--csv-file', nargs='?', type=str,
        help="Enter the CSV file"
    )
    parser.add_argument(
        '-e', '--epoch', nargs='?', type=int, default=300,
        help='Enter Epoch value: Default: 300'
    )
    parser.add_argument(
        '-d', '--dimension', nargs='?', type=int, default=2,
        help='Enter embedding dimension. Default: 2'
    )
    parser.add_argument(
        '-b', '--batch-size', nargs='?', type=int, default=5000,
        help='Enter batch size. Default: 5000'
    )

    parser.add_argument(
        '-lr', '--learning-rate', nargs='?', type=float, default=0.01,
        help='Enter float. Default: 0.01'
    )
    
    parser.add_argument(
        '-c', '--clip-value', nargs='?', type=float, default=3.,
        help='Gradient clip value. Default: 3.0'
    )

    parser.add_argument(
        '-lt', '--log-transform',
        help='Log-transform?', action='store_true'
    )

    parser.add_argument(
        '-rn', '--row-normalize',
        help='Row normalize based on counts?', action='store_true'
    )

    args = parser.parse_args(sys.argv[1:])
    if args.csv_file is None:
        sys.exit("You need to specify a csv file")
    elif not os.path.exists(args.csv_file):
        sys.exit("File doesn't exist")
    else:
        _FILENAME = args.csv_file

    _BATCH_SIZE = args.batch_size
    _LOG_TRANSFORM = args.log_transform
    _EPOCH_NUMBER = args.epoch
    _DIMENSION = args.dimension
    _LEARNING_RATE = args.learning_rate
    _ROW_NORMALIZE = args.row_normalize
    _CLIP_VALUE = args.clip_value

    with open(_FILENAME) as f:
        csv_file = csv.reader(f)
        columns = len(next(csv_file))

    csv_data0 = tf.data.experimental.CsvDataset(
        _FILENAME, [tf.float64]*columns)
    csv_data0 = csv_data0.enumerate()

    csv_data = csv_data0.map(
        lambda j, *x: {
            'indices': j,
            'counts': tf.squeeze(tf.stack(x, axis=-1))
        })

    # Grab a batch to compute statistics
    colsums = []
    batch_sizes = []
    N = 0
    for batch in iter(csv_data.batch(_BATCH_SIZE, drop_remainder=False)):
        colsums += [tf.reduce_sum(batch['counts'], axis=0, keepdims=True)]
        N += batch['counts'].shape[0]

    colsums = tf.add_n(colsums)
    colmeans = colsums/N
    rowmean = tf.reduce_sum(colmeans)

    if _ROW_NORMALIZE:
        csv_data = csv_data0.map(
            lambda j, *x: {
                'indices': j,
                'counts': tf.squeeze(tf.stack(x, axis=-1)),
                'normalization': tf.reduce_max([
                    tf.reduce_sum(x), 1.])/rowmean
            })

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=True)
    csv_data_batched = csv_data_batched.prefetch(
        tf.data.experimental.AUTOTUNE)

    factor = PoissonMatrixFactorization(
        csv_data_batched, latent_dim=_DIMENSION, strategy=None,
        scale_columns=True, log_transform=_LOG_TRANSFORM,
        column_norms=colmeans,
        u_tau_scale=1.0/np.sqrt(columns*N),
        dtype=tf.float64)

    factor.calibrate_advi(
        num_epochs=_EPOCH_NUMBER,
        rel_tol=1e-4, clip_value=_CLIP_VALUE,
        learning_rate=_LEARNING_RATE)

    print("Saving the encoding matrix")

    filename = f"{_FILENAME}_{_DIMENSION}D_encoding"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    with open(filename, "w") as f:
        writer = csv.writer(f)
        encoding = factor.encoding_matrix().numpy().T
        for row in range(encoding.shape[0]):
            writer.writerow(encoding[row, :])

    print("Saving the trained model object")
    filename = f"{_FILENAME}_{_DIMENSION}D_model"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pkl"
    factor.save(filename)

    print("Saving figure with the encodings")

    fig, ax = plt.subplots(1, 2, figsize=(14, 8))
    D = factor.feature_dim
    pcm = ax[0].imshow(
        factor.encoding_matrix().numpy()[::-1, :],
        vmin=0, cmap="Blues")
    ax[0].set_yticks(np.arange(factor.feature_dim))
    ax[0].set_yticklabels(np.arange(factor.feature_dim))
    ax[0].set_ylabel("item")
    ax[0].set_xlabel("factor dimension")
    ax[0].set_xticks(np.arange(_DIMENSION))
    ax[0].set_xticklabels(np.arange(_DIMENSION))

    surrogate_samples = factor.surrogate_distribution.sample(250)
    if 's' in surrogate_samples.keys():
        weights = surrogate_samples['s'] / \
            tf.reduce_sum(surrogate_samples['s'], -2, keepdims=True)
        intercept_data = az.convert_to_inference_data(
            {
                r"":
                    (
                        tf.squeeze(surrogate_samples['w'])
                        * weights[:, -1, :]
                        * factor.eta_i
                    ).numpy().T})
    else:
        intercept_data = az.convert_to_inference_data(
            {
                r"":
                    (
                        tf.squeeze(surrogate_samples['w'])
                        * factor.eta_i).numpy().T})

    fig.colorbar(pcm, ax=ax[0], orientation="vertical")
    az.plot_forest(intercept_data, ax=ax[1])
    ax[1].set_xlabel("background rate")
    ax[1].set_ylim((-0.014, .466))
    ax[1].set_title("65% and 95% CI")
    ax[1].axvline(1.0, linestyle='dashed', color="black")
    filename = f"{_FILENAME}_{_DIMENSION}D_encoding_"
    filename += f"lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pdf"
    plt.savefig(
        filename,
        bbox_inches='tight')

    print("Generating representations")
    filename = f"{_FILENAME}_{_DIMENSION}D_representation"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=False)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for record in iter(csv_data_batched):
            z = factor.encode(tf.cast(record['data'], factor.dtype)).numpy()
            if _ROW_NORMALIZE:
                z *= (record['normalization'].numpy())[:, np.newaxis]
            ind = record['indices'].numpy()
            for row in range(z.shape[0]):
                writer.writerow(np.concatenate([[ind[row]], z[row, :]]))
Example #28
def test_convert_to_inference_data_from_file(tmpdir):
    first = convert_to_inference_data(np.random.randn(100), group='foo')
    filename = str(tmpdir.join('test_file.nc'))
    first.to_netcdf(filename)
    second = convert_to_inference_data(filename)
    assert first.foo.equals(second.foo)
Example #29
def test_convert_to_inference_data_idempotent():
    first = convert_to_inference_data(np.random.randn(100), group='foo')
    second = convert_to_inference_data(first)
    assert first.foo is second.foo
expts = ("Oid_1ngmL", "O1_1ngmL", "O2_1ngmL")
data_uv5, data_rep = srep.utils.condense_data(expts)

# load in the pickled samples
pklfile = open(f"{repo_rootdir}/data/mcmc_samples/1ngmL_sampler.pkl", 'rb')
sampler = dill.load(pklfile)
pklfile.close()

n_dim = np.shape(sampler.get_chain())[-1]
# remember these are log_10 of actual params!!
var_labels = ("k_burst", "b", "kR_on", "koff_Oid", "koff_O1", "koff_O2")

#%%
emcee_output = az.convert_to_inference_data(
    sampler, var_names=var_labels
    )
bokeh.io.show(bebi103.viz.corner(emcee_output, plot_ecdf=True))
#%%
# ppc plots
plotting_draws = 75
ppc_uv5 = srep.models.post_pred_bursty_rep(
    sampler, n_pred=sum(data_uv5[1]), n_post=plotting_draws,
    kon_ind='nbinom', koff_ind='nbinom'
    )
ppc_rep = []
for i in range(len(expts)):
    ppc_rep.append(
        srep.models.post_pred_bursty_rep(
            sampler, n_pred=sum(data_rep[i][1]), n_post=plotting_draws,
            kon_ind=2, koff_ind=3+i))