def test_uniform_normal():
    true_coef = 0.9
    data = true_coef + random.normal(random.PRNGKey(0), (1000,))

    def model(data):
        alpha = numpyro.sample("alpha", dist.Uniform(0, 1))
        with numpyro.handlers.reparam(config={"loc": TransformReparam()}):
            loc = numpyro.sample(
                "loc",
                dist.TransformedDistribution(
                    dist.Uniform(0, 1), transforms.AffineTransform(0, alpha)
                ),
            )
        numpyro.sample("obs", dist.Normal(loc, 0.1), obs=data)

    adam = optim.Adam(0.01)
    rng_key_init = random.PRNGKey(1)
    guide = AutoDiagonalNormal(model)
    svi = SVI(model, guide, adam, Trace_ELBO())
    svi_state = svi.init(rng_key_init, data)

    def body_fn(i, val):
        svi_state, loss = svi.update(val, data)
        return svi_state

    svi_state = fori_loop(0, 1000, body_fn, svi_state)
    params = svi.get_params(svi_state)
    median = guide.median(params)
    assert_allclose(median["loc"], true_coef, rtol=0.05)

    # test .quantiles method
    quantiles = guide.quantiles(params, [0.2, 0.5])
    assert_allclose(quantiles["loc"][1], true_coef, rtol=0.1)
def test_init_to_scalar_value():
    def model():
        numpyro.sample("x", dist.Normal(0, 1))

    guide = AutoDiagonalNormal(model, init_loc_fn=init_to_value(values={"x": 1.0}))
    svi = SVI(model, guide, optim.Adam(1.0), Trace_ELBO())
    svi.init(random.PRNGKey(0))
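# The tests above drive AutoDiagonalNormal through the low-level svi.init/svi.update
# loop. As a point of comparison, here is a minimal, self-contained sketch of the same
# workflow using NumPyro's higher-level svi.run helper; the toy model, step count and
# key choices below are illustrative only, not taken from the tests.
from jax import random
import numpyro
import numpyro.distributions as dist
from numpyro import optim
from numpyro.infer import SVI, Trace_ELBO
from numpyro.infer.autoguide import AutoDiagonalNormal

def simple_model(data):
    # toy model: unknown location with a fixed-scale Normal likelihood
    loc = numpyro.sample("loc", dist.Normal(0.0, 1.0))
    numpyro.sample("obs", dist.Normal(loc, 0.1), obs=data)

data = 0.9 + 0.1 * random.normal(random.PRNGKey(0), (1000,))
guide = AutoDiagonalNormal(simple_model)
svi = SVI(simple_model, guide, optim.Adam(0.01), Trace_ELBO())
# svi.run performs init plus the requested number of update steps and
# returns the optimized parameters together with the per-step losses
svi_result = svi.run(random.PRNGKey(1), 1000, data)
print(guide.median(svi_result.params)["loc"])
# draw approximate posterior samples from the fitted guide
samples = guide.sample_posterior(random.PRNGKey(2), svi_result.params, sample_shape=(100,))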
def test_autoguide(deterministic):
    GLOBAL["count"] = 0
    guide = AutoDiagonalNormal(model)
    svi = SVI(model, guide, optim.Adam(0.1), Trace_ELBO(), deterministic=deterministic)
    svi_state = svi.init(random.PRNGKey(0))
    svi_state = lax.fori_loop(0, 100, lambda i, val: svi.update(val)[0], svi_state)
    params = svi.get_params(svi_state)
    guide.sample_posterior(random.PRNGKey(1), params, sample_shape=(100,))

    if deterministic:
        assert GLOBAL["count"] == 5
    else:
        assert GLOBAL["count"] == 4
def test_autocontinuous_local_error():
    def model():
        with numpyro.plate("N", 10, subsample_size=4):
            numpyro.sample("x", dist.Normal(0, 1))

    guide = AutoDiagonalNormal(model)
    svi = SVI(model, guide, optim.Adam(1.0), Trace_ELBO())
    with pytest.raises(ValueError, match="local latent variables"):
        svi.init(random.PRNGKey(0))
def test_module():
    x = random.normal(random.PRNGKey(0), (100, 10))
    y = random.normal(random.PRNGKey(1), (100,))

    def model(x, y):
        nn = numpyro.module("nn", Dense(1), (10,))
        mu = nn(x).squeeze(-1)
        sigma = numpyro.sample("sigma", dist.HalfNormal(1))
        numpyro.sample("y", dist.Normal(mu, sigma), obs=y)

    guide = AutoDiagonalNormal(model)
    svi = SVI(model, guide, optim.Adam(0.003), Trace_ELBO(), x=x, y=y)
    svi_state = svi.init(random.PRNGKey(2))
    lax.scan(lambda state, i: svi.update(state), svi_state, jnp.zeros(1000))
def test_improper():
    y = random.normal(random.PRNGKey(0), (100,))

    def model(y):
        lambda1 = numpyro.sample('lambda1', dist.ImproperUniform(dist.constraints.real, (), ()))
        lambda2 = numpyro.sample('lambda2', dist.ImproperUniform(dist.constraints.real, (), ()))
        sigma = numpyro.sample('sigma', dist.ImproperUniform(dist.constraints.positive, (), ()))
        mu = numpyro.deterministic('mu', lambda1 + lambda2)
        numpyro.sample('y', dist.Normal(mu, sigma), obs=y)

    guide = AutoDiagonalNormal(model)
    svi = SVI(model, guide, optim.Adam(0.003), Trace_ELBO(), y=y)
    svi_state = svi.init(random.PRNGKey(2))
    lax.scan(lambda state, i: svi.update(state), svi_state, jnp.zeros(10000))
def test_dynamic_supports():
    true_coef = 0.9
    data = true_coef + random.normal(random.PRNGKey(0), (1000,))

    def actual_model(data):
        alpha = numpyro.sample("alpha", dist.Uniform(0, 1))
        with numpyro.handlers.reparam(config={"loc": TransformReparam()}):
            loc = numpyro.sample(
                "loc",
                dist.TransformedDistribution(
                    dist.Uniform(0, 1), transforms.AffineTransform(0, alpha)
                ),
            )
        with numpyro.plate("N", len(data)):
            numpyro.sample("obs", dist.Normal(loc, 0.1), obs=data)

    def expected_model(data):
        alpha = numpyro.sample("alpha", dist.Uniform(0, 1))
        loc = numpyro.sample("loc", dist.Uniform(0, 1)) * alpha
        with numpyro.plate("N", len(data)):
            numpyro.sample("obs", dist.Normal(loc, 0.1), obs=data)

    adam = optim.Adam(0.01)
    rng_key_init = random.PRNGKey(1)

    guide = AutoDiagonalNormal(actual_model)
    svi = SVI(actual_model, guide, adam, Trace_ELBO())
    svi_state = svi.init(rng_key_init, data)
    actual_opt_params = adam.get_params(svi_state.optim_state)
    actual_params = svi.get_params(svi_state)
    actual_values = guide.median(actual_params)
    actual_loss = svi.evaluate(svi_state, data)

    guide = AutoDiagonalNormal(expected_model)
    svi = SVI(expected_model, guide, adam, Trace_ELBO())
    svi_state = svi.init(rng_key_init, data)
    expected_opt_params = adam.get_params(svi_state.optim_state)
    expected_params = svi.get_params(svi_state)
    expected_values = guide.median(expected_params)
    expected_loss = svi.evaluate(svi_state, data)

    # test auto_loc, auto_scale
    check_eq(actual_opt_params, expected_opt_params)
    check_eq(actual_params, expected_params)

    # test latent values
    assert_allclose(actual_values["alpha"], expected_values["alpha"])
    assert_allclose(actual_values["loc_base"], expected_values["loc"])
    assert_allclose(actual_loss, expected_loss)
def model_factory(
    twinify_args: argparse.Namespace, unparsed_args: Iterable[str], orig_data: pd.DataFrame
) -> Tuple[TModelFunction, TGuideFunction]:
    model_args_parser = argparse.ArgumentParser()
    model_args_parser.add_argument('--prior_mu', type=float, default=0.)
    args = model_args_parser.parse_args(unparsed_args, twinify_args)

    d = orig_data.shape[-1]
    print(f"Model using prior mu = {args.prior_mu}")
    print(f"Privacy parameter epsilon is {args.epsilon}")

    def model(z=None, num_obs_total=None) -> None:
        batch_size = 1
        if z is not None:
            batch_size = z.shape[0]
        if num_obs_total is None:
            num_obs_total = batch_size

        mu = sample('mu', dists.Normal(args.prior_mu).expand_by((d,)).to_event(1))
        sigma = sample('sigma', dists.InverseGamma(1.).expand_by((d,)).to_event(1))
        with plate('batch', num_obs_total, batch_size):
            sample('x', dists.Normal(mu, sigma).to_event(1), obs=z)

    guide = AutoDiagonalNormal(model, prefix="guide")

    return model, guide
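# For illustration, the factory above could be exercised by hand roughly as follows.
# The epsilon attribute on the namespace and the --prior_mu flag are the only inputs
# it actually reads; the concrete values and the toy DataFrame below are hypothetical
# stand-ins for what twinify would normally pass in.
import argparse
import numpy as np
import pandas as pd

twinify_args = argparse.Namespace(epsilon=1.0)
unparsed_args = ["--prior_mu", "2.0"]
orig_data = pd.DataFrame(np.random.randn(100, 3), columns=["a", "b", "c"])

model, guide = model_factory(twinify_args, unparsed_args, orig_data)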
def test_dynamic_supports():
    true_coef = 0.9
    data = true_coef + random.normal(random.PRNGKey(0), (1000,))

    def actual_model(data):
        alpha = numpyro.sample('alpha', dist.Uniform(0, 1))
        with reparam(config={'loc': TransformReparam()}):
            loc = numpyro.sample('loc', dist.Uniform(0, alpha))
        numpyro.sample('obs', dist.Normal(loc, 0.1), obs=data)

    def expected_model(data):
        alpha = numpyro.sample('alpha', dist.Uniform(0, 1))
        loc = numpyro.sample('loc', dist.Uniform(0, 1)) * alpha
        numpyro.sample('obs', dist.Normal(loc, 0.1), obs=data)

    adam = optim.Adam(0.01)
    rng_key_init = random.PRNGKey(1)

    guide = AutoDiagonalNormal(actual_model)
    svi = SVI(actual_model, guide, adam, ELBO())
    svi_state = svi.init(rng_key_init, data)
    actual_opt_params = adam.get_params(svi_state.optim_state)
    actual_params = svi.get_params(svi_state)
    actual_values = guide.median(actual_params)
    actual_loss = svi.evaluate(svi_state, data)

    guide = AutoDiagonalNormal(expected_model)
    svi = SVI(expected_model, guide, adam, ELBO())
    svi_state = svi.init(rng_key_init, data)
    expected_opt_params = adam.get_params(svi_state.optim_state)
    expected_params = svi.get_params(svi_state)
    expected_values = guide.median(expected_params)
    expected_loss = svi.evaluate(svi_state, data)

    # test auto_loc, auto_scale
    check_eq(actual_opt_params, expected_opt_params)
    check_eq(actual_params, expected_params)

    # test latent values
    assert_allclose(actual_values['alpha'], expected_values['alpha'])
    assert_allclose(actual_values['loc_base'], expected_values['loc'])
    assert_allclose(actual_loss, expected_loss)
def main():
    args = parser.parse_args()
    print(args)

    # read data
    df = pd.read_csv(args.data_path)

    # check whether we parse model from txt or whether we have a numpyro module
    try:
        if args.model_path[-3:] == '.py':
            spec = importlib.util.spec_from_file_location("model_module", args.model_path)
            model_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(model_module)
            model = model_module.model

            train_df = df
            if args.drop_na:
                train_df = train_df.dropna()

            ## AUTOMATIC PREPROCESSING CURRENTLY UNAVAILABLE
            # data preprocessing: determines number of categories for Categorical
            # distribution and maps categorical values in the data to ints
            # for feature in features:
            #     train_df = feature.preprocess_data(train_df)

            ## ALTERNATIVE
            # we do allow the user to specify a preprocess/postprocess function pair
            # in the numpyro model file
            try:
                preprocess_fn = model_module.preprocess
            except:
                preprocess_fn = None
            if preprocess_fn:
                train_df = preprocess_fn(train_df)

            try:
                postprocess_fn = model_module.postprocess
            except:
                postprocess_fn = None

            try:
                guide = model_module.guide
            except:
                guide = AutoDiagonalNormal(model)
        else:
            print("Parsing model from txt file (was unable to read it as python module containing numpyro code)")
            k = args.k

            # read model file
            with open(args.model_path, 'r') as model_handle:
                model_str = "".join(model_handle.readlines())
            features = automodel.parse_model(model_str)
            feature_names = [feature.name for feature in features]

            # pick features from data according to model file
            missing_features = set(feature_names).difference(df.columns)
            if missing_features:
                raise automodel.ParsingError(
                    "The model specifies features that are not present in the data:\n{}".format(
                        ", ".join(missing_features)
                    )
                )

            train_df = df.loc[:, feature_names]
            if args.drop_na:
                train_df = train_df.dropna()

            # TODO normalize?

            # data preprocessing: determines number of categories for Categorical
            # distribution and maps categorical values in the data to ints
            for feature in features:
                train_df = feature.preprocess_data(train_df)

            # build model
            model = automodel.make_model(features, k)

            # build variational guide for optimization
            guide = AutoDiagonalNormal(model)

            # postprocessing for automodel
            def postprocess_fn(syn_df):
                for feature in features:
                    syn_df = feature.postprocess_data(syn_df)
                return syn_df
    except Exception as e:  # handling errors in py-file parsing
        print("\n#### FAILED TO PARSE THE MODEL SPECIFICATION ####")
        print("Here's the technical error description:")
        print(e)
        traceback.print_tb(e.__traceback__)
        print("\nAborting...")
        exit(3)

    # pick features from data according to model file
    num_data = train_df.shape[0]
    if args.drop_na:
        print("After removing missing values, the data has {} entries with {} features".format(*train_df.shape))
    else:
        print("The data has {} entries with {} features".format(*train_df.shape))

    # compute DP values
    target_delta = args.delta
    if target_delta is None:
        target_delta = 1. / num_data
    if target_delta * num_data > 1.:
        print("!!!!! WARNING !!!!! The given value for privacy parameter delta ({:1.3e}) exceeds 1/(number of data) ({:1.3e}),\n"
              "which is the maximum value that is usually considered safe!".format(
                  target_delta, 1. / num_data
              ))
        x = input("Continue? (type YES ): ")
        if x != "YES":
            print("Aborting...")
            exit(4)
        print("Continuing... (YOU HAVE BEEN WARNED!)")

    num_compositions = int(args.num_epochs / args.sampling_ratio)
    dp_sigma, epsilon, _ = approximate_sigma_remove_relation(
        args.epsilon, target_delta, args.sampling_ratio, num_compositions)
    batch_size = q_to_batch_size(args.sampling_ratio, num_data)
    sigma_per_sample = dp_sigma / q_to_batch_size(args.sampling_ratio, num_data)

    print("Will apply noise with std deviation {:.2f} (~ {:.2f} per element in batch) to achieve privacy epsilon "
          "of {:.3f} (for delta {:.2e}) ".format(dp_sigma, sigma_per_sample, epsilon, target_delta))
    # TODO: warn for high noise? but when is it too high? what is a good heuristic?

    inference_rng, sampling_rng = initialize_rngs(args.seed)

    # learn posterior distributions
    try:
        posterior_params = train_model_no_dp(
            inference_rng, model, guide, train_df.to_numpy(),
            batch_size=int(args.sampling_ratio * len(train_df)),
            num_epochs=args.num_epochs,
            dp_scale=dp_sigma,
            clipping_threshold=args.clipping_threshold
        )
    except (InferenceException, FloatingPointError):
        print("################################## ERROR ##################################")
        print("!!!!! The inference procedure encountered a NaN value (not a number). !!!!!")
        print("This means the model has major difficulties in capturing the data and is")
        print("likely to happen when the dataset is very small and/or sparse.")
        print("Try adapting (simplifying) the model.")
        print("Aborting...")
        exit(2)

    num_synthetic = args.num_synthetic
    if num_synthetic is None:
        num_synthetic = train_df.shape[0]

    predictive_model = lambda: model(None)
    posterior_samples = Predictive(
        predictive_model, guide=guide, params=posterior_params,
        num_samples=num_synthetic).get_samples(sampling_rng)
    # sample synthetic data from posterior predictive distribution
    # posterior_samples = sample_multi_posterior_predictive(sampling_rng,
    #     args.num_synthetic, model, (None,), guide, (), posterior_params)
    syn_data = posterior_samples['x']

    # save results
    syn_df = pd.DataFrame(syn_data, columns=train_df.columns)

    # postprocess: if preprocessing involved data mapping, it is mapped back here
    # so that the synthetic twin looks like the original data
    encoded_syn_df = syn_df.copy()
    if postprocess_fn:
        encoded_syn_df = postprocess_fn(encoded_syn_df)

    encoded_syn_df.to_csv("{}.csv".format(args.output_path), index=False)
    pickle.dump(posterior_params, open("{}.p".format(args.output_path), "wb"))

    ## illustrate results
    if args.visualize != 'none':
        show_popups = args.visualize in ('popup', 'both')
        save_plots = args.visualize in ('store', 'both')

        # Missing value rate
        if not args.drop_na:
            missing_value_fig = plot_missing_values(syn_df, train_df, show=show_popups)
            if save_plots:
                missing_value_fig.savefig(args.output_path + "_missing_value_plots.svg")

        # Marginal violins
        margin_fig = plot_margins(syn_df, train_df, show=show_popups)

        # Covariance matrices
        cov_fig = plot_covariance_heatmap(syn_df, train_df, show=show_popups)

        if save_plots:
            margin_fig.savefig(args.output_path + "_marginal_plots.svg")
            cov_fig.savefig(args.output_path + "_correlation_plots.svg")

        if show_popups:
            plt.show()
def main():
    args, unknown_args = parser.parse_known_args()
    print(args)
    if unknown_args:
        print(f"Additional received arguments: {unknown_args}")

    # read data
    try:
        df = pd.read_csv(args.data_path)
    except Exception as e:
        print("#### UNABLE TO READ DATA FILE ####")
        print(e)
        return 1
    print("Loaded data set has {} rows (entries) and {} columns (features).".format(*df.shape))
    num_data = len(df)

    try:
        # check whether we parse model from txt or whether we have a numpyro module
        if args.model_path[-3:] == '.py':
            train_df = df.copy()
            if args.drop_na:
                train_df = train_df.dropna()
            try:
                model, guide, preprocess_fn, postprocess_fn = load_custom_numpyro_model(
                    args.model_path, args, unknown_args, train_df)
            except (ModuleNotFoundError, FileNotFoundError) as e:
                print("#### COULD NOT FIND THE MODEL FILE ####")
                print(e)
                return 1

            train_data, num_data, feature_names = preprocess_fn(train_df)
        else:
            print("Parsing model from txt file (was unable to read it as python module containing numpyro code)")
            k = args.k

            # read model file
            with open(args.model_path, 'r') as model_handle:
                model_str = "".join(model_handle.readlines())
            features = automodel.parse_model(model_str)
            feature_names = [feature.name for feature in features]

            # pick features from data according to model file
            missing_features = set(feature_names).difference(df.columns)
            if missing_features:
                raise automodel.ParsingError(
                    "The model specifies features that are not present in the data:\n{}".format(
                        ", ".join(missing_features)
                    )
                )

            df = df.loc[:, feature_names]
            train_df = df.copy()
            # TODO: this duplicates code with the other branch but cannot currently pull it out because we are manipulating df above
            if args.drop_na:
                train_df = train_df.dropna()

            # TODO normalize?

            # data preprocessing: determines number of categories for Categorical
            # distribution and maps categorical values in the data to ints
            for feature in features:
                train_df = feature.preprocess_data(train_df)

            # build model
            model = automodel.make_model(features, k)

            # build variational guide for optimization
            guide = AutoDiagonalNormal(model)

            # postprocessing for automodel
            postprocess_fn = automodel.postprocess_function_factory(features)
            num_data = train_df.shape[0]
            train_data = (train_df,)

        assert isinstance(train_data, tuple)
        if len(train_data) == 1:
            print("After preprocessing, the data has {} entries with {} features each.".format(*train_data[0].shape))
        else:
            print("After preprocessing, the data was split into {} splits:".format(len(train_data)))
            for i, x in enumerate(train_data):
                print("\tSplit {} has {} entries with {} features each.".format(
                    i, x.shape[0], 1 if x.ndim == 1 else x.shape[1]))

        # compute DP values
        # TODO need to make this fail safely
        batch_size = q_to_batch_size(args.sampling_ratio, num_data)
        if not args.no_privacy:
            target_delta = args.delta
            if target_delta is None:
                target_delta = 1. / num_data
            if target_delta * num_data > 1.:
                print("!!!!! WARNING !!!!! The given value for privacy parameter delta ({:1.3e}) exceeds 1/(number of data) ({:1.3e}),\n"
                      "which is the maximum value that is usually considered safe!".format(
                          target_delta, 1. / num_data
                      ))
                x = input("Continue? (type YES ): ")
                if x != "YES":
                    print("Aborting...")
                    return 4
                print("Continuing... (YOU HAVE BEEN WARNED!)")

            num_compositions = int(args.num_epochs / args.sampling_ratio)
            dp_sigma, epsilon, _ = approximate_sigma_remove_relation(
                args.epsilon, target_delta, args.sampling_ratio, num_compositions
            )
            sigma_per_sample = dp_sigma / q_to_batch_size(args.sampling_ratio, num_data)

            print("Will apply noise with std deviation {:.2f} (~ {:.2f} per element in batch) to achieve privacy epsilon "
                  "of {:.3f} (for delta {:.2e}) ".format(dp_sigma, sigma_per_sample, epsilon, target_delta))
            # TODO: warn for high noise? but when is it too high? what is a good heuristic?

            do_training = lambda inference_rng: train_model(
                inference_rng, d3p.random, model, guide, train_data,
                batch_size=batch_size,
                num_data=num_data,
                num_epochs=args.num_epochs,
                dp_scale=dp_sigma,
                clipping_threshold=args.clipping_threshold
            )
        else:
            print("!!!!! WARNING !!!!! PRIVACY FEATURES HAVE BEEN DISABLED!")
            do_training = lambda inference_rng: train_model_no_dp(
                inference_rng, model, guide, train_data,
                batch_size=batch_size,
                num_data=num_data,
                num_epochs=args.num_epochs
            )

        inference_rng, sampling_rng = initialize_rngs(args.seed)

        # learn posterior distributions
        try:
            posterior_params, elbo = do_training(inference_rng)
        except (InferenceException, FloatingPointError):
            print("################################## ERROR ##################################")
            print("!!!!! The inference procedure encountered a NaN value (not a number). !!!!!")
            print("This means the model has major difficulties in capturing the data and is")
            print("likely to happen when the dataset is very small and/or sparse.")
            print("Try adapting (simplifying) the model.")
            print("Aborting...")
            return 2

        # Store learned model parameters
        # TODO: we should have a mode for twinify that allows to rerun the sampling without training, using stored parameters
        store_twinify_run_result(f"{args.output_path}.p", posterior_params, elbo, args, unknown_args, __version__)

        # sample synthetic data
        print("Model learning complete; now sampling data!")
        num_synthetic = args.num_synthetic
        if num_synthetic is None:
            num_synthetic = num_data

        num_parameter_samples = int(np.ceil(num_synthetic / args.num_synthetic_records_per_parameter_sample))
        num_synthetic = num_parameter_samples * args.num_synthetic_records_per_parameter_sample
        print(f"Will sample {args.num_synthetic_records_per_parameter_sample} synthetic data records for each of "
              f"{num_parameter_samples} samples from the parameter posterior for a total of {num_synthetic} records.")
        if args.separate_output:
            print("They will be stored in separate data sets for each parameter posterior sample.")
        else:
            print("They will be stored in a single large data set.")

        posterior_samples = sample_synthetic_data(
            model, guide, posterior_params, sampling_rng,
            num_parameter_samples, args.num_synthetic_records_per_parameter_sample
        )

        # postprocess: so that the synthetic twin looks like the original data
        # - extract samples from the posterior_samples dictionary and construct pd.DataFrame
        # - if preprocessing involved data mapping, it is mapped back here
        conditioned_postprocess_fn = lambda posterior_samples: postprocess_fn(posterior_samples, df, feature_names)
        for i, (syn_df, encoded_syn_df) in enumerate(reshape_and_postprocess_synthetic_data(
                posterior_samples, conditioned_postprocess_fn, args.separate_output, num_parameter_samples)):
            if args.separate_output:
                filename = f"{args.output_path}.{i}.csv"
            else:
                filename = f"{args.output_path}.csv"
            encoded_syn_df.to_csv(filename, index=False)

        ### illustrate results TODO need to adopt new way of handling train_df
        # if args.visualize != 'none':
        #     show_popups = args.visualize in ('popup', 'both')
        #     save_plots = args.visualize in ('store', 'both')
        #     # Missing value rate
        #     if not args.drop_na:
        #         missing_value_fig = plot_missing_values(syn_df, train_df, show=show_popups)
        #         if save_plots:
        #             missing_value_fig.savefig(args.output_path + "_missing_value_plots.svg")
        #     # Marginal violins
        #     margin_fig = plot_margins(syn_df, train_df, show=show_popups)
        #     # Covariance matrices
        #     cov_fig = plot_covariance_heatmap(syn_df, train_df, show=show_popups)
        #     if save_plots:
        #         margin_fig.savefig(args.output_path + "_marginal_plots.svg")
        #         cov_fig.savefig(args.output_path + "_correlation_plots.svg")
        #     if show_popups:
        #         plt.show()

        return 0
    except ModelException as e:
        print(e.format_message(args.model_path))
    except AssertionError as e:
        raise e
    except Exception as e:
        print("#### AN UNCATEGORISED ERROR OCCURRED ####")
        raise e
    return 1
def load_custom_numpyro_model(
    model_path: str, args: argparse.Namespace, unknown_args: Iterable[str], orig_data: pd.DataFrame
) -> Tuple[TModelFunction, TGuideFunction, TWrappedPreprocessFunction, TWrappedPostprocessFunction]:

    if not os.path.exists(model_path):
        raise FileNotFoundError(model_path)

    try:
        spec = importlib.util.spec_from_file_location("model_module", model_path)
        model_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(model_module)
    except Exception as e:  # handling errors in py-file parsing
        raise NumpyroModelParsingException("Unable to read the specified file as a Python module.", e) from e

    # load the model function from the module
    model = None
    guide = None
    try:
        model = model_module.model
    except AttributeError:
        # model file did not directly contain a model function; check if it has model_factory
        try:
            model_factory = model_module.model_factory
        except AttributeError:
            raise NumpyroModelParsingException(
                "Model module specifies neither a 'model' nor a 'model_factory' function."
            )

        try:
            model_factory_return = model_factory(args, unknown_args, orig_data)
        except TypeError as e:
            if str(e).find('positional argument') != -1:
                raise ModelException(
                    "FAILED IN MODEL FACTORY",
                    "Custom model_factory functions must accept a namespace of parsed arguments, "
                    "an iterable of unparsed arguments and a pandas.DataFrame as arguments."
                )
            raise e
        except Exception as e:
            raise ModelException('FAILED IN MODEL FACTORY', base_exception=e) from e

        # determine whether model_factory returned model function or (model, guide) tuple
        if (type(model_factory_return) is tuple
                and isinstance(model_factory_return[0], TModelFunction)
                and isinstance(model_factory_return[1], TGuideFunction)):
            model, guide = model_factory_return
        elif isinstance(model_factory_return, TModelFunction):
            model = model_factory_return
        else:
            raise ModelException(
                'FAILED IN MODEL FACTORY',
                f"Custom model_factory functions must return either a model function or a tuple "
                f"consisting of model and guide function, but returned {type(model_factory_return)}."
            )
    except Exception as e:
        raise NumpyroModelParsingUnknownException('model', e) from e

    if not isinstance(model, Callable):
        raise NumpyroModelParsingException(f"'model' must be a function; got {type(model)}")
    model = guard_model(model)

    if guide is None:
        try:
            guide = model_module.guide
        except AttributeError:
            guide = AutoDiagonalNormal(model)
        except Exception as e:
            raise NumpyroModelParsingUnknownException('guide', e) from e

    # try to obtain preprocessing function from custom model
    try:
        preprocess_fn = guard_preprocess(model_module.preprocess)
    except AttributeError:
        preprocess_fn = default_preprocess
    except Exception as e:
        raise NumpyroModelParsingUnknownException('preprocess', e) from e

    # try to obtain postprocessing function from custom model
    try:
        postprocess_fn = guard_postprocess(model_module.postprocess)
    except AttributeError:
        print("Warning: Your model does not specify a postprocessing function for generated samples.")
        print("         Using default, which assumes that your model only produces samples at sample site 'x' and outputs samples as they are.")
        postprocess_fn = automodel.postprocess_function_factory([])
    except Exception as e:
        raise NumpyroModelParsingUnknownException('postprocess', e) from e

    return model, guide, preprocess_fn, postprocess_fn
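# For reference, a hypothetical minimal model file that load_custom_numpyro_model above
# would accept could look like the sketch below. Only `model` is required: the guide
# falls back to AutoDiagonalNormal and the pre-/postprocessing hooks fall back to their
# defaults. The sampling-site layout mirrors the model_factory example shown earlier;
# the feature count d is an assumption made for this illustration.
# my_model.py
import numpyro
import numpyro.distributions as dist

def model(z=None, num_obs_total=None):
    batch_size = z.shape[0] if z is not None else 1
    if num_obs_total is None:
        num_obs_total = batch_size
    d = 3  # number of features, assumed known for this sketch

    mu = numpyro.sample("mu", dist.Normal(0.0).expand_by((d,)).to_event(1))
    sigma = numpyro.sample("sigma", dist.InverseGamma(1.0).expand_by((d,)).to_event(1))
    with numpyro.plate("batch", num_obs_total, batch_size):
        numpyro.sample("x", dist.Normal(mu, sigma).to_event(1), obs=z)

# Optional: `guide`, `preprocess` and `postprocess` could also be defined in this file;
# when absent, load_custom_numpyro_model substitutes AutoDiagonalNormal and the default
# pre-/postprocessing functions as shown above.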