def __init__(self, args):
    print("\n# Hyperparameters", file=sys.stderr)
    pprint.pprint(args.__dict__, stream=sys.stderr)

    print("\n# Data", file=sys.stderr)
    print(" - Standard MNIST", file=sys.stderr)
    print(" - digit_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
    print(" - data_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
    train_loader, valid_loader, test_loader = load_mnist(
        args.batch_size,
        save_to="{}/std/{}x{}".format(args.data_dir, args.height, args.width),
        height=args.height,
        width=args.width,
    )

    x_size = args.height * args.width
    z_size = args.latent_size
    y_size = 10 if args.conditional else 0

    # Configure prior
    if args.prior == "gaussian":
        prior_type = Normal
        if len(args.prior_params) != 2:
            raise ValueError("A Gaussian prior takes two parameters (loc, scale)")
        if args.prior_params[1] <= 0:
            raise ValueError("The Gaussian scale must be strictly positive")
        if args.posterior not in ["gaussian"]:
            raise ValueError(
                "Pairing a Gaussian prior with a %s posterior is not a good idea"
                % args.posterior
            )
    elif args.prior == "beta":
        prior_type = Beta
        if len(args.prior_params) != 2:
            raise ValueError("A Beta prior takes two parameters (shape_a, shape_b)")
        if args.prior_params[0] <= 0 or args.prior_params[1] <= 0:
            raise ValueError("The Beta shape parameters must be strictly positive")
        if args.posterior not in ["kumaraswamy"]:
            raise ValueError(
                "Pairing a Beta prior with a %s posterior is not a good idea"
                % args.posterior
            )
    else:
        raise ValueError("Unknown prior: %s" % args.prior)
    p_z = PriorLayer(
        event_shape=z_size, dist_type=prior_type, params=args.prior_params
    )

    # Configure likelihood
    if args.likelihood == "bernoulli":
        likelihood_type = Bernoulli
        decoder_outputs = 1 * x_size
    else:
        raise ValueError("Unknown likelihood: %s" % args.likelihood)

    if args.decoder == "basic":
        likelihood_conditioner = FFConditioner(
            input_size=z_size + y_size,
            output_size=decoder_outputs,
            context_size=y_size,
            hidden_sizes=args.hidden_sizes,
        )
    elif args.decoder == "cnn":
        likelihood_conditioner = TransposedConv2DConditioner(
            input_size=z_size + y_size,
            output_size=decoder_outputs,
            context_size=y_size,
            input_channels=32,
            output_channels=decoder_outputs // x_size,
            last_kernel_size=7,
        )
    elif args.decoder == "made":
        likelihood_conditioner = MADEConditioner(
            input_size=x_size + z_size + y_size,
            output_size=decoder_outputs,
            context_size=z_size + y_size,
            hidden_sizes=args.hidden_sizes,
            num_masks=1,
        )
    else:
        raise ValueError("Unknown decoder: %s" % args.decoder)

    if args.decoder == "made":
        conditional_x = AutoregressiveLikelihood(
            event_size=x_size,
            dist_type=likelihood_type,
            conditioner=likelihood_conditioner,
        )
    else:
        conditional_x = FullyFactorizedLikelihood(
            event_size=x_size,
            dist_type=likelihood_type,
            conditioner=likelihood_conditioner,
        )

    # CPU/CUDA device
    device = torch.device(args.device)

    # Create generative model P(z)P(x|z)
    gen_model = GenerativeModel(
        x_size=x_size,
        z_size=z_size,
        y_size=y_size,
        prior_z=p_z,
        conditional_x=conditional_x,
    ).to(device)
    print("\n# Generative Model", file=sys.stderr)
    print(gen_model, file=sys.stderr)

    # Configure posterior Z|x,y
    if args.posterior == "gaussian":
        encoder_outputs = z_size * 2
        posterior_type = Normal
    elif args.posterior == "kumaraswamy":
        encoder_outputs = z_size * 2
        posterior_type = Kumaraswamy
    else:
        raise ValueError("Unknown posterior: %s" % args.posterior)

    if args.encoder == "basic":
        conditioner = FFConditioner(
            input_size=x_size + y_size,
            output_size=encoder_outputs,
            hidden_sizes=args.hidden_sizes,
        )
    elif args.encoder == "cnn":
        conditioner = Conv2DConditioner(
            input_size=x_size + y_size,
            output_size=encoder_outputs,
            context_size=y_size,
            width=args.width,
            height=args.height,
            output_channels=256,
            last_kernel_size=7,
        )
    else:
        raise ValueError("Unknown encoder architecture: %s" % args.encoder)
    q_z = ConditionalLayer(
        event_size=z_size,
        dist_type=posterior_type,
        conditioner=conditioner,
    )
    inf_model = InferenceModel(
        x_size=x_size, z_size=z_size, y_size=y_size, conditional_z=q_z
    ).to(device)
    print("\n# Inference Model", file=sys.stderr)
    print(inf_model, file=sys.stderr)

    print("\n# Optimizers", file=sys.stderr)
    gen_opt = get_optimizer(
        args.gen_opt,
        gen_model.parameters(),
        args.gen_lr,
        args.gen_l2_weight,
        args.gen_momentum,
    )
    gen_scheduler = ReduceLROnPlateau(
        gen_opt,
        factor=0.5,
        patience=args.patience,
        early_stopping=args.early_stopping,
        mode="max",
        threshold_mode="abs",
    )
    print(gen_opt, file=sys.stderr)
    inf_z_opt = get_optimizer(
        args.inf_z_opt,
        inf_model.parameters(),
        args.inf_z_lr,
        args.inf_z_l2_weight,
        args.inf_z_momentum,
    )
    inf_z_scheduler = ReduceLROnPlateau(
        inf_z_opt,
        factor=0.5,
        patience=args.patience,
        mode="max",
        threshold_mode="abs",
    )
    print(inf_z_opt, file=sys.stderr)

    self.optimizers = {"gen": gen_opt, "inf_z": inf_z_opt}
    self.schedulers = {"gen": gen_scheduler, "inf_z": inf_z_scheduler}
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.test_loader = test_loader
    self.models = {"gen": gen_model, "inf": inf_model}
    self.args = args
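# A hedged sketch of the prior/posterior pairing the checks above enforce: a
# Beta prior matched with a Kumaraswamy posterior (the Kumaraswamy has a
# closed-form inverse CDF, which makes reparameterised sampling tractable).
# All sizes below are hypothetical and only for illustration; the layer
# signatures follow the ones used in __init__ above.
def _demo_beta_kumaraswamy_pairing():
    z_size, x_size = 16, 28 * 28
    # Fixed Beta(0.5, 0.5) prior over each latent dimension
    p_z = PriorLayer(event_shape=z_size, dist_type=Beta, params=[0.5, 0.5])
    # A feed-forward conditioner predicting the 2 * z_size Kumaraswamy parameters
    conditioner = FFConditioner(
        input_size=x_size, output_size=z_size * 2, hidden_sizes=[256])
    q_z = ConditionalLayer(
        event_size=z_size, dist_type=Kumaraswamy, conditioner=conditioner)
    return p_z, q_z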
class CorrelatedBernoullisLM(GenerativeLM):
    r"""
    This parameterises an autoregressive product of Bernoulli distributions,

        P(x|z) = \prod_{v=1}^V Bern([v \in x]|b_v(z, x))

    where V is the vocabulary size and b(z, x) \in (0, 1)^V is autoregressive
    in x (we use a MADE).
    """

    def __init__(self, vocab_size, latent_size, hidden_sizes, pad_idx,
                 num_masks=1, resample_mask_every=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.resample_every = resample_mask_every
        self.counter = resample_mask_every
        self.vocab_size = vocab_size
        self.made_conditioner = MADEConditioner(
            input_size=vocab_size + latent_size,
            output_size=vocab_size,
            context_size=latent_size,
            hidden_sizes=hidden_sizes,
            num_masks=num_masks)
        self.product_of_bernoullis = AutoregressiveLikelihood(
            event_size=vocab_size,
            dist_type=Bernoulli,
            conditioner=self.made_conditioner)

    def make_indicators(self, x):
        """Return a vocab_size-dimensional bit-vector view of x."""
        # Convert ids to V-dimensional one-hot vectors and reduce-sum the time
        # dimension: this gives us word counts.
        # [B, T] -> [B, T, V] -> [B, V]
        word_counts = F.one_hot(x, self.vocab_size).sum(1)
        word_counts[:, self.pad_idx] = 0
        indicators = (word_counts > 0).float()
        return indicators

    def forward(self, x, z, state=dict()) -> Bernoulli:
        r"""
        Return Bernoulli distributions

            [v \in X]|z, \Sigma_{<v} ~ Bernoulli(b_v(z, \Sigma_{<v}))

        with shape [B, Vx] where Vx = |\Sigma| and \Sigma is the vocabulary.
        """
        # [B, V]
        indicators = self.make_indicators(x)
        if self.resample_every > 0:
            self.counter = self.counter - 1 if self.counter > 0 else self.resample_every
        return self.product_of_bernoullis(
            z, history=indicators,
            resample_mask=self.resample_every > 0 and self.counter == 0)

    def log_prob(self, likelihood: Bernoulli, x):
        # [B, V]
        indicators = self.make_indicators(x)
        # [B, V] -> [B]
        return likelihood.log_prob(indicators).sum(-1)

    def sample(self, z, max_len=None, greedy=False, state=dict()):
        """Sample from X|z where z is [B, Dz]."""
        shape = [z.size(0), self.product_of_bernoullis.event_size]
        if greedy:
            raise NotImplementedError("Greedy decoding not implemented for MADE")
        x = self.product_of_bernoullis.sample(
            z, torch.zeros(shape, dtype=z.dtype, device=z.device))
        return x
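# A minimal usage sketch for CorrelatedBernoullisLM. The sizes and tensors
# below are hypothetical (assumptions for illustration only), not part of the
# original module.
def _demo_correlated_bernoullis():
    import torch
    vocab_size, latent_size, batch_size, seq_len = 100, 16, 4, 20
    lm = CorrelatedBernoullisLM(
        vocab_size=vocab_size, latent_size=latent_size,
        hidden_sizes=[64, 64], pad_idx=0)
    z = torch.randn(batch_size, latent_size)                 # [B, Dz] latent codes
    x = torch.randint(1, vocab_size, (batch_size, seq_len))  # [B, T] token ids
    p_x = lm(x, z)                                           # Bernoulli, batch shape [B, V]
    print(lm.log_prob(p_x, x))                               # [B] log-likelihoods
    print(lm.sample(z).shape)                                # [B, V] sampled indicators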
class CorrelatedPoissonsLM(GenerativeLM):
    r"""
    This parameterises an autoregressive product of Poisson distributions,

        P(x|z) = \prod_{v=1}^V Poisson(c_v(x)|b_v(z, x))

    where V is the vocabulary size, c_v(x) counts the occurrences of v in x,
    and b(z, x) \in (0, \infty)^V is autoregressive in x (we use a MADE).
    """

    def __init__(self, vocab_size, latent_size, hidden_sizes, pad_idx,
                 num_masks=1, resample_mask_every=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.resample_every = resample_mask_every
        self.counter = resample_mask_every
        self.vocab_size = vocab_size
        self.made_conditioner = MADEConditioner(
            input_size=vocab_size + latent_size,
            output_size=vocab_size,
            context_size=latent_size,
            hidden_sizes=hidden_sizes,
            num_masks=num_masks)
        self.product_of_poissons = AutoregressiveLikelihood(
            event_size=vocab_size,
            dist_type=Poisson,
            conditioner=self.made_conditioner)

    def make_counts(self, x):
        """Return a vocab_size-dimensional count-vector view of x."""
        # Convert ids to V-dimensional one-hot vectors and reduce-sum the time
        # dimension: this gives us word counts.
        # [B, T] -> [B, T, V] -> [B, V]
        word_counts = F.one_hot(x, self.vocab_size).sum(1)
        # Zero out the padding count (we could actually leave it in: it would
        # be one way to model length).
        word_counts[:, self.pad_idx] = 0
        return word_counts.float()

    def forward(self, x, z, state=dict()) -> Poisson:
        r"""
        Return Poisson distributions

            c_v(X)|z, \Sigma_{<v} ~ Poisson(b_v(z, \Sigma_{<v}))

        with shape [B, Vx] where Vx = |\Sigma| and \Sigma is the vocabulary.
        """
        # [B, V]
        counts = self.make_counts(x)
        if self.resample_every > 0:
            self.counter = self.counter - 1 if self.counter > 0 else self.resample_every
        return self.product_of_poissons(
            z, history=counts,
            resample_mask=self.resample_every > 0 and self.counter == 0)

    def log_prob(self, likelihood: Poisson, x):
        # [B, V]
        counts = self.make_counts(x)
        # [B, V] -> [B]
        return likelihood.log_prob(counts).sum(-1)

    def sample(self, z, max_len=None, greedy=False, state=dict()):
        """Sample from X|z where z is [B, Dz]."""
        shape = [z.size(0), self.product_of_poissons.event_size]
        if greedy:
            raise NotImplementedError("Greedy decoding not implemented for MADE")
        x = self.product_of_poissons.sample(
            z, torch.zeros(shape, dtype=z.dtype, device=z.device))
        return x
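# A minimal usage sketch for CorrelatedPoissonsLM, mirroring the Bernoulli demo
# above. Sizes and tensors are hypothetical (assumptions for illustration only).
def _demo_correlated_poissons():
    import torch
    vocab_size, latent_size, batch_size, seq_len = 100, 16, 4, 20
    lm = CorrelatedPoissonsLM(
        vocab_size=vocab_size, latent_size=latent_size,
        hidden_sizes=[64, 64], pad_idx=0)
    z = torch.randn(batch_size, latent_size)                 # [B, Dz] latent codes
    x = torch.randint(1, vocab_size, (batch_size, seq_len))  # [B, T] token ids
    p_x = lm(x, z)                                           # Poisson, batch shape [B, V]
    print(lm.log_prob(p_x, x))                               # [B] log-likelihoods
    print(lm.sample(z).shape)                                # [B, V] sampled counts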
def __init__(self, args):
    print("\n# Hyperparameters", file=sys.stderr)
    pprint.pprint(args.__dict__, stream=sys.stderr)

    print("\n# Data", file=sys.stderr)
    print(" - MNIST", file=sys.stderr)
    print(" - digit_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
    print(" - data_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
    train_loader, valid_loader, test_loader = load_mnist(
        args.batch_size,
        save_to='{}/std/{}x{}'.format(args.data_dir, args.height, args.width),
        height=args.height,
        width=args.width)

    print("\n# Generative model", file=sys.stderr)
    print(" - binary outputs:", args.binarize, file=sys.stderr)
    print(" - distribution:", args.distribution, file=sys.stderr)
    print(" - conditional:", args.conditional, file=sys.stderr)
    x_size = args.width * args.height
    y_size = 10 if args.conditional else 0

    # CPU/CUDA device
    device = torch.device(args.device)

    if args.distribution == 'bernoulli':
        if not args.binarize:
            raise ValueError("--distribution bernoulli requires --binarize True")
        made = MADEConditioner(
            input_size=x_size + y_size,
            output_size=x_size * 1,
            context_size=y_size,
            hidden_sizes=args.hidden_sizes,
            num_masks=args.num_masks)
        model = AutoregressiveLikelihood(
            event_size=x_size,
            dist_type=Bernoulli,
            conditioner=made).to(device)
    else:
        raise ValueError("Unknown distribution: %s" % args.distribution)

    print("\n# Architecture", file=sys.stderr)
    print(model, file=sys.stderr)

    print("\n# Optimizer", file=sys.stderr)
    gen_opt = get_optimizer(
        args.gen_opt, model.parameters(), args.gen_lr,
        args.gen_l2_weight, args.gen_momentum)
    gen_scheduler = ReduceLROnPlateau(
        gen_opt,
        factor=0.5,
        patience=args.patience,
        early_stopping=args.early_stopping,
        mode='min',
        threshold_mode='abs')
    print(gen_opt, file=sys.stderr)

    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.test_loader = test_loader
    self.model = model
    self.gen_opt = gen_opt
    self.gen_scheduler = gen_scheduler
    self.args = args
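# A minimal sketch of evaluating the unconditional MADE density estimator
# constructed above. The sizes, the fake batch, and in particular the
# empty-context call are assumptions for illustration (context_size=0 follows
# the unconditional y_size=0 case in the code above).
def _demo_made_nll():
    import torch
    x_size = 28 * 28
    made = MADEConditioner(
        input_size=x_size, output_size=x_size, context_size=0,
        hidden_sizes=[500, 500], num_masks=1)
    model = AutoregressiveLikelihood(
        event_size=x_size, dist_type=Bernoulli, conditioner=made)
    x = torch.rand(8, x_size).round()       # a fake binarized batch [B, D]
    y = torch.zeros(8, 0)                   # empty context (unconditional case)
    p_x = model(y, history=x)               # Bernoulli with batch shape [B, D]
    return -p_x.log_prob(x).sum(-1).mean()  # average negative log-likelihood in nats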