# Example #1
 def __init__(self,
              vocab_size,
              latent_size,
              hidden_sizes,
              pad_idx,
              num_masks=1,
              resample_mask_every=0):
     """
     Wire up a MADE conditioner and an autoregressive product of
     Poissons over vocabulary counts.

     :param vocab_size: vocabulary size V
     :param latent_size: dimensionality of the latent code z
     :param hidden_sizes: hidden layer sizes of the MADE conditioner
     :param pad_idx: vocabulary id to be zeroed out of the counts
     :param num_masks: number of MADE masks to cycle through
     :param resample_mask_every: resample the MADE mask every this many
         calls; 0 disables resampling
     """
     super().__init__()
     self.vocab_size = vocab_size
     self.pad_idx = pad_idx
     # A value of 0 disables periodic mask resampling.
     self.resample_every = resample_mask_every
     self.counter = resample_mask_every
     conditioner = MADEConditioner(
         input_size=vocab_size + latent_size,
         output_size=vocab_size,
         context_size=latent_size,
         hidden_sizes=hidden_sizes,
         num_masks=num_masks)
     self.made_conditioner = conditioner
     self.product_of_poissons = AutoregressiveLikelihood(
         event_size=vocab_size,
         dist_type=Poisson,
         conditioner=conditioner)
# Example #2
    def __init__(self, args):
        """
        Build the full VAE experiment from parsed hyperparameters: data
        loaders, prior p(z), decoder/likelihood p(x|z), the generative model,
        the encoder/posterior q(z|x), the inference model, and per-model
        optimizers with LR schedulers.

        NOTE(review): `args` is presumably an argparse.Namespace carrying all
        the attributes read below (batch_size, prior, posterior, decoder,
        encoder, ...) — confirm against the caller.
        """

        print("\n# Hyperparameters", file=sys.stderr)
        pprint.pprint(args.__dict__, stream=sys.stderr)

        print("\n# Data", file=sys.stderr)
        print(" - Standard MNIST", file=sys.stderr)
        print(
            " - digit_dim=%d*%d" % (args.height, args.width), file=sys.stderr
        )
        print(" - data_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
        train_loader, valid_loader, test_loader = load_mnist(
            args.batch_size,
            save_to="{}/std/{}x{}".format(
                args.data_dir, args.height, args.width
            ),
            height=args.height,
            width=args.width,
        )

        # Flattened image size, latent size, and label size (10 classes when
        # the model is class-conditional, otherwise no label input).
        x_size = args.height * args.width
        z_size = args.latent_size
        y_size = 10 if args.conditional else 0

        # Configure prior
        # Each prior validates its own parameter list and restricts which
        # posterior families it may be paired with.
        if args.prior == "gaussian":
            prior_type = Normal
            if len(args.prior_params) != 2:
                raise ValueError(
                    "A Gaussian prior takes two parameters (loc, scale)"
                )
            if args.prior_params[1] <= 0:
                raise ValueError(
                    "The Gaussian scale must be strictly positive"
                )
            if args.posterior not in ["gaussian"]:
                raise ValueError(
                    (
                        "Pairing a Gaussian prior "
                        "with a %s posterior is not a good idea")
                    % args.posterior
                )
        elif args.prior == "beta":
            prior_type = Beta
            if len(args.prior_params) != 2:
                raise ValueError(
                    "A Beta prior takes two parameters (shape_a, shape_b)"
                )
            if args.prior_params[0] <= 0 or args.prior_params[1] <= 0:
                raise ValueError(
                    "The Beta shape parameters must be strictly positive"
                )
            if args.posterior not in ["kumaraswamy"]:
                raise ValueError(
                    (
                        "Pairing a Beta prior "
                        "with a %s posterior is not a good idea")
                    % args.posterior
                )
        else:
            raise ValueError("Unknown prior: %s" % args.prior)
        p_z = PriorLayer(
            event_shape=z_size, dist_type=prior_type, params=args.prior_params
        )

        # Configure likelihood
        if args.likelihood == "bernoulli":
            likelihood_type = Bernoulli
            # One Bernoulli parameter per pixel.
            decoder_outputs = 1 * x_size
        else:
            raise ValueError("Unknown likelihood: %s" % args.likelihood)

        # Decoder architecture: feed-forward, transposed-conv CNN, or MADE
        # (the MADE decoder additionally conditions on x autoregressively).
        if args.decoder == "basic":
            likelihood_conditioner = FFConditioner(
                input_size=z_size + y_size,
                output_size=decoder_outputs,
                context_size=y_size,
                hidden_sizes=args.hidden_sizes,
            )
        elif args.decoder == "cnn":
            likelihood_conditioner = TransposedConv2DConditioner(
                input_size=z_size + y_size,
                output_size=decoder_outputs,
                context_size=y_size,
                input_channels=32,
                output_channels=decoder_outputs // x_size,
                last_kernel_size=7,
            )
        elif args.decoder == "made":
            likelihood_conditioner = MADEConditioner(
                input_size=x_size + z_size + y_size,
                output_size=decoder_outputs,
                context_size=z_size + y_size,
                hidden_sizes=args.hidden_sizes,
                num_masks=1,
            )
        else:
            raise ValueError("Unknown decoder: %s" % args.decoder)

        # Only the MADE decoder yields an autoregressive likelihood; the
        # others factorize fully over pixels.
        if args.decoder == "made":
            conditional_x = AutoregressiveLikelihood(
                event_size=x_size,
                dist_type=likelihood_type,
                conditioner=likelihood_conditioner,
            )
        else:
            conditional_x = FullyFactorizedLikelihood(
                event_size=x_size,
                dist_type=likelihood_type,
                conditioner=likelihood_conditioner,
            )

        # CPU/CUDA device
        device = torch.device(args.device)

        # Create generative model P(z)P(x|z)
        gen_model = GenerativeModel(
            x_size=x_size,
            z_size=z_size,
            y_size=y_size,
            prior_z=p_z,
            conditional_x=conditional_x,
        ).to(device)
        print("\n# Generative Model", file=sys.stderr)
        print(gen_model, file=sys.stderr)

        # Configure posterior
        # Z|x,y
        # Both supported posteriors are parameterised by two z-sized vectors
        # (loc/scale for Gaussian, a/b shapes for Kumaraswamy).
        if args.posterior == "gaussian":
            encoder_outputs = z_size * 2
            posterior_type = Normal
        elif args.posterior == "kumaraswamy":
            encoder_outputs = z_size * 2
            posterior_type = Kumaraswamy
        else:
            raise ValueError("Unknown posterior: %s" % args.posterior)

        if args.encoder == "basic":
            conditioner = FFConditioner(
                input_size=x_size + y_size,
                output_size=encoder_outputs,
                hidden_sizes=args.hidden_sizes,
            )
        elif args.encoder == "cnn":
            conditioner = Conv2DConditioner(
                input_size=x_size + y_size,
                output_size=encoder_outputs,
                context_size=y_size,
                width=args.width,
                height=args.height,
                output_channels=256,
                last_kernel_size=7,
            )
        else:
            raise ValueError("Unknown encoder architecture: %s" % args.encoder)

        q_z = ConditionalLayer(
            event_size=z_size,
            dist_type=posterior_type,
            conditioner=conditioner,
        )

        inf_model = InferenceModel(
            x_size=x_size, z_size=z_size, y_size=y_size, conditional_z=q_z
        ).to(device)
        print("\n# Inference Model", file=sys.stderr)
        print(inf_model, file=sys.stderr)

        # Separate optimizers (and schedulers) for the generative and
        # inference models so each can use its own LR/regularization.
        print("\n# Optimizers", file=sys.stderr)
        gen_opt = get_optimizer(
            args.gen_opt,
            gen_model.parameters(),
            args.gen_lr,
            args.gen_l2_weight,
            args.gen_momentum,
        )
        # NOTE(review): `early_stopping` is not a kwarg of PyTorch's stock
        # ReduceLROnPlateau — this is presumably a project-local subclass;
        # confirm.
        gen_scheduler = ReduceLROnPlateau(
            gen_opt,
            factor=0.5,
            patience=args.patience,
            early_stopping=args.early_stopping,
            mode="max",
            threshold_mode="abs",
        )
        print(gen_opt, file=sys.stderr)

        inf_z_opt = get_optimizer(
            args.inf_z_opt,
            inf_model.parameters(),
            args.inf_z_lr,
            args.inf_z_l2_weight,
            args.inf_z_momentum,
        )
        inf_z_scheduler = ReduceLROnPlateau(
            inf_z_opt,
            factor=0.5,
            patience=args.patience,
            mode="max",
            threshold_mode="abs",
        )
        print(inf_z_opt, file=sys.stderr)

        self.optimizers = {"gen": gen_opt, "inf_z": inf_z_opt}
        self.schedulers = {"gen": gen_scheduler, "inf_z": inf_z_scheduler}

        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader
        self.models = {"gen": gen_model, "inf": inf_model}
        self.args = args
# Example #3
class CorrelatedBernoullisLM(GenerativeLM):
    """
    This parameterises an autoregressive product of Bernoulli distributions,
        P(x|z) = \prod_{v=1}^V Bern([v in x]|b_v(z, x))
    where V is the vocabulary size and b(z,x) \in (0, 1)^V is autoregressive in x (we use a MADE).
    """

    def __init__(self,
                 vocab_size,
                 latent_size,
                 hidden_sizes,
                 pad_idx,
                 num_masks=1,
                 resample_mask_every=0):
        """
        :param vocab_size: vocabulary size V
        :param latent_size: dimensionality of the latent code z
        :param hidden_sizes: hidden layer sizes of the MADE conditioner
        :param pad_idx: vocabulary id whose indicator is always forced to 0
        :param num_masks: number of MADE masks to cycle through
        :param resample_mask_every: resample the MADE mask every this many
            forward calls; 0 disables resampling
        """
        super().__init__()
        self.pad_idx = pad_idx
        self.resample_every = resample_mask_every
        self.counter = resample_mask_every
        self.vocab_size = vocab_size
        self.made_conditioner = MADEConditioner(
            input_size=vocab_size + latent_size,
            output_size=vocab_size,
            context_size=latent_size,
            hidden_sizes=hidden_sizes,
            num_masks=num_masks)
        self.product_of_bernoullis = AutoregressiveLikelihood(
            event_size=vocab_size,
            dist_type=Bernoulli,
            conditioner=self.made_conditioner)

    def make_indicators(self, x):
        """Return a vocab_size-dimensional bit-vector view of x"""
        # We convert ids to V-dimensional one-hot vectors and reduce-sum the
        # time dimension — this gives us word counts.
        # [B, T] -> [B, T, V] -> [B, V]
        word_counts = F.one_hot(x, self.vocab_size).sum(1)
        # The padding symbol never counts as observed.
        word_counts[:, self.pad_idx] = 0
        indicators = (word_counts > 0).float()
        return indicators

    def forward(self, x, z, state=None) -> Bernoulli:
        """
        Return Bernoulli distributions
            [v \in X]|z, \Sigma_{<v} ~ Bernoulli(b_v(z, \Sigma_{<v}))
        with shape [B, Vx] where Vx = |\Sigma| and \Sigma is the vocabulary.

        :param state: unused here; kept for interface compatibility. (The
            default used to be a mutable ``dict()`` — a Python pitfall, since
            the same dict object is shared across all calls.)
        """
        # [B, V] bit vector of which words occur in x
        indicators = self.make_indicators(x)
        # Count down towards the next mask resample; the counter wraps back
        # to resample_every after reaching zero.
        if self.resample_every > 0:
            self.counter = self.counter - 1 if self.counter > 0 else self.resample_every
        return self.product_of_bernoullis(z,
                                          history=indicators,
                                          resample_mask=self.resample_every > 0
                                          and self.counter == 0)

    def log_prob(self, likelihood: Bernoulli, x):
        """Sum the per-word Bernoulli log-probabilities of x's indicators."""
        # [B, V]
        indicators = self.make_indicators(x)
        # [B, V] -> [B]
        return likelihood.log_prob(indicators).sum(-1)

    def sample(self, z, max_len=None, greedy=False, state=None):
        """
        Sample from X|z where z [B, Dz]

        :param max_len: unused for this bag-of-words likelihood
        :param greedy: greedy decoding is not supported by the MADE sampler
        :param state: unused here; kept for interface compatibility
            (previously a mutable ``dict()`` default)
        """
        if greedy:
            raise NotImplementedError(
                "Greedy decoding not implemented for MADE")
        shape = [z.size(0), self.product_of_bernoullis.event_size]
        x = self.product_of_bernoullis.sample(
            z, torch.zeros(shape, dtype=z.dtype, device=z.device))
        return x
# Example #4
class CorrelatedPoissonsLM(GenerativeLM):
    """
    This parameterises an autoregressive product of Poisson distributions,
        P(x|z) = \prod_{v=1}^V Bern(c_v(x)|b_v(z, x))
    where V is the vocabulary size, c_v(x) counts the occurrences of v in x,
    and b(z,x) \in (0, infty)^V is autoregressive in x (we use a MADE).
    """

    def __init__(self,
                 vocab_size,
                 latent_size,
                 hidden_sizes,
                 pad_idx,
                 num_masks=1,
                 resample_mask_every=0):
        """
        :param vocab_size: vocabulary size V
        :param latent_size: dimensionality of the latent code z
        :param hidden_sizes: hidden layer sizes of the MADE conditioner
        :param pad_idx: vocabulary id whose count is always forced to 0
        :param num_masks: number of MADE masks to cycle through
        :param resample_mask_every: resample the MADE mask every this many
            forward calls; 0 disables resampling
        """
        super().__init__()
        self.pad_idx = pad_idx
        self.resample_every = resample_mask_every
        self.counter = resample_mask_every
        self.vocab_size = vocab_size
        self.made_conditioner = MADEConditioner(
            input_size=vocab_size + latent_size,
            output_size=vocab_size,
            context_size=latent_size,
            hidden_sizes=hidden_sizes,
            num_masks=num_masks)
        self.product_of_poissons = AutoregressiveLikelihood(
            event_size=vocab_size,
            dist_type=Poisson,
            conditioner=self.made_conditioner)

    def make_counts(self, x):
        """Return a vocab_size-dimensional count-vector view of x"""
        # We convert ids to V-dimensional one-hot vectors and reduce-sum the
        # time dimension — this gives us word counts.
        # [B, T] -> [B, T, V] -> [B, V]
        word_counts = F.one_hot(x, self.vocab_size).sum(1)
        # Zero out padding counts. (We could actually leave them in: that
        # would be one way to model sequence length.)
        word_counts[:, self.pad_idx] = 0
        return word_counts.float()

    def forward(self, x, z, state=None) -> Poisson:
        """
        Return Poisson distributions
            c_v(X)|z, \Sigma_{<v} ~ Poisson(b_v(z, \Sigma_{<v}))
        with shape [B, Vx] where Vx = |\Sigma| and \Sigma is the vocabulary.

        :param state: unused here; kept for interface compatibility. (The
            default used to be a mutable ``dict()`` — a Python pitfall, since
            the same dict object is shared across all calls.)
        """
        # [B, V] word counts of x
        counts = self.make_counts(x)
        # Count down towards the next mask resample; the counter wraps back
        # to resample_every after reaching zero.
        if self.resample_every > 0:
            self.counter = self.counter - 1 if self.counter > 0 else self.resample_every
        return self.product_of_poissons(z,
                                        history=counts,
                                        resample_mask=self.resample_every > 0
                                        and self.counter == 0)

    def log_prob(self, likelihood: Poisson, x):
        """Sum the per-word Poisson log-probabilities of x's counts."""
        # [B, V]
        counts = self.make_counts(x)
        # [B, V] -> [B]
        return likelihood.log_prob(counts).sum(-1)

    def sample(self, z, max_len=None, greedy=False, state=None):
        """
        Sample from X|z where z [B, Dz]

        :param max_len: unused for this bag-of-words likelihood
        :param greedy: greedy decoding is not supported by the MADE sampler
        :param state: unused here; kept for interface compatibility
            (previously a mutable ``dict()`` default)
        """
        if greedy:
            raise NotImplementedError(
                "Greedy decoding not implemented for MADE")
        shape = [z.size(0), self.product_of_poissons.event_size]
        x = self.product_of_poissons.sample(
            z, torch.zeros(shape, dtype=z.dtype, device=z.device))
        return x
# Example #5
    def __init__(self, args):
        """
        Build the MADE density-estimation experiment from parsed
        hyperparameters: MNIST loaders, the autoregressive Bernoulli model,
        its optimizer and LR scheduler.
        """
        print("\n# Hyperparameters", file=sys.stderr)
        pprint.pprint(args.__dict__, stream=sys.stderr)

        print("\n# Data", file=sys.stderr)
        print(" - MNIST", file=sys.stderr)
        print(" - digit_dim=%d*%d" % (args.height, args.width),
              file=sys.stderr)
        print(" - data_dim=%d*%d" % (args.height, args.width), file=sys.stderr)
        loaders = load_mnist(
            args.batch_size,
            save_to='{}/std/{}x{}'.format(args.data_dir, args.height,
                                          args.width),
            height=args.height,
            width=args.width)
        train_loader, valid_loader, test_loader = loaders

        print("\n# Generative model", file=sys.stderr)
        print(" - binary outputs:", args.binarize, file=sys.stderr)
        print(" - distribution:", args.distribution, file=sys.stderr)
        print(" - conditional:", args.conditional, file=sys.stderr)
        # Flattened image size and (optional) class-label size.
        x_size = args.width * args.height
        y_size = 10 if args.conditional else 0
        # CPU/CUDA device
        device = torch.device(args.device)
        # Guard clauses: only the Bernoulli likelihood is supported, and it
        # requires binarized pixels.
        if args.distribution != 'bernoulli':
            raise ValueError("I do not know this likelihood: %s" %
                             args.distribution)
        if not args.binarize:
            raise ValueError(
                "--distribution bernoulli requires --binarize True")
        conditioner = MADEConditioner(input_size=x_size + y_size,
                                      output_size=x_size * 1,
                                      context_size=y_size,
                                      hidden_sizes=args.hidden_sizes,
                                      num_masks=args.num_masks)
        model = AutoregressiveLikelihood(event_size=x_size,
                                         dist_type=Bernoulli,
                                         conditioner=conditioner).to(device)

        print("\n# Architecture", file=sys.stderr)
        print(model, file=sys.stderr)

        print("\n# Optimizer", file=sys.stderr)
        optimizer = get_optimizer(args.gen_opt, model.parameters(),
                                  args.gen_lr, args.gen_l2_weight,
                                  args.gen_momentum)
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=0.5,
                                      patience=args.patience,
                                      early_stopping=args.early_stopping,
                                      mode='min',
                                      threshold_mode='abs')
        print(optimizer, file=sys.stderr)

        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader
        self.model = model
        self.gen_opt = optimizer
        self.gen_scheduler = scheduler
        self.args = args