Example #1
def train_dae(NET, dae_layer, mlp_params, sgd_params):
    """Run DAE training test."""

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset)

    # Run denoising autoencoder training on the given layer of NET
    NT.train_dae(NET=NET, \
        dae_layer=dae_layer, \
        mlp_params=mlp_params, \
        sgd_params=sgd_params, \
        datasets=datasets)
    return
Example #2
def train_dae(NET, dae_layer, mlp_params, sgd_params):
    """Run DAE training test."""

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset)

    # Run denoising autoencoder training on the given layer of NET
    NT.train_dae(NET=NET, \
        dae_layer=dae_layer, \
        mlp_params=mlp_params, \
        sgd_params=sgd_params, \
        datasets=datasets)
    return
Example #3
def test_mnist_img(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva[:500], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    img_match_on_known, img_match_on_unknown = TM.best_match_img(xo, xm)

    display_count = 100
    # visualize matches on known elements
    Xs = np.zeros((2*display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2*idx] = xi[idx]
        Xs[(2*idx)+1] = img_match_on_known[idx]
    file_name = "{0:s}_SAMPLES_MOK.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # visualize matches on unknown elements
    Xs = np.zeros((2*display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2*idx] = xi[idx]
        Xs[(2*idx)+1] = img_match_on_unknown[idx]
    file_name = "{0:s}_SAMPLES_MOU.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    return
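construct_masked_data is used above but not shown. As a rough, self-contained illustration (not the project's actual implementation), a comparable mask can be built with plain numpy: pixels are dropped independently with probability drop_prob, one random occ_dim x occ_dim square is occluded, and masked pixels are filled with data_mean.

import numpy as np

def make_masked_batch(X, drop_prob=0.0, occ_dim=15, data_mean=None, rng=None):
    """Illustrative stand-in for construct_masked_data; not the original helper."""
    rng = rng if rng is not None else np.random.RandomState(1234)
    n = X.shape[0]
    imgs = X.reshape((n, 28, 28))
    mask = (rng.rand(n, 28, 28) > drop_prob).astype(X.dtype)  # 1 = observed, 0 = dropped
    if occ_dim > 0:
        for m in mask:  # occlude one random occ_dim x occ_dim square per image
            r, c = rng.randint(0, 29 - occ_dim, size=2)
            m[r:r + occ_dim, c:c + occ_dim] = 0.0
    xi = imgs.reshape((n, -1))
    xm = mask.reshape((n, -1))
    fill = 0.0 if data_mean is None else data_mean
    xo = (xm * xi) + ((1.0 - xm) * fill)  # masked pixels replaced by the fill value
    return xi, xo, xm  # analogous to the (xi, xo, xm) triple used above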
Example #4
def test_mnist_img(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], )))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva[:500], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    img_match_on_known, img_match_on_unknown = TM.best_match_img(xo, xm)

    display_count = 100
    # visualize matches on known elements
    Xs = np.zeros((2 * display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2 * idx] = xi[idx]
        Xs[(2 * idx) + 1] = img_match_on_known[idx]
    file_name = "{0:s}_SAMPLES_MOK.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # visualize matches on unknown elements
    Xs = np.zeros((2 * display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2 * idx] = xi[idx]
        Xs[(2 * idx) + 1] = img_match_on_unknown[idx]
    file_name = "{0:s}_SAMPLES_MOU.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    return
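The pairing loops above place an original in each even row and its match in the following odd row; the same interleaving can be written with strided assignment (reusing the names already defined in the example), which avoids the Python-level loop:

Xs = np.zeros((2 * display_count, Xva.shape[1]))
Xs[0::2] = xi[:display_count]                    # even rows: original (masked) inputs
Xs[1::2] = img_match_on_known[:display_count]    # odd rows: their best matches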
Example #5
def test_mnist_nll(occ_dim=15, drop_prob=0.0):
    RESULT_PATH = "IMP_MNIST_TM/"
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva, drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    result = TM.best_match_nll(xo, xm)
    match_on_known = np.mean(result[0])
    match_on_unknown = np.mean(result[1])
    str0 = "Test 1:"
    str1 = "    match on known   : {}".format(match_on_known)
    str2 = "    match on unknown : {}".format(match_on_unknown)
    joint_str = "\n".join([str0, str1, str2])
    print(joint_str)
    out_file.write(joint_str+"\n")
    out_file.flush()
    out_file.close()
    return
Example #6
def test_mnist_nll(occ_dim=15, drop_prob=0.0):
    RESULT_PATH = "IMP_MNIST_TM/"
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], )))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva, drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    result = TM.best_match_nll(xo, xm)
    match_on_known = np.mean(result[0])
    match_on_unknown = np.mean(result[1])
    str0 = "Test 1:"
    str1 = "    match on known   : {}".format(match_on_known)
    str2 = "    match on unknown : {}".format(match_on_unknown)
    joint_str = "\n".join([str0, str1, str2])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
    out_file.close()
    return
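A hypothetical driver for test_mnist_nll might look like the sketch below; the (occ_dim, drop_prob) pairs mirror settings used in other examples here, and the directory named by RESULT_PATH must already exist because result_tag is used directly as a file prefix.

if __name__ == "__main__":
    # hypothetical sweep over occlusion settings; not part of the original example
    for occ_dim, drop_prob in [(15, 0.0), (16, 0.0), (0, 0.8)]:
        test_mnist_nll(occ_dim=occ_dim, drop_prob=drop_prob)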
Example #7
    from PeaNet import PeaNet
    from InfNet import InfNet
    from GenNet import GenNet
    from GIPair import GIPair
    from NetLayers import relu_actfun, softplus_actfun, \
                          safe_softmax, safe_log

    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]

    # get and set some basic dataset information
    tr_samples = Xtr.get_value(borrow=True).shape[0]
    data_dim = Xtr.get_value(borrow=True).shape[1]
    prior_dim = 100
    prior_sigma = 1.0
    
    # Do moment matching in some transformed space
    mm_proj_dim = 250
    #P = np.identity(data_dim)
    P = npr.randn(data_dim, mm_proj_dim) / np.sqrt(float(mm_proj_dim))
    P = theano.shared(value=P.astype(theano.config.floatX), name='P_proj')

    target_mean, target_cov = projected_moments(Xtr, P, ary_type='theano')
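A quick numpy-only check of the 1/sqrt(mm_proj_dim) scaling used for P above: with entries drawn as randn(d, m) / sqrt(m), the projection preserves expected squared norms, so statistics computed in the projected space stay on a scale comparable to the original data. This sketch is independent of the Theano code above.

import numpy as np

rng = np.random.RandomState(0)
data_dim, mm_proj_dim = 784, 250
x = rng.rand(data_dim)
proj_norms = []
for _ in range(200):
    P = rng.randn(data_dim, mm_proj_dim) / np.sqrt(float(mm_proj_dim))
    proj_norms.append(np.sum(x.dot(P) ** 2))
print("||x||^2        : {0:.3f}".format(np.sum(x ** 2)))
print("mean ||x P||^2 : {0:.3f}".format(np.mean(proj_norms)))  # close to ||x||^2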
Example #8
def test_imocld_mnist(step_type="add", attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = "data/mnist.pkl.gz"
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 16

    rnninits = {"weights_init": IsotropicGaussian(0.01), "biases_init": Constant(0.0)}
    inits = {"weights_init": IsotropicGaussian(0.01), "biases_init": Constant(0.0)}

    att_tag = "NA"  # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2 * x_dim  # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], name="writer_mlp", **inits)

    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP(
        [Tanh(), Tanh()],
        [mix_dim, 250, (2 * enc_dim + 2 * dec_dim + 2 * enc_dim + mix_dim)],
        name="mix_dec_mlp",
        **inits
    )
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4 * enc_dim], name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4 * enc_dim], name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4 * dec_dim], name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # LSTMs for the actual LSTMs (obviously, perhaps)
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
        n_iter,
        step_type=step_type,  # step_type can be 'add' or 'jump'
        reader_mlp=reader_mlp,
        writer_mlp=writer_mlp,
        mix_enc_mlp=mix_enc_mlp,
        mix_dec_mlp=mix_dec_mlp,
        mix_var_mlp=mix_var_mlp,
        enc_mlp_in=enc_mlp_in,
        enc_mlp_out=enc_mlp_out,
        enc_rnn=enc_rnn,
        dec_mlp_in=dec_mlp_in,
        dec_rnn=dec_rnn,
        var_mlp_in=var_mlp_in,
        var_mlp_out=var_mlp_out,
        var_rnn=var_rnn,
    )
    draw.initialize()
    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    # sample several interchangeable versions of the model
    conditions = [{"occ_dim": 0, "drop_prob": 0.8}, {"occ_dim": 16, "drop_prob": 0.0}]
    for cond_dict in conditions:
        occ_dim = cond_dict["occ_dim"]
        drop_prob = cond_dict["drop_prob"]
        dp_int = int(100.0 * drop_prob)

        draw.load_model_params(
            f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag)
        )

        # draw some independent samples from the model
        Xva = row_shuffle(Xva)
        Xb = to_fX(Xva[:128])
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, occ_dim=occ_dim, data_mean=None)
        Xb = np.repeat(Xb, 2, axis=0)
        Mb = np.repeat(Mb, 2, axis=0)
        samples = draw.do_sample(Xb, Mb)

        # save the samples to a pkl file, in their numpy array form
        sample_pkl_name = "IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}.pkl".format(occ_dim, dp_int, step_type)
        f_handle = file(sample_pkl_name, "wb")
        cPickle.dump(samples, f_handle, protocol=-1)
        f_handle.close()
        print("Saved some samples in: {}".format(sample_pkl_name))
    return
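A minimal, hypothetical invocation of the function above; the parameter pickle named in load_model_params must already exist for each (occ_dim, drop_prob, step_type) combination, and 'add' / 'jump' are the two step types mentioned in the model's comment.

if __name__ == "__main__":
    # hypothetical driver; not part of the original example
    for st in ("add", "jump"):
        test_imocld_mnist(step_type=st, attention=False)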
Example #9
def train_walk_from_pretrained_osm(lam_kld=0.0):
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),
                                                      str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    data_dim = Xtr.shape[1]
    batch_size = 100
    batch_reps = 5
    prior_sigma = 1.0
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr, axis=1))
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    ###############################
    # Setup discriminator network #
    ###############################
    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [data_dim, (300, 4), (300, 4), 10]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {
        'proto_key': 0,
        'input_noise': 0.0,
        'bias_noise': 0.1,
        'do_dropout': True
    }
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['init_scale'] = 0.75
    dn_params['lam_l2a'] = 1e-2
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5
    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd, params=dn_params)
    DN.init_biases(0.0)

    #######################################################
    # Load inferencer and generator from saved parameters #
    #######################################################
    gn_fname = RESULT_PATH + "pt_osm_params_b80000_GN.pkl"
    in_fname = RESULT_PATH + "pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)

    ########################################################
    # Define parameters for the VCGLoop, and initialize it #
    ########################################################
    print("Building the VCGLoop...")
    vcgl_params = {}
    vcgl_params['x_type'] = 'bernoulli'
    vcgl_params['xt_transform'] = 'sigmoid'
    vcgl_params['logvar_bound'] = LOGVAR_BOUND
    vcgl_params['cost_decay'] = 0.0
    vcgl_params['chain_type'] = 'walkout'
    vcgl_params['lam_l2d'] = 5e-2
    VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, \
                 i_net=IN, g_net=GN, d_net=DN, chain_len=6, \
                 data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params)

    out_file = open(RESULT_PATH + "pt_walk_results.txt", 'wb')
    ####################################################
    # Train the VCGLoop by unrolling and applying BPTT #
    ####################################################
    learn_rate = 0.0004
    cost_1 = [0. for i in range(10)]
    for i in range(50000):
        scale = float(min((i + 1), 5000)) / 5000.0
        if (((i + 1) % 25000) == 0):
            learn_rate = learn_rate * 0.9
        ########################################
        # TRAIN THE CHAIN IN FREE-RUNNING MODE #
        ########################################
        VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \
                                mom_1=0.5, mom_2=0.99)
        VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0)
        VCGL.set_lam_chain_nll(1.0)
        VCGL.set_lam_chain_kld(lam_kld)
        # get some data to train with
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, ))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # examples from the target distribution, to train discriminator
        tr_idx = npr.randint(low=0, high=tr_samples, size=(2 * batch_size, ))
        Xt_batch = Xtr.take(tr_idx, axis=0)
        # do a minibatch update of the model, and compute some costs
        outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch,
                                   batch_reps)
        cost_1 = [(cost_1[k] + 1. * outputs[k]) for k in range(len(outputs))]
        if ((i % 500) == 0):
            cost_1 = [(v / 500.0) for v in cost_1]
            o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \
                    i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6])
            print(o_str_1)
            cost_1 = [0. for v in cost_1]
        if ((i % 1000) == 0):
            tr_idx = npr.randint(low=0, high=Xtr.shape[0], size=(5, ))
            va_idx = npr.randint(low=0, high=Xva.shape[0], size=(5, ))
            Xd_batch = np.vstack(
                [Xtr.take(tr_idx, axis=0),
                 Xva.take(va_idx, axis=0)])
            # draw some chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_chain_samples_b{0:d}.png".format(
                i)
            Xd_samps = np.repeat(Xd_batch, 3, axis=0)
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some masked chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_mask_samples_b{0:d}.png".format(
                i)
            Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0], :], 3, axis=0)
            Xc_samps = np.repeat(Xd_batch, 3, axis=0)
            Xm_rand = sample_masks(Xc_samps, drop_prob=0.0)
            Xm_patch = sample_patch_masks(Xc_samps, (28, 28), (16, 16))
            Xm_samps = Xm_rand * Xm_patch
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, \
                    X_c=Xc_samps, X_m=Xm_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some samples independently from the GenNet's prior
            file_name = RESULT_PATH + "pt_walk_prior_samples_b{0:d}.png".format(
                i)
            Xs = VCGL.sample_from_prior(20 * 20)
            utils.visualize_samples(Xs, file_name, num_rows=20)
        # DUMP PARAMETERS FROM TIME-TO-TIME
        if (i % 10000 == 0):
            DN.save_to_file(f_name=RESULT_PATH +
                            "pt_walk_params_b{0:d}_DN.pkl".format(i))
            IN.save_to_file(f_name=RESULT_PATH +
                            "pt_walk_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH +
                            "pt_walk_params_b{0:d}_GN.pkl".format(i))
    return
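The loop above ramps the effective learning rate linearly over the first 5000 batches and (via the (i + 1) % 25000 check) multiplies learn_rate by 0.9 every 25000 batches; a standalone sketch of that per-batch schedule:

def effective_learn_rate(i, base_lr=0.0004):
    """Sketch of the warm-up plus step-decay schedule used in the loop above."""
    scale = float(min(i + 1, 5000)) / 5000.0          # linear warm-up
    decayed = base_lr * (0.9 ** ((i + 1) // 25000))   # 0.9x decay every 25000 batches
    return scale * decayed

for i in (0, 2499, 4999, 24999, 49999):
    print("batch {0:d}: lr = {1:.6f}".format(i, effective_learn_rate(i)))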
Example #10
def test_gip_sigma_scale_mnist():
    from LogPDFs import cross_validate_sigma

    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(12345)

    # Load some data to train/validate/test with
    dataset = "data/mnist.pkl.gz"
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    batch_size = 100
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0).astype(theano.config.floatX)

    # Symbolic inputs
    Xd = T.matrix(name="Xd")
    Xc = T.matrix(name="Xc")
    Xm = T.matrix(name="Xm")
    Xt = T.matrix(name="Xt")

    # Load inferencer and generator from saved parameters
    gn_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_GN.pkl"
    in_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)
    x_dim = IN.shared_layers[0].in_dim
    z_dim = IN.mu_layers[-1].out_dim
    # construct a GIPair with the loaded InfNet and GenNet
    osm_params = {}
    osm_params["x_type"] = "gaussian"
    osm_params["xt_transform"] = "sigmoid"
    osm_params["logvar_bound"] = LOGVAR_BOUND
    OSM = OneStageModel(
        rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, p_x_given_z=GN, q_z_given_x=IN, x_dim=x_dim, z_dim=z_dim, params=osm_params
    )
    # compute variational likelihood bound and its sub-components
    Xva = row_shuffle(Xva)
    Xb = Xva[0:5000]
    file_name = "AX_MNIST_MAX_KLD_POST_KLDS.png"
    post_klds = OSM.compute_post_klds(Xb)
    post_dim_klds = np.mean(post_klds, axis=0)
    utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, file_name)
    # compute information about free-energy on validation set
    file_name = "AX_MNIST_MAX_KLD_FREE_ENERGY.png"
    fe_terms = OSM.compute_fe_terms(Xb, 20)
    utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, x_label="Posterior KLd", y_label="Negative Log-likelihood")

    # bound_results = OSM.compute_ll_bound(Xva)
    # ll_bounds = bound_results[0]
    # post_klds = bound_results[1]
    # log_likelihoods = bound_results[2]
    # max_lls = bound_results[3]
    # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds)))
    # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds)))
    # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods)))
    # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls)))
    # print("min ll bound: {0:.4f}".format(np.min(ll_bounds)))
    # print("max posterior KLd: {0:.4f}".format(np.max(post_klds)))
    # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods)))
    # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls)))
    # # compute some information about the approximate posteriors
    # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva)
    # all_post_klds = np.sort(post_stats[0].ravel()) # post KLds for each obs and dim
    # obs_post_klds = np.sort(post_stats[1]) # summed post KLds for each obs
    # post_dim_klds = post_stats[2] # average post KLds for each post dim
    # post_dim_vars = post_stats[3] # average squared mean for each post dim
    # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png")
    # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, "AAA_OBS_POST_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png")

    # draw many samples from the GIP
    for i in range(5):
        tr_idx = npr.randint(low=0, high=tr_samples, size=(100,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xs = []
        for row in range(3):
            Xs.append([])
            for col in range(3):
                sample_lists = OSM.sample_from_chain(Xd_batch[0:10, :], loop_iters=100, sigma_scale=1.0)
                Xs[row].append(group_chains(sample_lists["data samples"]))
        Xs, block_im_dim = block_video(Xs, (28, 28), (3, 3))
        to_video(Xs, block_im_dim, "AX_MNIST_MAX_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10)
    file_name = "AX_MNIST_MAX_KLD_PRIOR_SAMPLE.png"
    Xs = OSM.sample_from_prior(20 * 20)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # # test Parzen density estimator built from prior samples
    # Xs = OSM.sample_from_prior(10000)
    # [best_sigma, best_ll, best_lls] = \
    #         cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20)
    # sort_idx = np.argsort(best_lls)
    # sort_idx = sort_idx[0:400]
    # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_MAX_KLD_BEST_LLS_1.png")
    # utils.visualize_samples(Xva[sort_idx], "A_MNIST_MAX_KLD_BAD_DIGITS_1.png", num_rows=20)
    return
Example #11
def test_imocld_mnist(step_type='add', attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 16
    
    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    att_tag = "NA" # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2*x_dim # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)
    
    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim + mix_dim)], \
                      name="mix_dec_mlp", **inits)
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4*enc_dim], \
                     name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4*enc_dim], \
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [                         z_dim, 4*dec_dim], \
                     name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # LSTMs for the actual LSTMs (obviously, perhaps)
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
                n_iter,
                step_type=step_type, # step_type can be 'add' or 'jump'
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                mix_enc_mlp=mix_enc_mlp,
                mix_dec_mlp=mix_dec_mlp,
                mix_var_mlp=mix_var_mlp,
                enc_mlp_in=enc_mlp_in,
                enc_mlp_out=enc_mlp_out,
                enc_rnn=enc_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_rnn=dec_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    draw.initialize()
    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    # sample several interchangeable versions of the model
    conditions = [{'occ_dim': 0, 'drop_prob': 0.8}, \
                  {'occ_dim': 16, 'drop_prob': 0.0}]
    for cond_dict in conditions:
        occ_dim = cond_dict['occ_dim']
        drop_prob = cond_dict['drop_prob']
        dp_int = int(100.0 * drop_prob)

        draw.load_model_params(f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))

        # draw some independent samples from the model
        Xva = row_shuffle(Xva)
        Xb = to_fX(Xva[:128])
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                occ_dim=occ_dim, data_mean=None)
        Xb = np.repeat(Xb, 2, axis=0)
        Mb = np.repeat(Mb, 2, axis=0)
        samples, _ = draw.do_sample(Xb, Mb)

        # save the samples to a pkl file, in their numpy array form
        sample_pkl_name = "IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}.pkl".format(occ_dim, dp_int, step_type)
        f_handle = file(sample_pkl_name, 'wb')
        cPickle.dump(samples, f_handle, protocol=-1)
        f_handle.close()
        print("Saved some samples in: {}".format(sample_pkl_name))
    return
Example #12
def test_lstm_structpred(step_type='add', use_pol=True, use_binary=False):
    ###########################################
    # Make a tag for identifying result files #
    ###########################################
    pol_tag = "P1" if use_pol else "P0"
    bin_tag = "B1" if use_binary else "B0"
    res_tag = "STRUCT_PRED_RESULTS/SP_LSTM_{}_{}_{}".format(
        step_type, pol_tag, bin_tag)

    if use_binary:
        ############################
        # Get binary training data #
        ############################
        rng = np.random.RandomState(1234)
        Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
        #Xtr = np.vstack((Xtr, Xva))
        #Xva = Xte
    else:
        ################################
        # Get continuous training data #
        ################################
        rng = np.random.RandomState(1234)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, as_shared=False, zero_mean=False)
        Xtr = datasets[0][0]
        Xva = datasets[1][0]
        Xte = datasets[2][0]
        #Xtr = np.concatenate((Xtr, Xva), axis=0)
        #Xva = Xte
        Xtr = to_fX(shift_and_scale_into_01(Xtr))
        Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    ########################################################
    # Split data into "observation" and "prediction" parts #
    ########################################################
    obs_cols = 14  # number of columns to observe
    pred_cols = 28 - obs_cols  # number of columns to predict
    x_dim = obs_cols * 28  # dimensionality of observations
    y_dim = pred_cols * 28  # dimensionality of predictions
    Xtr, Ytr = img_split(Xtr,
                         im_dim=(28, 28),
                         split_col=obs_cols,
                         transposed=True)
    Xva, Yva = img_split(Xva,
                         im_dim=(28, 28),
                         split_col=obs_cols,
                         transposed=True)

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    read_dim = 128
    write_dim = 128
    mlp_dim = 128
    rnn_dim = 128
    z_dim = 64
    n_iter = 15

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup reader/writer models
    reader_mlp = MLP([Rectifier(), Tanh()], [x_dim, mlp_dim, read_dim],
                     name="reader_mlp",
                     **inits)
    writer_mlp = MLP([Rectifier(), None], [rnn_dim, mlp_dim, y_dim],
                     name="writer_mlp",
                     **inits)

    # setup submodels for processing LSTM inputs
    pol_inp_dim = y_dim + read_dim + rnn_dim
    var_inp_dim = y_dim + y_dim + read_dim + rnn_dim
    pol_mlp_in = MLP([Identity()], [pol_inp_dim, 4 * rnn_dim],
                     name="pol_mlp_in",
                     **inits)
    var_mlp_in = MLP([Identity()], [var_inp_dim, 4 * rnn_dim],
                     name="var_mlp_in",
                     **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4 * rnn_dim],
                     name="dec_mlp_in",
                     **inits)

    # setup submodels for turning LSTM states into conditionals over z
    pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)
    dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits)

    # setup the LSTMs for primary policy, guide policy, and shared dynamics
    pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="pol_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    model = IRStructPredModel(n_iter,
                              step_type=step_type,
                              use_pol=use_pol,
                              reader_mlp=reader_mlp,
                              writer_mlp=writer_mlp,
                              pol_mlp_in=pol_mlp_in,
                              pol_mlp_out=pol_mlp_out,
                              pol_rnn=pol_rnn,
                              var_mlp_in=var_mlp_in,
                              var_mlp_out=var_mlp_out,
                              var_rnn=var_rnn,
                              dec_mlp_in=dec_mlp_in,
                              dec_mlp_out=dec_mlp_out,
                              dec_rnn=dec_rnn)
    model.initialize()

    compile_start_time = time.time()

    # build the cost gradients, training function, samplers, etc.
    model.build_sampling_funcs()
    print("Testing model sampler...")
    # draw some independent samples from the model
    samp_count = 10
    samp_reps = 3
    x_in = Xtr[:10, :].repeat(samp_reps, axis=0)
    y_in = Ytr[:10, :].repeat(samp_reps, axis=0)
    x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
    # TODO: visualize sample prediction trajectories
    img_seq = seq_img_join(x_samps, y_samps, im_dim=(28, 28), transposed=True)
    seq_len = len(img_seq)
    samp_count = img_seq[0].shape[0]
    seq_samps = np.zeros((seq_len * samp_count, img_seq[0].shape[1]))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = img_seq[s2][s1]
            idx += 1
    file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, 0)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)

    model.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(res_tag), 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(300000):
        scale = min(1.0, ((i + 1) / 5000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        model.set_sgd_params(lr=scale * learn_rate,
                             mom_1=scale * momentum,
                             mom_2=0.98)
        model.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.1)
        model.set_grad_noise(grad_noise=0.02)
        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        Yb = to_fX(Ytr.take(batch_idx, axis=0))
        result = model.train_joint(Xb, Yb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]

        # diagnostics
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            model.save_model_params("{}_params.pkl".format(res_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva, Yva = row_shuffle(Xva, Yva)
            Xb = to_fX(Xva[:5000])
            Yb = to_fX(Yva[:5000])
            va_costs = model.compute_nll_bound(Xb, Yb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # draw some independent samples from the model
            samp_count = 10
            samp_reps = 3
            x_in = Xva[:samp_count, :].repeat(samp_reps, axis=0)
            y_in = Yva[:samp_count, :].repeat(samp_reps, axis=0)
            x_samps, y_samps = model.sample_model(x_in,
                                                  y_in,
                                                  sample_source='p')
            # visualize sample prediction trajectories
            img_seq = seq_img_join(x_samps,
                                   y_samps,
                                   im_dim=(28, 28),
                                   transposed=True)
            seq_len = len(img_seq)
            samp_count = img_seq[0].shape[0]
            seq_samps = np.zeros((seq_len * samp_count, img_seq[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    if use_binary:
                        seq_samps[idx] = binarize_data(img_seq[s2][s1])
                    else:
                        seq_samps[idx] = img_seq[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
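img_split is used above but not shown; as a rough stand-in that ignores the transposed handling of the real helper, the split of each 28x28 image into observed and predicted column blocks can be illustrated with plain numpy:

import numpy as np

def split_columns(X_flat, im_dim=(28, 28), split_col=14):
    """Illustrative approximation of img_split; not the original helper."""
    rows, cols = im_dim
    imgs = X_flat.reshape((-1, rows, cols))
    obs = imgs[:, :, :split_col].reshape((imgs.shape[0], -1))   # x_dim = split_col * rows
    pred = imgs[:, :, split_col:].reshape((imgs.shape[0], -1))  # y_dim = (cols - split_col) * rows
    return obs, pred

X_demo = np.random.rand(4, 28 * 28)
Xo, Yp = split_columns(X_demo, split_col=14)
print(Xo.shape, Yp.shape)  # (4, 392) (4, 392)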
Example #13
def test_lstm_structpred(step_type='add', use_pol=True, use_binary=False):
    ###########################################
    # Make a tag for identifying result files #
    ###########################################
    pol_tag = "P1" if use_pol else "P0"
    bin_tag = "B1" if use_binary else "B0"
    res_tag = "STRUCT_PRED_RESULTS/SP_LSTM_{}_{}_{}".format(step_type, pol_tag, bin_tag)

    if use_binary:
        ############################
        # Get binary training data #
        ############################
        rng = np.random.RandomState(1234)
        Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
        #Xtr = np.vstack((Xtr, Xva))
        #Xva = Xte
    else:
        ################################
        # Get continuous training data #
        ################################
        rng = np.random.RandomState(1234)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, as_shared=False, zero_mean=False)
        Xtr = datasets[0][0]
        Xva = datasets[1][0]
        Xte = datasets[2][0]
        #Xtr = np.concatenate((Xtr, Xva), axis=0)
        #Xva = Xte
        Xtr = to_fX(shift_and_scale_into_01(Xtr))
        Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200


    ########################################################
    # Split data into "observation" and "prediction" parts #
    ########################################################
    obs_cols = 14             # number of columns to observe
    pred_cols = 28 - obs_cols # number of columns to predict
    x_dim = obs_cols * 28     # dimensionality of observations
    y_dim = pred_cols * 28    # dimensionality of predictions
    Xtr, Ytr = img_split(Xtr, im_dim=(28, 28), split_col=obs_cols, transposed=True)
    Xva, Yva = img_split(Xva, im_dim=(28, 28), split_col=obs_cols, transposed=True)

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    read_dim = 128
    write_dim = 128
    mlp_dim = 128
    rnn_dim = 128
    z_dim = 64
    n_iter = 15

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup reader/writer models
    reader_mlp = MLP([Rectifier(), Tanh()], [x_dim, mlp_dim, read_dim],
                     name="reader_mlp", **inits)
    writer_mlp = MLP([Rectifier(), None], [rnn_dim, mlp_dim, y_dim],
                     name="writer_mlp", **inits)

    # setup submodels for processing LSTM inputs
    pol_inp_dim = y_dim + read_dim + rnn_dim
    var_inp_dim = y_dim + y_dim + read_dim + rnn_dim
    pol_mlp_in = MLP([Identity()], [pol_inp_dim, 4*rnn_dim],
                     name="pol_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [var_inp_dim, 4*rnn_dim],
                     name="var_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4*rnn_dim],
                     name="dec_mlp_in", **inits)

    # setup submodels for turning LSTM states into conditionals over z
    pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)
    dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits)

    # setup the LSTMs for primary policy, guide policy, and shared dynamics
    pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="pol_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    model = IRStructPredModel(
                n_iter,
                step_type=step_type,
                use_pol=use_pol,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                pol_mlp_in=pol_mlp_in,
                pol_mlp_out=pol_mlp_out,
                pol_rnn=pol_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_mlp_out=dec_mlp_out,
                dec_rnn=dec_rnn)
    model.initialize()

    compile_start_time = time.time()

    # build the cost gradients, training function, samplers, etc.
    model.build_sampling_funcs()
    print("Testing model sampler...")
    # draw some independent samples from the model
    samp_count = 10
    samp_reps = 3
    x_in = Xtr[:10,:].repeat(samp_reps, axis=0)
    y_in = Ytr[:10,:].repeat(samp_reps, axis=0)
    x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
    # TODO: visualize sample prediction trajectories
    img_seq = seq_img_join(x_samps, y_samps, im_dim=(28,28), transposed=True)
    seq_len = len(img_seq)
    samp_count = img_seq[0].shape[0]
    seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1]))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = img_seq[s2][s1]
            idx += 1
    file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, 0)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)

    model.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(res_tag), 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(300000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        model.set_sgd_params(lr=scale*learn_rate, mom_1=scale*momentum, mom_2=0.98)
        model.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.1)
        model.set_grad_noise(grad_noise=0.02)
        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        Yb = to_fX(Ytr.take(batch_idx, axis=0))
        result = model.train_joint(Xb, Yb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]

        # diagnostics
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            model.save_model_params("{}_params.pkl".format(res_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva, Yva = row_shuffle(Xva, Yva)
            Xb = to_fX(Xva[:5000])
            Yb = to_fX(Yva[:5000])
            va_costs = model.compute_nll_bound(Xb, Yb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # draw some independent samples from the model
            samp_count = 10
            samp_reps = 3
            x_in = Xva[:samp_count,:].repeat(samp_reps, axis=0)
            y_in = Yva[:samp_count,:].repeat(samp_reps, axis=0)
            x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
            # visualize sample prediction trajectories
            img_seq = seq_img_join(x_samps, y_samps, im_dim=(28,28), transposed=True)
            seq_len = len(img_seq)
            samp_count = img_seq[0].shape[0]
            seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    if use_binary:
                        seq_samps[idx] = binarize_data(img_seq[s2][s1])
                    else:
                        seq_samps[idx] = img_seq[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
Example #14
def test_mnist_results(step_type='add',
                       imp_steps=6,
                       occ_dim=15,
                       drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim,
                                                      dp_int, imp_steps,
                                                      step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], )))

    # Load parameters from a previously trained model
    print("Testing model load from file...")
    GPSI = load_gpsimputer_from_file(f_name="{}_PARAMS.pkl".format(result_tag), \
                                     rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_FINAL_RESULTS_NEW.txt".format(result_tag)
    out_file = open(log_name, 'wb')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    str0 = "GUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    vfe = np.mean(nll) + np.mean(kld)
    str1 = "    va_nll_bound : {}".format(vfe)
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
    # record an estimate of performance on the test set
    str0 = "UNGUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    str1 = "    va_nll_bound : {}".format(np.mean(nll))
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
Example #15
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim,
                                                      dp_int, imp_steps,
                                                      step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], )))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    z_dim = 100
    init_scale = 1.0
    use_bn = False

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = \
    [ {'layer_type': 'fc',
       'in_chans': x_dim,
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn}, \
      {'layer_type': 'fc',
       'in_chans': 800,
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn} ]
    out_layer = {
        'layer_type': 'fc',
        'in_chans': 800,
        'out_chans': z_dim,
        'activation': relu_actfun,
        'apply_bn': False
    }
    output_config = [out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.0)
    ###################
    # p_sip1_given_zi #
    ###################
    params = {}
    shared_config = \
    [ {'layer_type': 'fc',
       'in_chans': z_dim,
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn}, \
      {'layer_type': 'fc',
       'in_chans': 800,
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn} ]
    out_layer = {
        'layer_type': 'fc',
        'in_chans': 800,
        'out_chans': x_dim,
        'activation': relu_actfun,
        'apply_bn': False
    }
    output_config = [out_layer, out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_sip1_given_zi.init_biases(0.0)

    #################
    # q_zi_given_xi #
    #################
    params = {}
    shared_config = \
    [ {'layer_type': 'fc',
       'in_chans': (x_dim+x_dim),
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn}, \
      {'layer_type': 'fc',
       'in_chans': 800,
       'out_chans': 800,
       'activation': relu_actfun,
       'apply_bn': use_bn} ]
    out_layer = {
        'layer_type': 'fc',
        'in_chans': 800,
        'out_chans': z_dim,
        'activation': relu_actfun,
        'apply_bn': False
    }
    output_config = [out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    q_zi_given_xi.init_biases(0.0)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['x_dim'] = x_dim
    gpsi_params['z_dim'] = z_dim
    # switch between direct construction and construction via p_x_given_si
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng,
                      x_in=x_in_sym,
                      x_out=x_out_sym,
                      x_mask=x_mask_sym,
                      p_zi_given_xi=p_zi_given_xi,
                      p_sip1_given_zi=p_sip1_given_zi,
                      q_zi_given_xi=q_zi_given_xi,
                      params=gpsi_params,
                      shared_param_dicts=None)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.90
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i + 1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate, \
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0)
        GPSI.set_lam_l2w(1e-5)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX(Xtr.take(batch_idx, axis=0))
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                        occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result) - 1)]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
        if ((i % 2000) == 0):
            GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
            # Get some validation samples for evaluating model performance
            xb = to_fX(Xva[0:100])
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi,
                                                 xo,
                                                 xm,
                                                 use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros(
                (seq_len * samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
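The checkpoint block above writes one PNG in which each group of seq_len rows traces a single sample's imputation trajectory: the nested loops place model_samps[s2][s1] at row s1*seq_len + s2. A small sketch of an equivalent vectorized formulation, assuming model_samps is a list of length seq_len whose entries are arrays of shape (samp_count, x_dim):

import numpy as np

def interleave_step_samples(model_samps):
    # rows ordered as (sample 0: step 0..T-1), (sample 1: step 0..T-1), ...
    stacked = np.stack(model_samps, axis=1)         # (samp_count, seq_len, x_dim)
    return stacked.reshape(-1, stacked.shape[-1])   # (samp_count*seq_len, x_dim)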
Example #16
def test_dA(learning_rate=0.1,
            training_epochs=30,
            dataset='./data/mnist.pkl.gz',
            batch_size=25,
            output_folder='dA_plots'):
    """
    Train denoising autoencoders on MNIST: first with no input corruption,
    then with 30% corruption, saving filter visualizations after each epoch.
    """
    datasets = load_udm(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=28 * 28,
            n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(inputs=[index],
                               outputs=[cost],
                               updates=updates,
                               givens={
                                   x:
                                   train_set_x[index * batch_size:(index + 1) *
                                               batch_size, :]
                               })

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()

        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

        image = PIL.Image.fromarray(
            tile_raster_images(X=da.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_corruption_00.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=28 * 28,
            n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()

        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

        image = PIL.Image.fromarray(
            tile_raster_images(X=da.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_corruption_30.png')

    os.chdir('../')
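test_dA above is written for Python 2 (print statements, xrange, time.clock). A minimal Python 3 sketch of the same per-epoch loop, assuming train_da, n_train_batches, and training_epochs are defined as above; time.perf_counter stands in for the removed time.clock:

import time
import numpy as np

def run_epochs(train_da, n_train_batches, training_epochs):
    for epoch in range(training_epochs):
        t1 = time.perf_counter()
        # average the per-minibatch reconstruction costs over one pass
        costs = [float(np.mean(train_da(b))) for b in range(int(n_train_batches))]
        t2 = time.perf_counter()
        print("Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format(
            epoch, float(np.mean(costs)), (t2 - t1)))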
def test_mnist(step_type='add',
               occ_dim=15,
               drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) )

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 100
    imp_steps = 5
    init_scale = 1.0

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = [obs_dim, 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.2)
    ###################
    # p_xip1_given_zi #
    ###################
    params = {}
    shared_config = [z_dim, 1000, 1000]
    output_config = [obs_dim, obs_dim]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_xip1_given_zi.init_biases(0.2)
    ###################
    # q_zi_given_x_xi #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    q_zi_given_x_xi.init_biases(0.2)


    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['obs_dim'] = obs_dim
    gpsi_params['z_dim'] = z_dim
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng, 
            x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \
            p_zi_given_xi=p_zi_given_xi, \
            p_xip1_given_zi=p_xip1_given_zi, \
            q_zi_given_x_xi=q_zi_given_x_xi, \
            params=gpsi_params, \
            shared_param_dicts=None)

    # # test model saving
    # print("Testing model save to file...")
    # GPSI.save_to_file("AAA_GPSI_SAVE_TEST.pkl")
    # # test model loading
    # print("Testing model load from file...")
    # GPSI = load_gpsimputer_from_file(f_name="AAA_GPSI_SAVE_TEST.pkl", rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate, \
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9)
        GPSI.set_lam_l2w(1e-4)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX( Xtr.take(batch_idx, axis=0) )
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                        occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the validation set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
        if ((i % 5000) == 0):
            GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
            # Get some validation samples for evaluating model performance
            xb = to_fX( Xva[0:100] )
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # get visualizations of policy parameters
            file_name = "{0:s}_gen_gen_weights_b{1:d}.png".format(result_tag, i)
            W = GPSI.gen_gen_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "{0:s}_gen_inf_weights_b{1:d}.png".format(result_tag, i)
            W = GPSI.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
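These tests lean on a few small data helpers (shift_and_scale_into_01, to_fX, row_shuffle) whose implementations are not shown here. Minimal sketches of what they are assumed to do, not the library code itself: rescale an array into [0, 1], cast to Theano's configured float type, and permute rows for an epoch shuffle:

import numpy as np
import theano

def shift_and_scale_into_01(X):
    # assumed behavior: min/max rescaling into the unit interval
    X = X - np.min(X)
    return X / np.maximum(np.max(X), 1e-8)

def to_fX(X):
    # assumed behavior: cast to the configured Theano float type
    return np.asarray(X, dtype=theano.config.floatX)

def row_shuffle(X):
    # assumed behavior: return the rows of X in a random order
    return X[np.random.permutation(X.shape[0])]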
Example #18
def test_imocld_imp_mnist(step_type='add',
                          occ_dim=14,
                          drop_prob=0.0,
                          attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 25
    dp_int = int(100.0 * drop_prob)

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    att_tag = "NA"  # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2 * x_dim  # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)

    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim)], \
                      name="mix_dec_mlp", **inits)
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [               z_dim, 4*dec_dim], \
                     name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # LSTM modules for the variational, encoder, and decoder recurrences
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
        n_iter,
        step_type=step_type,  # step_type can be 'add' or 'jump'
        reader_mlp=reader_mlp,
        writer_mlp=writer_mlp,
        mix_enc_mlp=mix_enc_mlp,
        mix_dec_mlp=mix_dec_mlp,
        mix_var_mlp=mix_var_mlp,
        enc_mlp_in=enc_mlp_in,
        enc_mlp_out=enc_mlp_out,
        enc_rnn=enc_rnn,
        dec_mlp_in=dec_mlp_in,
        dec_rnn=dec_rnn,
        var_mlp_in=var_mlp_in,
        var_mlp_out=var_mlp_out,
        var_rnn=var_rnn)
    draw.initialize()

    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    #draw.load_model_params(f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open(
        "TBCLM_IMP_MNIST_RESULTS_OD{}_DP{}_{}_{}.txt".format(
            occ_dim, dp_int, step_type, att_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i + 1) / 1000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        zero_ary = np.zeros((1, ))
        draw.lr.set_value(to_fX(zero_ary + learn_rate))
        draw.mom_1.set_value(to_fX(zero_ary + momentum))
        draw.mom_2.set_value(to_fX(zero_ary + 0.99))

        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=None)
        result = draw.train_joint(Xb, Mb)

        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 200) == 0):
            costs = [(v / 200.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            draw.save_model_params(
                "TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(
                    occ_dim, dp_int, step_type, att_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva = row_shuffle(Xva)
            Xb = to_fX(Xva[:5000])
            _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=None)
            va_costs = draw.compute_nll_bound(Xb, Mb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # draw some independent samples from the model
            Xb = to_fX(Xva[:100])
            _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=None)
            samples, _ = draw.do_sample(Xb, Mb)
            n_iter, N, D = samples.shape
            samples = samples.reshape((n_iter, N, 28, 28))
            for j in xrange(n_iter):
                img = img_grid(samples[j, :, :, :])
                img.save(
                    "TBCLM-IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}-samples-{3:03d}.png"
                    .format(occ_dim, dp_int, step_type, j))
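Every training loop in these examples cycles minibatch indices the same way: batch_idx starts one full pass beyond the data so the very first iteration triggers a shuffle, and whenever the index block runs off the end of the data the rows are reshuffled and the block resets. A minimal sketch of that pattern as a generator, assuming batch_size <= n_samples:

import numpy as np

def minibatch_indices(n_samples, batch_size, rng=np.random):
    batch_idx = np.arange(batch_size) + n_samples  # out of range => shuffle first
    perm = np.arange(n_samples)
    while True:
        batch_idx += batch_size
        if np.max(batch_idx) >= n_samples:
            # finished an "epoch", so reshuffle the training order
            perm = rng.permutation(n_samples)
            batch_idx = np.arange(batch_size)
        yield perm[batch_idx]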
def pretrain_osm(lam_kld=0.0):
    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape)))

    # get and set some basic dataset information
    Xtr_mean = np.mean(Xtr, axis=0)
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 100
    batch_reps = 5

    # setup some symbolic variables and stuff
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_sigma = 1.0

    ##########################
    # NETWORK CONFIGURATIONS #
    ##########################
    gn_params = {}
    shared_config = [PRIOR_DIM, 1000, 1000]
    top_config = [shared_config[-1], data_dim]
    gn_params['shared_config'] = shared_config
    gn_params['mu_config'] = top_config
    gn_params['sigma_config'] = top_config
    gn_params['activation'] = relu_actfun
    gn_params['init_scale'] = 1.4
    gn_params['lam_l2a'] = 0.0
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.0
    gn_params['input_noise'] = 0.0
    # choose some parameters for the continuous inferencer
    in_params = {}
    shared_config = [data_dim, 1000, 1000]
    top_config = [shared_config[-1], PRIOR_DIM]
    in_params['shared_config'] = shared_config
    in_params['mu_config'] = top_config
    in_params['sigma_config'] = top_config
    in_params['activation'] = relu_actfun
    in_params['init_scale'] = 1.4
    in_params['lam_l2a'] = 0.0
    in_params['vis_drop'] = 0.0
    in_params['hid_drop'] = 0.0
    in_params['bias_noise'] = 0.0
    in_params['input_noise'] = 0.0
    # Initialize the base networks for this OneStageModel
    IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \
            params=in_params, shared_param_dicts=None)
    GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \
            params=gn_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN.init_biases(0.2)
    GN.init_biases(0.2)

    #########################
    # INITIALIZE THE GIPAIR #
    #########################
    osm_params = {}
    osm_params['x_type'] = 'bernoulli'
    osm_params['xt_transform'] = 'sigmoid'
    osm_params['logvar_bound'] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \
            p_x_given_z=GN, q_z_given_x=IN, \
            x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params)
    OSM.set_lam_l2w(1e-5)
    safe_mean = (0.9 * Xtr_mean) + 0.05
    safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean))
    OSM.set_output_bias(safe_mean_logit)
    OSM.set_input_bias(-Xtr_mean)

    ######################
    # BASIC VAE TRAINING #
    ######################
    out_file = open(RESULT_PATH+"pt_osm_results.txt", 'wb')
    # Set initial learning rate and basic SGD hyper parameters
    obs_costs = np.zeros((batch_size,))
    costs = [0. for i in range(10)]
    learn_rate = 0.0005
    for i in range(150000):
        scale = min(1.0, float(i) / 10000.0)
        if ((i > 1) and ((i % 20000) == 0)):
            learn_rate = learn_rate * 0.9
        # do a minibatch update of the model, and compute some costs
        tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # do a minibatch update of the model, and compute some costs
        OSM.set_sgd_params(lr_1=(scale*learn_rate), mom_1=0.5, mom_2=0.98)
        OSM.set_lam_nll(1.0)
        OSM.set_lam_kld(lam_kld_1=(1.0 + (scale*(lam_kld-1.0))), lam_kld_2=0.0)
        result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 1000) == 0):
            # record and then reset the cost trackers
            costs = [(v / 1000.0) for v in costs]
            str_1 = "-- batch {0:d} --".format(i)
            str_2 = "    joint_cost: {0:.4f}".format(costs[0])
            str_3 = "    nll_cost  : {0:.4f}".format(costs[1])
            str_4 = "    kld_cost  : {0:.4f}".format(costs[2])
            str_5 = "    reg_cost  : {0:.4f}".format(costs[3])
            costs = [0.0 for v in costs]
            # print out some diagnostic information
            joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
        if ((i % 2000) == 0):
            Xva = row_shuffle(Xva)
            model_samps = OSM.sample_from_prior(500)
            file_name = RESULT_PATH+"pt_osm_samples_b{0:d}_XG.png".format(i)
            utils.visualize_samples(model_samps, file_name, num_rows=20)
            # compute information about free-energy on validation set
            file_name = RESULT_PATH+"pt_osm_free_energy_b{0:d}.png".format(i)
            fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            fe_str = "    nll_bound : {0:.4f}".format(fe_mean)
            print(fe_str)
            out_file.write(fe_str+"\n")
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
                    x_label='Posterior KLd', y_label='Negative Log-likelihood')
            # compute information about posterior KLds on validation set
            file_name = RESULT_PATH+"pt_osm_post_klds_b{0:d}.png".format(i)
            post_klds = OSM.compute_post_klds(Xva[0:2500])
            post_dim_klds = np.mean(post_klds, axis=0)
            utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \
                    file_name)
        if ((i % 5000) == 0):
            IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_GN.pkl".format(i))
    IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_IN.pkl")
    GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_GN.pkl")
    return
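The output-bias initialization in pretrain_osm squashes the empirical pixel means from [0, 1] into [0.05, 0.95] before taking logits, so pixels that are always 0 or always 1 in the training data still give a finite bias. A quick numeric check of that mapping (the values here are illustrative, not taken from the MNIST data):

import numpy as np

pix_means = np.array([0.0, 0.1, 0.5, 1.0])
safe_mean = (0.9 * pix_means) + 0.05                  # in [0.05, 0.95]
safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean))
print(safe_mean_logit)                                # approx [-2.944, -1.815, 0.0, 2.944]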
Example #20
def test_seq_cond_gen_sequence(step_type='add'):
    ##############################
    # File tag, for output stuff #
    ##############################
    result_tag = "{}BBB_SCG".format(RESULT_PATH)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    # get training/validation/test images
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    Xte = to_fX(shift_and_scale_into_01(Xte))
    obs_dim = Xtr.shape[1]
    # get label representations
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    step_reps = 3

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    total_steps = step_reps * 28
    init_steps = step_reps
    exit_rate = 0.0
    nll_weight = 1.0 / step_reps
    x_dim = 28
    y_dim = 28
    z_dim = 100
    rnn_dim = 300
    write_dim = 250
    mlp_dim = 250

    def visualize_attention(sampler_result, pre_tag="AAA", post_tag="AAA"):
        # get generated predictions
        seq_len = sampler_result[0].shape[0]
        samp_count = sampler_result[0].shape[1]
        x_dim = sampler_result[0].shape[2]
        seq_samps = np.zeros((samp_count, 28*28))
        for samp in range(samp_count):
            step = 0
            samp_vals = np.zeros((28,28))
            for col in range(28):
                col_vals = np.zeros((28,))
                for rep in range(step_reps):
                    if (rep == (step_reps-1)):
                        col_vals = sampler_result[0][step,samp,:]
                    step += 1
                samp_vals[:,col] = col_vals
            seq_samps[samp,:] = samp_vals.ravel()
        file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=10)
        # get sequential attention maps
        seq_samps = np.zeros((samp_count, 28*28))
        for samp in range(samp_count):
            step = 0
            samp_vals = np.zeros((28,28))
            for col in range(28):
                col_vals = np.zeros((28,))
                for rep in range(step_reps):
                    col_vals = col_vals + sampler_result[1][step,samp,:x_dim]
                    col_vals = col_vals + sampler_result[1][step,samp,x_dim:]
                    step += 1
                samp_vals[:,col] = col_vals / (2.0*step_reps)
            seq_samps[samp,:] = samp_vals.ravel()
        file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=10)
        # get sequential attention maps (read out values)
        seq_samps = np.zeros((samp_count, 28*28))
        for samp in range(samp_count):
            step = 0
            samp_vals = np.zeros((28,28))
            for col in range(28):
                col_vals = np.zeros((28,))
                for rep in range(step_reps):
                    col_vals = col_vals + sampler_result[2][step,samp,:x_dim]
                    col_vals = col_vals + sampler_result[2][step,samp,x_dim:]
                    step += 1
                samp_vals[:,col] = col_vals / (2.0*step_reps)
            seq_samps[samp,:] = samp_vals.ravel()
        file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=10)
        return

    def batch_reshape(Xb, reps=step_reps):
        # reshape for stuff
        bs = Xb.shape[0]
        xb = Xb.reshape((bs, 28, 28)).swapaxes(0,2).swapaxes(1,2)
        xb = xb.repeat(reps, axis=0)
        return xb

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    read_N = 2 # inner/outer grid dimension for reader
    read_dim = 2*read_N   # total number of "pixels" read by reader
    reader_mlp = SimpleAttentionReader1d(x_dim=x_dim, con_dim=rnn_dim,
                                         N=read_N, init_scale=2.0, **inits)

    writer_mlp = MLP([None, None], [rnn_dim, write_dim, y_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], [                       z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [(y_dim + read_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], [        (read_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)

    # mlps for turning LSTM outputs into conditionals over z_gen
    con_mlp_out = CondNet([], [rnn_dim, z_dim], name="con_mlp_out", **inits)
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)

    # LSTM modules for the controller, generator, and variational recurrences
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)

    SCG = SeqCondGen(
                x_and_y_are_seqs=True, # this test uses sequential x/y
                total_steps=total_steps,
                init_steps=init_steps,
                exit_rate=exit_rate,
                nll_weight=nll_weight,
                step_type=step_type,
                x_dim=x_dim,
                y_dim=y_dim,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                con_mlp_in=con_mlp_in,
                con_rnn=con_rnn,
                gen_mlp_in=gen_mlp_in,
                gen_mlp_out=gen_mlp_out,
                gen_rnn=gen_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    SCG.initialize()

    compile_start_time = time.time()

    # build the cost gradients, training function, samplers, etc.
    SCG.build_attention_funcs()

    ###########################################
    # Sample and draw attention trajectories. #
    ###########################################
    samp_count = 100
    Xb = Xva[:samp_count,:]
    Xb = batch_reshape(Xb, reps=step_reps)
    print("Xb.shape: {}".format(Xb.shape))
    result = SCG.sample_attention(Xb, Xb)
    visualize_attention(result, pre_tag=result_tag, post_tag="b0")
    print("TESTED SAMPLER!")
    Xva = row_shuffle(Xva)
    Xb = Xva[:500]
    Xb = batch_reshape(Xb, reps=step_reps)
    va_costs = SCG.simple_nll_bound(Xb, Xb)
    print("nll_bound : {}".format(va_costs[0]))
    print("nll_term  : {}".format(va_costs[1]))
    print("kld_q2p   : {}".format(va_costs[2]))
    print("TESTED NLL BOUND!")

    SCG.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    #SCG.load_model_params(f_name="SCG_params.pkl")

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(result_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.75
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 2500.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 10000):
            momentum = 0.95
        else:
            momentum = 0.75
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        SCG.set_sgd_params(lr=learn_rate, mom_1=momentum, mom_2=0.99)
        SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, lam_kld_p2g=0.0)
        # perform a minibatch update and record the cost for this batch
        Xb = Xtr.take(batch_idx, axis=0)
        Xb = batch_reshape(Xb, reps=step_reps)
        result = SCG.train_joint(Xb, Xb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]

        # output diagnostic information and checkpoint parameters, etc.
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    kld_p2g   : {0:.4f}".format(costs[5])
            str8 = "    reg_term  : {0:.4f}".format(costs[6])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 500) == 0): #((i % 1000) == 0):
            SCG.save_model_params("{}_params.pkl".format(result_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva = row_shuffle(Xva)
            Xb = Xva[:500]
            Xb = batch_reshape(Xb, reps=step_reps)
            va_costs = SCG.compute_nll_bound(Xb, Xb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            ###########################################
            # Sample and draw attention trajectories. #
            ###########################################
            post_tag = "b{}".format(i)
            Xb = Xva[:100,:]
            Xb = batch_reshape(Xb, reps=step_reps)
            result = SCG.sample_attention(Xb, Xb)
            visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
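batch_reshape above turns each flat (784,) image into a column-by-column sequence: the result has shape (28*step_reps, batch, 28), and each image column appears step_reps times in a row before the next column. A quick shape check of that transformation, using the same reshape/swapaxes/repeat steps:

import numpy as np

bs, step_reps = 4, 3
Xb = np.arange(bs * 784, dtype=float).reshape((bs, 784))
xb = Xb.reshape((bs, 28, 28)).swapaxes(0, 2).swapaxes(1, 2)  # (28, bs, 28)
xb = xb.repeat(step_reps, axis=0)                            # (28*step_reps, bs, 28)
assert xb.shape == (28 * step_reps, bs, 28)
# consecutive repeats of the same column are identical
assert np.allclose(xb[0], xb[step_reps - 1])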
def test_mnist(step_type='add',
               imp_steps=6,
               occ_dim=15,
               drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_conv_bn_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) )

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    z_dim = 100
    init_scale = 1.0
    use_bn = True

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = \
    [ {'layer_type': 'conv',
       'in_chans': 1,   # in shape:  (batch, 784)
       'out_chans': 64, # out shape: (batch, 64, 14, 14)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'double',
       'apply_bn': use_bn,
       'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \
      {'layer_type': 'conv',
       'in_chans': 64,   # in shape:  (batch, 64, 14, 14)
       'out_chans': 128, # out shape: (batch, 128, 7, 7)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'double',
       'apply_bn': use_bn,
       'shape_func_out': lambda x: T.flatten(x, 2)}, \
      {'layer_type': 'fc',
       'in_chans': 128*7*7,
       'out_chans': 256,
       'activation': relu_actfun,
       'apply_bn': use_bn} ]
    output_config = \
    [ {'layer_type': 'fc',
       'in_chans': 256,
       'out_chans': z_dim,
       'activation': relu_actfun,
       'apply_bn': False}, \
      {'layer_type': 'fc',
       'in_chans': 256,
       'out_chans': z_dim,
       'activation': relu_actfun,
       'apply_bn': False} ]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = init_scale
    params['build_theano_funcs'] = False
    p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.0)
    ###################
    # p_sip1_given_zi #
    ###################
    params = {}
    shared_config = \
    [ {'layer_type': 'fc',
       'in_chans': z_dim,
       'out_chans': 256,
       'activation': relu_actfun,
       'apply_bn': use_bn}, \
      {'layer_type': 'fc',
       'in_chans': 256,
       'out_chans': 7*7*128,
       'activation': relu_actfun,
       'apply_bn': use_bn,
       'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \
      {'layer_type': 'conv',
       'in_chans': 128, # in shape:  (batch, 128, 7, 7)
       'out_chans': 64, # out shape: (batch, 64, 14, 14)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'half',
       'apply_bn': use_bn} ]
    output_config = \
    [ {'layer_type': 'conv',
       'in_chans': 64, # in shape:  (batch, 64, 14, 14)
       'out_chans': 1, # out shape: (batch, 1, 28, 28)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'half',
       'apply_bn': False,
       'shape_func_out': lambda x: T.flatten(x, 2)}, \
      {'layer_type': 'conv',
       'in_chans': 64,
       'out_chans': 1,
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'half',
       'apply_bn': False,
       'shape_func_out': lambda x: T.flatten(x, 2)}, \
      {'layer_type': 'conv',
       'in_chans': 64,
       'out_chans': 1,
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'half',
       'apply_bn': False,
       'shape_func_out': lambda x: T.flatten(x, 2)} ]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = init_scale
    params['build_theano_funcs'] = False
    p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_sip1_given_zi.init_biases(0.0)

    #################
    # q_zi_given_xi #
    #################
    params = {}
    shared_config = \
    [ {'layer_type': 'conv',
       'in_chans': 2,   # in shape:  (batch, 784+784)
       'out_chans': 64, # out shape: (batch, 64, 14, 14)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'double',
       'apply_bn': use_bn,
       'shape_func_in': lambda x: T.reshape(x, (-1, 2, 28, 28))}, \
      {'layer_type': 'conv',
       'in_chans': 64,   # in shape:  (batch, 64, 14, 14)
       'out_chans': 128, # out shape: (batch, 128, 7, 7)
       'activation': relu_actfun,
       'filt_dim': 5,
       'conv_stride': 'double',
       'apply_bn': use_bn,
       'shape_func_out': lambda x: T.flatten(x, 2)}, \
      {'layer_type': 'fc',
       'in_chans': 128*7*7,
       'out_chans': 256,
       'activation': relu_actfun,
       'apply_bn': use_bn} ]
    output_config = \
    [ {'layer_type': 'fc',
       'in_chans': 256,
       'out_chans': z_dim,
       'activation': relu_actfun,
       'apply_bn': False}, \
      {'layer_type': 'fc',
       'in_chans': 256,
       'out_chans': z_dim,
       'activation': relu_actfun,
       'apply_bn': False} ]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = init_scale
    params['build_theano_funcs'] = False
    q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    q_zi_given_xi.init_biases(0.0)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['x_dim'] = x_dim
    gpsi_params['z_dim'] = z_dim
    # switch between direct construction and construction via p_x_given_si
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng,
            x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym,
            p_zi_given_xi=p_zi_given_xi,
            p_sip1_given_zi=p_sip1_given_zi,
            q_zi_given_xi=q_zi_given_xi,
            params=gpsi_params,
            shared_param_dicts=None)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.90
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate, \
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0)
        GPSI.set_lam_l2w(1e-5)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX( Xtr.take(batch_idx, axis=0) )
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                        occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
        if ((i % 2000) == 0):
            #GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
            # Get some validation samples for evaluating model performance
            xb = to_fX( Xva[0:100] )
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
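The training loop above leans on a few helpers that are not part of this listing, most importantly construct_masked_data, which builds the occluded inputs for the imputation task. The sketch below is a hypothetical stand-in (numpy is assumed imported as np, as in the snippets), assuming the helper returns the occluded input xi with missing pixels filled by data_mean, the original observation xo, and a binary mask xm with 1 marking observed pixels; the helper in the source repository may differ in details such as mask polarity or patch placement.

def construct_masked_data_sketch(x, drop_prob=0.0, occ_dim=None, data_mean=None, rng=None):
    """Hypothetical stand-in for construct_masked_data (illustration only)."""
    rng = np.random.RandomState() if rng is None else rng
    fill = 0.0 if data_mean is None else data_mean
    xo = x.copy()                                            # original observations
    xm = (rng.rand(*x.shape) > drop_prob).astype(x.dtype)    # 1 = pixel is observed
    if (occ_dim is not None) and (occ_dim > 0):
        im_dim = int(np.sqrt(x.shape[1]))                    # assume square images
        for r in range(x.shape[0]):
            row = rng.randint(0, im_dim - occ_dim + 1)
            col = rng.randint(0, im_dim - occ_dim + 1)
            m = xm[r].reshape(im_dim, im_dim)
            m[row:row+occ_dim, col:col+occ_dim] = 0.0        # occlude one square patch
            xm[r] = m.ravel()
    xi = (xm * xo) + ((1.0 - xm) * fill)                     # fill unobserved pixels
    return xi, xo, xm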
Example #22
def test_seq_cond_gen_static(step_type='add'):
    ##############################
    # File tag, for output stuff #
    ##############################
    result_tag = "{}AAA_SCG".format(RESULT_PATH)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    # get training/validation/test images
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    Xte = to_fX(shift_and_scale_into_01(Xte))
    obs_dim = Xtr.shape[1]
    # get label representations
    y_reps = 10
    Ytr = one_hot_np(datasets[0][1]-1, cat_dim=10).repeat(y_reps, axis=1)
    Yva = one_hot_np(datasets[1][1]-1, cat_dim=10).repeat(y_reps, axis=1)
    Yte = one_hot_np(datasets[2][1]-1, cat_dim=10).repeat(y_reps, axis=1)
    label_dim = Ytr.shape[1]
    # merge image and label representations
    print("Xtr.shape: {}".format(Xtr.shape))
    print("Ytr.shape: {}".format(Ytr.shape))
    XYtr = to_fX( np.hstack( [Xtr, Ytr] ) )
    XYva = to_fX( np.hstack( [Xva, Yva] ) )
    tr_samples = XYtr.shape[0]
    va_samples = XYva.shape[0]
    batch_size = 200

    def split_xy(xy_ary):
        x_ary = xy_ary[:,:obs_dim]
        y_ary = xy_ary[:,obs_dim:]
        return x_ary, y_ary

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    total_steps = 10
    init_steps = 3
    exit_rate = 0.2
    x_dim = obs_dim
    y_dim = obs_dim + label_dim
    z_dim = 100
    rnn_dim = 400
    write_dim = 400
    mlp_dim = 400

    def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
        seq_len = result[0].shape[0]
        samp_count = result[0].shape[1]
        # get generated predictions
        x_samps = np.zeros((seq_len*samp_count, obs_dim))
        y_samps = np.zeros((seq_len*samp_count, label_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                x_samps[idx] = result[0][s2,s1,:obs_dim]
                y_samps[idx] = result[0][s2,s1,obs_dim:]
                # add ticks at the corners of label predictions, to make them
                # easier to parse visually.
                max_val = np.mean(result[0][s2,s1,obs_dim:])
                y_samps[idx][0] = max_val
                y_samps[idx][9] = max_val
                y_samps[idx][-1] = max_val
                y_samps[idx][-10] = max_val
                idx += 1
        file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(x_samps, file_name, num_rows=20)
        file_name = "{0:s}_traj_ys_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(y_samps, file_name, num_rows=20)
        # get sequential attention maps
        seq_samps = np.zeros((seq_len*samp_count, x_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[1][s2,s1,:x_dim] + result[1][s2,s1,x_dim:]
                idx += 1
        file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=20)
        # get sequential attention maps (read out values)
        seq_samps = np.zeros((seq_len*samp_count, x_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[2][s2,s1,:x_dim] + result[2][s2,s1,x_dim:]
                idx += 1
        file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=20)
        return

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    read_N = 2 # inner/outer grid dimension for reader
    reader_mlp = SimpleAttentionReader2d(x_dim=x_dim, con_dim=rnn_dim,
                                         width=28, height=28, N=read_N,
                                         img_scale=0.2, att_scale=0.5,
                                         **inits)
    read_dim = reader_mlp.read_dim # total number of "pixels" read by reader

    writer_mlp = MLP([None, None], [rnn_dim, write_dim, y_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], [                       z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [(y_dim + read_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], [        (read_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)

    # mlps for turning LSTM outputs into conditionals over z_gen
    con_mlp_out = CondNet([Rectifier(), Rectifier()], \
                          [rnn_dim, mlp_dim, mlp_dim, z_dim], \
                          name="con_mlp_out", **inits)
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)

    # the LSTMs themselves: controller, generator, and variational modules
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)

    SeqCondGen_doc_str = \
    """
    SeqCondGen -- constructs conditional densities under time constraints.

    This model sequentially constructs a conditional density estimate by taking
    repeated glimpses at the input x, and constructing a hypothesis about the
    output y. The objective is maximum likelihood for (x,y) pairs drawn from
    some training set. We learn a proper generative model, using variational
    inference -- which can be interpreted as a sort of guided policy search.

    The input pairs (x, y) can be either "static" or "sequential". In the
    static case, the same x and y are used at every step of the hypothesis
    construction loop. In the sequential case, x and y can change at each step
    of the loop.

    Parameters:
        x_and_y_are_seqs: boolean telling whether the conditioning information
                          and prediction targets are sequential.
        total_steps: total number of steps in sequential estimation process
        init_steps: number of steps prior to first NLL measurement
        exit_rate: probability of exiting following each non "init" step
                   **^^ THIS IS SET TO 0 WHEN USING SEQUENTIAL INPUT ^^**
        nll_weight: weight for the prediction NLL term at each step.
                   **^^ THIS IS IGNORED WHEN USING STATIC INPUT ^^**
        step_type: whether to use "additive" steps or "jump" steps
                   -- jump steps predict directly from the controller LSTM's
                      "hidden" state (a.k.a. its memory cells).
        x_dim: dimension of inputs on which to condition
        y_dim: dimension of outputs to predict
        reader_mlp: used for reading from the input
        writer_mlp: used for writing to the output prediction
        con_mlp_in: preprocesses input to the "controller" LSTM
        con_rnn: the "controller" LSTM
        con_mlp_out: CondNet for distribution over z given con_rnn
        gen_mlp_in: preprocesses input to the "generator" LSTM
        gen_rnn: the "generator" LSTM
        gen_mlp_out: CondNet for distribution over z given gen_rnn
        var_mlp_in: preprocesses input to the "variational" LSTM
        var_rnn: the "variational" LSTM
        var_mlp_out: CondNet for distribution over z given var_rnn
    """

    SCG = SeqCondGen(
                x_and_y_are_seqs=False, # this test doesn't use sequential x/y
                total_steps=total_steps,
                init_steps=init_steps,
                exit_rate=exit_rate,
                nll_weight=0.0, # ignored, because x_and_y_are_seqs == False
                step_type=step_type,
                x_dim=x_dim,
                y_dim=y_dim,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                con_mlp_in=con_mlp_in,
                con_mlp_out=con_mlp_out,
                con_rnn=con_rnn,
                gen_mlp_in=gen_mlp_in,
                gen_mlp_out=gen_mlp_out,
                gen_rnn=gen_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    SCG.initialize()

    compile_start_time = time.time()

    # build the attention trajectory sampler
    SCG.build_attention_funcs()

    # quick test of attention trajectory sampler
    samp_count = 100
    XYb = XYva[:samp_count,:]
    Xb, Yb = split_xy(XYb)
    #Xb = Xva[:samp_count]
    result = SCG.sample_attention(Xb, XYb)
    visualize_attention(result, pre_tag=result_tag, post_tag="b0")

    # build the main model functions (i.e. training and cost functions)
    SCG.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    #SCG.load_model_params(f_name="SCG_params.pkl")

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(result_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.8
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 2500.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 10000):
            momentum = 0.95
        else:
            momentum = 0.8
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            XYtr = row_shuffle(XYtr)
            #Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        SCG.set_sgd_params(lr=learn_rate, mom_1=momentum, mom_2=0.99)
        SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, lam_kld_p2g=0.1)
        # perform a minibatch update and record the cost for this batch
        XYb = XYtr.take(batch_idx, axis=0)
        Xb, Yb = split_xy(XYb)
        #Xb = Xtr.take(batch_idx, axis=0)
        result = SCG.train_joint(Xb, XYb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]

        # output diagnostic information and checkpoint parameters, etc.
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    kld_p2g   : {0:.4f}".format(costs[5])
            str8 = "    reg_term  : {0:.4f}".format(costs[6])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 500) == 0): #((i % 1000) == 0):
            SCG.save_model_params("{}_params.pkl".format(result_tag))
            # compute a small-sample estimate of NLL bound on validation set
            XYva = row_shuffle(XYva)
            XYb = XYva[:1000]
            Xb, Yb = split_xy(XYb)
            #Xva = row_shuffle(Xva)
            #Xb = Xva[:1000]
            va_costs = SCG.compute_nll_bound(Xb, XYb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            ###########################################
            # Sample and draw attention trajectories. #
            ###########################################
            samp_count = 100
            XYb = XYva[:samp_count,:]
            Xb, Yb = split_xy(XYb)
            #Xb = Xva[:samp_count]
            result = SCG.sample_attention(Xb, XYb)
            post_tag = "b{0:d}".format(i)
            visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
def test_oi_seq_cond_gen(attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    #Xtr = np.concatenate((Xtr, Xva), axis=0)
    #Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = 28
    outer_steps = 27
    inner_steps = 5
    rnn_dim = 128
    write_dim = 64
    mlp_dim = 128
    z_dim = 50

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup the reader and writer
    if attention:
        read_N = 3 # inner/outer grid dimension for reader
        reader_mlp = SimpleAttentionReader1d(x_dim=x_dim, con_dim=rnn_dim,
                                             N=read_N, init_scale=2.0, **inits)
        read_dim = reader_mlp.read_dim
        att_tag = "YA"
    else:
        read_dim = 2*x_dim
        reader_mlp = Reader(x_dim=x_dim, dec_dim=rnn_dim, **inits)
        att_tag = "NA"
    writer_mlp = MLP([None, None], [rnn_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], [                       z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], [        (read_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [(x_dim + read_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    mem_mlp_in = MLP([Identity()], [                   2*rnn_dim, 4*rnn_dim], \
                     name="mem_mlp_in", **inits)


    # mlps for turning LSTM outputs into conditionals over z_gen
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)
    mem_mlp_out = MLP([Identity()], [rnn_dim, 2*rnn_dim], \
                      name="mem_mlp_out", **inits)
    # the LSTMs themselves: controller, generator, variational, and memory modules
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    mem_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="mem_rnn", **rnninits)

    OISeqCondGen_doc_str = \
    """
    OISeqCondGen -- a model for predicting inputs, given previous inputs.

    For each input in a sequence, this model sequentially builds a prediction
    for the next input. Each of these predictions conditions directly on the
    previous input, and indirectly on even earlier inputs. Conditioning on the
    current input is either "fully informed" or "attention based". Conditioning
    on even earlier inputs is through state that is carried across predictions
    using, e.g., an LSTM.

    Parameters:
        obs_dim: dimension of inputs to observe and predict
        outer_steps: #predictions to make
        inner_steps: #steps when constructing each prediction
        reader_mlp: used for reading from the current input
        writer_mlp: used for writing to prediction of the next input
        con_mlp_in: preprocesses input to the "controller" LSTM
        con_rnn: the "controller" LSTM
        gen_mlp_in: preprocesses input to the "generator" LSTM
        gen_rnn: the "generator" LSTM
        gen_mlp_out: CondNet for distribution over z given gen_rnn
        var_mlp_in: preprocesses input to the "variational" LSTM
        var_rnn: the "variational" LSTM
        var_mlp_out: CondNet for distribution over z given var_rnn
        mem_mlp_in: preprocesses input to the "memory" LSTM
        mem_rnn: the "memory" LSTM (this stores inter-prediction state)
        mem_mlp_out: emits initial controller state for each prediction
    """

    IMS = OISeqCondGen(
                obs_dim=x_dim,
                outer_steps=outer_steps,
                inner_steps=inner_steps,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                con_mlp_in=con_mlp_in,
                con_rnn=con_rnn,
                gen_mlp_in=gen_mlp_in,
                gen_mlp_out=gen_mlp_out,
                gen_rnn=gen_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn,
                mem_mlp_in=mem_mlp_in,
                mem_mlp_out=mem_mlp_out,
                mem_rnn=mem_rnn
    )
    IMS.initialize()

    # build the cost gradients, training function, samplers, etc.
    IMS.build_model_funcs()

    #IMS.load_model_params(f_name="SRRM_params.pkl")

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("IMS_results.txt", 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.75
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 2500.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 10000):
            momentum = 0.95
        else:
            momentum = 0.75
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        zero_ary = np.zeros((1,))
        IMS.lr.set_value(to_fX(zero_ary + learn_rate))
        IMS.mom_1.set_value(to_fX(zero_ary + momentum))
        IMS.mom_2.set_value(to_fX(zero_ary + 0.99))

        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        Xb = Xb.reshape(batch_size, x_dim, x_dim).swapaxes(0,2).swapaxes(1,2)
        result = IMS.train_joint(Xb)

        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 100) == 0):
            costs = [(v / 100.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            IMS.save_model_params("IMS_params.pkl")
            # compute a small-sample estimate of NLL bound on validation set
            Xva = row_shuffle(Xva)
            Xb = to_fX(Xva[:5000])
            Xb = Xb.reshape(Xb.shape[0], x_dim, x_dim).swapaxes(0,2).swapaxes(1,2)
            va_costs = IMS.compute_nll_bound(Xb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
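Several of these loops also call row_shuffle to re-randomize the order of the training and validation rows, and to_fX to cast arrays to theano.config.floatX; neither helper appears in this listing. A minimal sketch of the shuffling step, assuming it simply permutes rows (the repository version may also accept and shuffle several arrays in parallel):

def row_shuffle_sketch(X, rng=np.random):
    """Hypothetical stand-in for row_shuffle: return X with its rows permuted."""
    idx = rng.permutation(X.shape[0])
    return X[idx]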
Example #24
def check_mnist_walkout():
    # paths for the saved model parameters and for result outputs
    KLD_PATH = "MNIST_WALKOUT_TEST_KLD/"
    VAE_PATH = "MNIST_WALKOUT_TEST_VAE/"
    RESULT_PATH = "MNIST_WALKOUT_RESULTS/"

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr_shared = datasets[0][0]
    Xva_shared = datasets[1][0]
    Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX)
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 100
    batch_reps = 5

    # Construct a GenNet and an InfNet, then test constructor for GIPair.
    # Do basic testing, to make sure classes aren't completely broken.
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_sigma = 1.0

    p_vals_kld, v_vals_kld, p_vals_vae, v_vals_vae = [], [], [], []
    kl_vals_kld, ll_vals_kld, kl_vals_vae, ll_vals_vae = [], [], [], []

    ########################################################
    # CHECK MODEL BEHAVIOR AT DIFFERENT STAGES OF TRAINING #
    ########################################################
    for i in range(10000,200000):
        if ((i % 10000) == 0):
            if (i <= 80000):
                net_type = 'gip'
                b = i
            else:
                net_type = 'walk'
                b = i - 80000
            #############################################################
            # Process the GIPair trained with strong KLd regularization #
            #############################################################
            gn_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b)
            in_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b)
            IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
            GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp)
            IN.set_sigma_scale(1.0)
            prior_dim = GN.latent_dim
            post_klds_kld = posterior_klds(IN, Xtr, 5000, 5)
            # Initialize the GIPair
            GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \
                    data_dim=data_dim, prior_dim=prior_dim, params=None)
            GIP_KLD.set_lam_l2w(1e-4)
            GIP_KLD.set_lam_nll(1.0)
            GIP_KLD.set_lam_kld(1.0)
            # draw samples freely from the generative model's prior
            Xs = GIP_KLD.sample_from_prior(20*20)
            file_name = RESULT_PATH + "prior_samples_b{0:d}_kld.png".format(i)
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # test Parzen density estimator built from prior samples
            Xs = GIP_KLD.sample_from_prior(10000, sigma=1.0)
            parzen_vals_kld = cross_validate_sigma(Xs, Xva, [0.1, 0.13, 0.15, 0.18, 0.2], 50)
            # get variational bound info
            var_vals_kld = GIP_KLD.compute_ll_bound(Xva)
            # record info about variational and parzen bounds
            p_vals_kld.append(parzen_vals_kld[1])
            v_vals_kld.append(np.mean(var_vals_kld[0]))
            ################################################################
            # Process the GIPair trained with basic VAE KLd regularization #
            ################################################################
            gn_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b)
            in_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b)
            IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
            GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp)
            IN.set_sigma_scale(1.0)
            prior_dim = GN.latent_dim
            post_klds_vae = posterior_klds(IN, Xtr, 5000, 5)
            # Initialize the GIPair
            GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \
                    data_dim=data_dim, prior_dim=prior_dim, params=None)
            GIP_VAE.set_lam_l2w(1e-4)
            GIP_VAE.set_lam_nll(1.0)
            GIP_VAE.set_lam_kld(1.0)
            # draw samples freely from the generative model's prior
            Xs = GIP_VAE.sample_from_prior(20*20)
            file_name = RESULT_PATH + "prior_samples_b{0:d}_vae.png".format(i)
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # test Parzen density estimator built from prior samples
            Xs = GIP_VAE.sample_from_prior(10000, sigma=1.0)
            parzen_vals_vae = cross_validate_sigma(Xs, Xva, [0.12, 0.15, 0.18, 0.20, 0.25], 50)
            # get variational bound info
            var_vals_vae = GIP_VAE.compute_ll_bound(Xva)
            # record info about variational and parzen bounds
            p_vals_vae.append(parzen_vals_vae[1])
            v_vals_vae.append(np.mean(var_vals_vae[0]))
            ########################
            # Plot posterior KLds. #
            ########################
            file_name = RESULT_PATH + "post_klds_b{0:d}.pdf".format(i)
            draw_posterior_kld_hist( \
                    np.asarray(post_klds_kld), np.asarray(post_klds_vae), file_name, bins=30)
            if i in [20000, 50000, 80000, 110000, 150000, 190000]:
                # select random indices into the validation set
                va_idx = npr.randint(0,high=va_samples,size=(150,))
                # record information about their current variational bounds
                kl_vals_kld.extend([v for v in var_vals_kld[1][va_idx]])
                ll_vals_kld.extend([v for v in var_vals_kld[2][va_idx]])
                kl_vals_vae.extend([v for v in var_vals_vae[1][va_idx]])
                ll_vals_vae.extend([v for v in var_vals_vae[2][va_idx]])
                # do some plotting
                s1_name = RESULT_PATH + "parzen_vs_variational.pdf"
                s2_name = RESULT_PATH + "kld_vs_likelihood.pdf"
                draw_parzen_vs_variational_scatter(p_vals_kld, v_vals_kld, \
                        p_vals_vae, v_vals_vae, f_name=s1_name)
                draw_kld_vs_likelihood_scatter(kl_vals_kld, ll_vals_kld, \
                        kl_vals_vae, ll_vals_vae, f_name=s2_name)
    return
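check_mnist_walkout compares the two models partly through posterior_klds, which presumably gathers per-example KL divergences between the approximate Gaussian posteriors and the standard normal prior by running the inference network over minibatches of Xtr, and then histograms them. That per-example quantity has a closed form; the sketch below computes it from posterior means and log-variances (an illustration, not the repository's helper):

def gaussian_posterior_klds(mu, logvar):
    """Per-sample KL( N(mu, diag(exp(logvar))) || N(0, I) ), summed over latent dims."""
    return 0.5 * np.sum((mu ** 2) + np.exp(logvar) - logvar - 1.0, axis=1)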
Example #25
def test_cA(start_rate=0.01, decay_rate=1.0, training_epochs=20,
            dataset='./data/mnist.pkl.gz',
            batch_size=50, output_folder='CAE_plots', contraction_level=0.1):
    """
    This demo is tested on MNIST

    :type start_rate: float
    :param start_rate: initial learning rate used for training the contractive
                       AutoEncoder

    :type decay_rate: float
    :param decay_rate: multiplicative factor applied to the learning rate after
                       each training epoch

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_udm(dataset,as_shared=True)
    train_set_x, train_set_y = datasets[0]

    learning_rate = theano.shared(numpy.asarray(start_rate,
        dtype=theano.config.floatX))

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    #        BUILDING THE MODEL        #
    ####################################

    rng = numpy.random.RandomState(12345)

    cae = cA(numpy_rng=rng, input=x,
            n_visible=(28 * 28), n_hidden=500, n_batchsize=batch_size)

    cost, updates = cae.get_cost_updates(contraction_level=contraction_level,
                                        learning_rate=learning_rate)

    print '... building Theano functions:',
    # Build a function for updating the CAE parameters
    train_cae = theano.function([index], [T.mean(cae.L_rec), cae.L_jacob],
                               updates=updates,
                               givens={x: train_set_x[index * batch_size:
                                                    (index + 1) * batch_size]})
    # Build a function for decaying the learning rate
    set_learning_rate = theano.function(inputs=[], outputs=learning_rate, \
            updates={learning_rate: learning_rate * decay_rate})
    print 'done'

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        c = []
        # Cycle over all training minibatches for this epoch...
        print "Epoch {0:d}:".format(epoch),
        stdout.flush()
        for batch_index in xrange(n_train_batches):
            c.append(train_cae(batch_index))
            if ((batch_index % (n_train_batches / 40)) == 0):
                print ".",
                stdout.flush()
        print " "
        update_the_rate = set_learning_rate()
        # Display diagnostics for the most recent epoch of training
        c_array = numpy.vstack(c)
        print "-- reconstruction: {0:.4f}, jacobian: {1:.4f}".format( \
                numpy.mean(c_array[0]), numpy.mean(numpy.sqrt(c_array[1])))
        # Visualize filters in their current state
        image = PIL.Image.fromarray(tile_raster_images(
            X=cae.W.get_value(borrow=True).T,
            img_shape=(28, 28), tile_shape=(10, 10),
            tile_spacing=(1, 1)))
        image.save('cae_filters.png')
        # Save the CAE parameters to disk
        W = cae.W.get_value(borrow=False)
        b_encode = cae.b.get_value(borrow=False)
        b_decode = cae.b_prime.get_value(borrow=False)
        np.save('cae_W.npy',W)
        np.save('cae_b_encode.npy',b_encode)
        np.save('cae_b_decode.npy',b_decode)

    # Record total training time, just for kicks
    end_time = time.clock()
    training_time = (end_time - start_time)
    # Print some jibber-jabber
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))
    # Eject, eJect, EJECT!
    os.chdir('../')
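The cA class used here appears to follow the contractive autoencoder from the Theano deep-learning tutorials: its cost combines a reconstruction term with contraction_level times the squared Frobenius norm of the Jacobian of the hidden code with respect to the input. For a sigmoid encoder that penalty has a simple closed form; a NumPy sketch of it (an illustration under that assumption, not the tutorial code):

def contraction_penalty(x, W, b):
    """||dh/dx||_F^2 for h = sigmoid(x.dot(W) + b), averaged over a minibatch."""
    h = 1.0 / (1.0 + np.exp(-(np.dot(x, W) + b)))       # hidden code, (batch, n_hidden)
    dh2 = (h * (1.0 - h)) ** 2                          # squared sigmoid derivative
    w2 = np.sum(W ** 2, axis=0)                         # squared column norms of W
    return np.mean(np.sum(dh2 * w2, axis=1))            # mean over the minibatch

Up to batching details, the objective minimized by train_cae is then the mean reconstruction cross-entropy plus contraction_level * contraction_penalty.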
Example #26
def check_mnist_recon():
    # paths for the saved model parameters and for result outputs
    KLD_PATH = "MNIST_WALKOUT_TEST_KLD/"
    VAE_PATH = "MNIST_WALKOUT_TEST_VAE/"
    RESULT_PATH = "MNIST_WALKOUT_RESULTS/"

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr_shared = datasets[0][0]
    Xva_shared = datasets[1][0]
    Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr_mean)  # broadcast the global mean pixel value to image shape
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 100
    batch_reps = 5

    # Construct a GenNet and an InfNet, then test constructor for GIPair.
    # Do basic testing, to make sure classes aren't completely broken.
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_sigma = 1.0

    #############################################################
    # Process the GIPair trained with strong KLd regularization #
    #############################################################
    gn_fname = KLD_PATH + "pt_walk_params_b120000_GN.pkl"
    in_fname = KLD_PATH + "pt_walk_params_b120000_IN.pkl"
    IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp)
    IN.set_sigma_scale(1.0)
    prior_dim = GN.latent_dim
    # Initialize the GIPair
    GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \
            data_dim=data_dim, prior_dim=prior_dim, params=None)

    ################################################################
    # Process the GIPair trained with basic VAE KLd regularization #
    ################################################################
    gn_fname = VAE_PATH + "pt_walk_params_b120000_GN.pkl"
    in_fname = VAE_PATH + "pt_walk_params_b120000_IN.pkl"
    IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp)
    IN.set_sigma_scale(1.0)
    prior_dim = GN.latent_dim
    # Initialize the GIPair
    GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \
            data_dim=data_dim, prior_dim=prior_dim, params=None)

    for trial in range(15):
        #########################################################
        # DRAW THE SAMPLE OBSERVATIONS AND MASKS FOR THIS TRIAL #
        #########################################################
        va_idx = npr.randint(low=0, high=Xva.shape[0], size=(15,))
        Xc_batch = Xva.take(va_idx, axis=0)
        Xd_batch = np.repeat(Xtr_mean, Xc_batch.shape[0], axis=0).astype(theano.config.floatX)
        Xm_rand = sample_masks(Xc_batch, drop_prob=0.2)
        Xm_patch = sample_patch_masks(Xc_batch, (28,28), (14,14))
        Xm_batch = Xm_rand * Xm_patch

        #####################################
        # COMPARE SAMPLES IN A NORMAL CHAIN #
        #####################################
        # draw some chains of samples from the VAE loop
        result_kld = GIP_KLD.sample_from_chain(Xc_batch, loop_iters=51)
        result_vae = GIP_VAE.sample_from_chain(Xc_batch, loop_iters=51)
        chain_samples_kld = []
        chain_samples_vae = []
        for i in range(len(result_kld['data samples'])):
            if (((i % 10) == 0) or (i == 1)):
                chain_samples_kld.append(result_kld['data samples'][i])
                chain_samples_vae.append(result_vae['data samples'][i])
        # interleave the chain samples for beauteous display
        chain_samples_both = []
        for i in range(len(chain_samples_kld)):
            Xs_kld = chain_samples_kld[i]
            Xs_vae = chain_samples_vae[i]
            joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1]))
            for j in range(Xs_kld.shape[0]):
                joint_samples[2*j] = Xs_kld[j]
                joint_samples[2*j + 1] = Xs_vae[j]
            chain_samples_both.append(joint_samples)
        chain_len = len(chain_samples_both)
        Xs = np.vstack(chain_samples_both)
        file_name = RESULT_PATH + "FIG_CHAIN_{0:d}.png".format(trial)
        utils.visualize_samples(Xs, file_name, num_rows=chain_len)

        #############################################
        # COMPARE SAMPLES IN A RECONSTRUCTION CHAIN #
        #############################################
        # draw some chains of samples from the VAE loop
        result_kld = GIP_KLD.sample_from_chain(Xd_batch, X_c=Xc_batch, \
                X_m=Xm_batch, loop_iters=10)
        result_vae = GIP_VAE.sample_from_chain(Xd_batch, X_c=Xc_batch, \
                X_m=Xm_batch, loop_iters=10)
        recon_samples_kld = []
        recon_samples_vae = []
        for i in range(len(result_kld['data samples'])):
            if (((i % 2) == 0) or (i == 1)):
                recon_samples_kld.append(result_kld['data samples'][i])
                recon_samples_vae.append(result_vae['data samples'][i])
        # interleave the recon samples for beauteous display
        recon_samples_both = []
        for i in range(len(recon_samples_kld)):
            Xs_kld = recon_samples_kld[i]
            Xs_vae = recon_samples_vae[i]
            joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1]))
            for j in range(Xs_kld.shape[0]):
                joint_samples[2*j] = Xs_kld[j]
                joint_samples[2*j + 1] = Xs_vae[j]
            recon_samples_both.append(joint_samples)
        recon_len = len(recon_samples_both)
        Xs = np.vstack(recon_samples_both)
        file_name = RESULT_PATH + "FIG_RECON_{0:d}.png".format(trial)
        utils.visualize_samples(Xs, file_name, num_rows=recon_len)
    return
Example #27
def batch_test_ss_mlp_pt(test_count=10, su_count=1000):
    """Setup basic test for semisupervised EAR-regularized MLP."""
    # Set some reasonable sgd parameters
    sgd_params = {}
    sgd_params['start_rate'] = 0.01
    sgd_params['decay_rate'] = 0.998
    sgd_params['wt_norm_bound'] = 3.5
    sgd_params['epochs'] = 1000
    sgd_params['batch_size'] = 100
    sgd_params['result_tag'] = '---'
    # Set some reasonable mlp parameters
    mlp_params = {}
    # Set up some proto-networks
    pc0 = [28*28, 800, 800, 11]
    mlp_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    mlp_params['spawn_configs'] = [sc0, sc1]
    mlp_params['spawn_weights'] = [0.0, 1.0]
    # Set remaining params
    mlp_params['ear_type'] = 5
    mlp_params['ear_lam'] = 1.0
    mlp_params['lam_l2a'] = 1e-2
    mlp_params['reg_all_obs'] = True

    for test_num in range(test_count):
        rng_seed = test_num
        sgd_params['result_tag'] = "test_{0:d}".format(test_num)

        # Initialize a random number generator for this test
        rng = np.random.RandomState(rng_seed)
        # Load some data to train/validate/test with
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, zero_mean=False)

        # Construct the EarNet object that we will be training
        x_in = T.matrix('x_in')
        NET = EarNet(rng=rng, input=x_in, params=mlp_params)
        init_biases(NET, b_init=0.05)

        ##########################################
        # First, pretrain each layer in the mlp. #
        ##########################################
        sgd_params['result_tag'] = "ss_ear_pt_s{0:d}_t{1:d}".format(su_count,test_num)
        sgd_params['batch_size'] = 25
        sgd_params['start_rate'] = 0.02
        sgd_params['epochs'] = 40
        for i in range(len(NET.dae_costs)):
            print("==================================================")
            print("Pretraining hidden layer(s) at depth {0:d}".format(i+1))
            print("==================================================")
            train_dae(NET, i, sgd_params, datasets)

        # Load some data to train/validate/test with
        rng = np.random.RandomState(rng_seed)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm_ss(dataset, su_count, rng, zero_mean=False)
        # Run semisupervised training on the given MLP
        sgd_params['batch_size'] = 100
        sgd_params['start_rate'] = 0.04
        # Train the top layer only, with EAR regularization disabled
        sgd_params['top_only'] = True
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        COMMENT="""
        # Train with no EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 100
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        """
        # Train with weak EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with weak EAR regularization
        sgd_params['epochs'] = 10
        NET.set_ear_lam(1.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 15
        NET.set_ear_lam(1.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 20
        NET.set_ear_lam(2.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 100
        NET.set_ear_lam(3.0)
        train_ss_mlp(NET, sgd_params, datasets)
    return
Example #28
def test_gip_sigma_scale_mnist():
    from LogPDFs import cross_validate_sigma
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(12345)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),
                                                      str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    batch_size = 100
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)  # broadcast the global mean pixel value to image shape
    Xc_mean = np.repeat(Xtr_mean, batch_size,
                        axis=0).astype(theano.config.floatX)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    # Load inferencer and generator from saved parameters
    gn_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_GN.pkl"
    in_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)
    x_dim = IN.shared_layers[0].in_dim
    z_dim = IN.mu_layers[-1].out_dim
    # construct a GIPair with the loaded InfNet and GenNet
    osm_params = {}
    osm_params['x_type'] = 'gaussian'
    osm_params['xt_transform'] = 'sigmoid'
    osm_params['logvar_bound'] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \
            p_x_given_z=GN, q_z_given_x=IN, \
            x_dim=x_dim, z_dim=z_dim, params=osm_params)
    # compute variational likelihood bound and its sub-components
    Xva = row_shuffle(Xva)
    Xb = Xva[0:5000]
    file_name = "AX_MNIST_MAX_KLD_POST_KLDS.png"
    post_klds = OSM.compute_post_klds(Xb)
    post_dim_klds = np.mean(post_klds, axis=0)
    utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \
            file_name)
    # compute information about free-energy on validation set
    file_name = "AX_MNIST_MAX_KLD_FREE_ENERGY.png"
    fe_terms = OSM.compute_fe_terms(Xb, 20)
    utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
            x_label='Posterior KLd', y_label='Negative Log-likelihood')

    # bound_results = OSM.compute_ll_bound(Xva)
    # ll_bounds = bound_results[0]
    # post_klds = bound_results[1]
    # log_likelihoods = bound_results[2]
    # max_lls = bound_results[3]
    # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds)))
    # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds)))
    # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods)))
    # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls)))
    # print("min ll bound: {0:.4f}".format(np.min(ll_bounds)))
    # print("max posterior KLd: {0:.4f}".format(np.max(post_klds)))
    # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods)))
    # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls)))
    # # compute some information about the approximate posteriors
    # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva)
    # all_post_klds = np.sort(post_stats[0].ravel()) # post KLds for each obs and dim
    # obs_post_klds = np.sort(post_stats[1]) # summed post KLds for each obs
    # post_dim_klds = post_stats[2] # average post KLds for each post dim
    # post_dim_vars = post_stats[3] # average squared mean for each post dim
    # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png")
    # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, "AAA_OBS_POST_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png")

    # draw many samples from the GIP
    for i in range(5):
        tr_idx = npr.randint(low=0, high=tr_samples, size=(100, ))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xs = []
        for row in range(3):
            Xs.append([])
            for col in range(3):
                sample_lists = OSM.sample_from_chain(Xd_batch[0:10,:], loop_iters=100, \
                        sigma_scale=1.0)
                Xs[row].append(group_chains(sample_lists['data samples']))
        Xs, block_im_dim = block_video(Xs, (28, 28), (3, 3))
        to_video(Xs,
                 block_im_dim,
                 "AX_MNIST_MAX_KLD_CHAIN_VIDEO_{0:d}.avi".format(i),
                 frame_rate=10)
    file_name = "AX_MNIST_MAX_KLD_PRIOR_SAMPLE.png"
    Xs = OSM.sample_from_prior(20 * 20)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # # test Parzen density estimator built from prior samples
    # Xs = OSM.sample_from_prior(10000)
    # [best_sigma, best_ll, best_lls] = \
    #         cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20)
    # sort_idx = np.argsort(best_lls)
    # sort_idx = sort_idx[0:400]
    # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_MAX_KLD_BEST_LLS_1.png")
    # utils.visualize_samples(Xva[sort_idx], "A_MNIST_MAX_KLD_BAD_DIGITS_1.png", num_rows=20)
    return
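Both this example and check_mnist_walkout above rely on cross_validate_sigma, which picks a kernel width for a Gaussian Parzen density fit to prior samples and scores held-out data under it. A self-contained sketch of that procedure, assuming the helper returns the best sigma, the corresponding mean log-likelihood, and the per-sigma scores, and ignoring the extra batching argument the real helper takes (the repository version may differ):

def parzen_mean_ll(samples, queries, sigma):
    """Mean log-density of queries under an isotropic Gaussian Parzen estimator."""
    d = samples.shape[1]
    lls = []
    for x in queries:
        sq = np.sum((x - samples) ** 2, axis=1) / (2.0 * sigma ** 2)
        m = -np.min(sq)                                    # stabilizer for log-sum-exp
        ll = m + np.log(np.mean(np.exp(-sq - m)))
        ll -= 0.5 * d * np.log(2.0 * np.pi * sigma ** 2)   # Gaussian normalizer
        lls.append(ll)
    return np.mean(lls)

def cross_validate_sigma_sketch(samples, queries, sigmas):
    """Hypothetical stand-in: pick the sigma maximizing mean Parzen log-likelihood."""
    scores = [parzen_mean_ll(samples, queries, s) for s in sigmas]
    best = int(np.argmax(scores))
    return sigmas[best], scores[best], scores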
Example #29
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr_shared = datasets[0][0]
    Xva_shared = datasets[1][0]
    Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX)
    tr_samples = Xtr.shape[0]
    batch_size = 200
    batch_reps = 1

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 20
    h_dim = 100
    x_type = 'bernoulli'

    # some InfNet instances to build the TwoStageModel from
    X_sym = T.matrix('X_sym')

    ########################
    # p_s0_obs_given_z_obs #
    ########################
    params = {}
    shared_config = [z_dim, 250, 250]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 1e-3
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, \
            params=params, shared_param_dicts=None)
    p_s0_obs_given_z_obs.init_biases(0.2)
    #################
    # p_hi_given_si #
    #################
    params = {}
    shared_config = [obs_dim, 250, 250]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_hi_given_si = InfNet(rng=rng, Xd=X_sym, \
            params=params, shared_param_dicts=None)
    p_hi_given_si.init_biases(0.2)
    ######################
    # p_sip1_given_si_hi #
    ######################
    params = {}
    shared_config = [h_dim, 250, 250]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, \
            params=params, shared_param_dicts=None)
    p_sip1_given_si_hi.init_biases(0.2)
    ###############
    # q_z_given_x #
    ###############
    params = {}
    shared_config = [obs_dim, 250, 250]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=X_sym, \
            params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)
    ###################
    # q_hi_given_x_si #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, \
            params=params, shared_param_dicts=None)
    q_hi_given_x_si.init_biases(0.2)


    ################################################################
    # Define parameters for the MultiStageModel, and initialize it #
    ################################################################
    print("Building the MultiStageModel...")
    msm_params = {}
    msm_params['x_type'] = x_type
    msm_params['obs_transform'] = 'sigmoid'
    MSM = MultiStageModel(rng=rng, x_in=X_sym, \
            p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \
            p_hi_given_si=p_hi_given_si, \
            p_sip1_given_si_hi=p_sip1_given_si_hi, \
            q_z_given_x=q_z_given_x, \
            q_hi_given_x_si=q_hi_given_x_si, \
            obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \
            model_init_obs=True, ir_steps=2, \
            params=msm_params)
    obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05
    obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean))
    MSM.set_input_bias(-obs_mean)
    MSM.set_obs_bias(0.1*obs_mean_logit)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    costs = [0. for i in range(10)]
    learn_rate = 0.0003
    momentum = 0.8
    for i in range(300000):
        scale = min(1.0, ((i+1) / 10000.0))
        extra_kl = max(0.0, ((50000.0 - i) / 50000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # randomly sample a minibatch
        tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,))
        Xb = binarize_data(Xtr.take(tr_idx, axis=0))
        Xb = Xb.astype(theano.config.floatX)
        # set sgd and objective function hyperparams for this update
        MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                           mom_1=(scale*momentum), mom_2=0.98)
        MSM.set_train_switch(1.0)
        MSM.set_l1l2_weight(1.0)
        MSM.set_lam_nll(lam_nll=1.0)
        MSM.set_lam_kld(lam_kld_1=(1.0+extra_kl), lam_kld_2=(1.0+extra_kl))
        MSM.set_lam_l2w(1e-6)
        MSM.set_kzg_weight(0.01)
        # perform a minibatch update and record the cost for this batch
        result = MSM.train_joint(Xb, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            print("-- batch {0:d} --".format(i))
            print("    joint_cost: {0:.4f}".format(costs[0]))
            print("    nll_cost  : {0:.4f}".format(costs[1]))
            print("    kld_cost  : {0:.4f}".format(costs[2]))
            print("    reg_cost  : {0:.4f}".format(costs[3]))
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            Xva = row_shuffle(Xva)
            # draw some independent random samples from the model
            samp_count = 200
            model_samps = MSM.sample_from_prior(samp_count)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count): 
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MX_SAMPLES_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # visualize some important weights in the model
            file_name = "MX_INF_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_1_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MX_INF_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_2_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MX_GEN_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_1_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MX_GEN_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_2_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MX_GEN_INF_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            # compute information about posterior KLds on validation set
            post_klds = MSM.compute_post_klds(Xva[0:5000])
            file_name = "MX_H0_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[0].shape[1]), \
                    np.mean(post_klds[0], axis=0), file_name)
            file_name = "MX_HI_COND_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[1].shape[1]), \
                    np.mean(post_klds[1], axis=0), file_name)
            file_name = "MX_HI_GLOB_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[2].shape[1]), \
                    np.mean(post_klds[2], axis=0), file_name)
            # compute information about free-energy on validation set
            file_name = "MX_FREE_ENERGY_b{0:d}.png".format(i)
            fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            print("    nll_bound : {0:.4f}".format(fe_mean))
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
                    x_label='Posterior KLd', y_label='Negative Log-likelihood')
    return
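The training loop above combines three schedules: a linear warm-up factor on the learning rate and momentum, an extra KL weight that decays linearly to zero, and a geometric decay of the base learning rate every 10000 updates. Below is a minimal, self-contained sketch of that schedule, with the constants taken from the loop; the helper name msm_schedule is ours, not part of the library.

# Sketch of the warm-up / annealing schedule used in the loop above. The
# constants mirror the loop; `msm_schedule` is a hypothetical helper name.
def msm_schedule(i, base_lr=0.0003, warmup_steps=10000,
                 kl_anneal_steps=50000, decay_every=10000, decay=0.95):
    # linear warm-up factor applied to the learning rate and momentum
    scale = min(1.0, (i + 1) / float(warmup_steps))
    # extra KL weight decays linearly from 1.0 to 0.0 over kl_anneal_steps
    extra_kl = max(0.0, (kl_anneal_steps - i) / float(kl_anneal_steps))
    # geometric decay of the base learning rate every `decay_every` updates
    lr = base_lr * (decay ** ((i + 1) // decay_every))
    return (scale * lr), (1.0 + extra_kl)

# e.g. msm_schedule(0) gives a tiny learning rate and a KL weight near 2.0,
# while msm_schedule(100000) has the KL weight back at 1.0 and the rate decayed 10x.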
Example #30
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr_shared = datasets[0][0]
    Xva_shared = datasets[1][0]
    Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX)
    tr_samples = Xtr.shape[0]
    batch_size = 500
    batch_reps = 1

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_rnn_dim = 25
    z_obs_dim = 5
    jnt_dim = obs_dim + z_rnn_dim
    h_dim = 100
    x_type = 'bernoulli'
    prior_sigma = 1.0

    # some InfNet instances to build the TwoStageModel from
    X_sym = T.matrix('X_sym')

    ########################
    # p_s0_obs_given_z_obs #
    ########################
    params = {}
    shared_config = [z_obs_dim, 250, 250]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 1e-3
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
            params=params, shared_param_dicts=None)
    p_s0_obs_given_z_obs.init_biases(0.2)
    #################
    # p_hi_given_si #
    #################
    params = {}
    shared_config = [jnt_dim, 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_hi_given_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
            params=params, shared_param_dicts=None)
    p_hi_given_si.init_biases(0.2)
    ######################
    # p_sip1_given_si_hi #
    ######################
    params = {}
    shared_config = [(h_dim + z_rnn_dim), 500, 500]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
            params=params, shared_param_dicts=None)
    p_sip1_given_si_hi.init_biases(0.2)
    ###############
    # q_z_given_x #
    ###############
    params = {}
    shared_config = [obs_dim, 250, 250]
    top_config = [shared_config[-1], (z_rnn_dim + z_obs_dim)]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
            params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)
    ###################
    # q_hi_given_x_si #
    ###################
    params = {}
    shared_config = [(obs_dim + jnt_dim), 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
            params=params, shared_param_dicts=None)
    q_hi_given_x_si.init_biases(0.2)


    ################################################################
    # Define parameters for the MultiStageModel, and initialize it #
    ################################################################
    print("Building the MultiStageModel...")
    msm_params = {}
    msm_params['x_type'] = x_type
    msm_params['obs_transform'] = 'sigmoid'
    MSM = MultiStageModel(rng=rng, x_in=X_sym, \
            p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \
            p_hi_given_si=p_hi_given_si, \
            p_sip1_given_si_hi=p_sip1_given_si_hi, \
            q_z_given_x=q_z_given_x, \
            q_hi_given_x_si=q_hi_given_x_si, \
            obs_dim=obs_dim, z_rnn_dim=z_rnn_dim, z_obs_dim=z_obs_dim, \
            h_dim=h_dim, model_init_obs=False, model_init_rnn=True, \
            ir_steps=3, params=msm_params)
    obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05
    obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean))
    MSM.set_input_bias(-obs_mean)
    MSM.set_obs_bias(0.1*obs_mean_logit)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    costs = [0. for i in range(10)]
    learn_rate = 0.003
    momentum = 0.5
    for i in range(300000):
        scale = min(1.0, ((i+1) / 5000.0))
        l1l2_weight = 1.0 #min(1.0, ((i+1) / 2500.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 100000):
            momentum = 0.80
        elif (i > 50000):
            momentum = 0.65
        else:
            momentum = 0.50
        # randomly sample a minibatch
        tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,))
        Xb = binarize_data(Xtr.take(tr_idx, axis=0))
        Xb = Xb.astype(theano.config.floatX)
        # set sgd and objective function hyperparams for this update
        MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                mom_1=(scale*momentum), mom_2=0.99)
        MSM.set_train_switch(1.0)
        MSM.set_l1l2_weight(l1l2_weight)
        MSM.set_lam_nll(lam_nll=1.0)
        MSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
        MSM.set_lam_l2w(1e-5)
        MSM.set_kzg_weight(0.01)
        # perform a minibatch update and record the cost for this batch
        result = MSM.train_joint(Xb, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            print("-- batch {0:d} --".format(i))
            print("    joint_cost: {0:.4f}".format(costs[0]))
            print("    nll_cost  : {0:.4f}".format(costs[1]))
            print("    kld_cost  : {0:.4f}".format(costs[2]))
            print("    reg_cost  : {0:.4f}".format(costs[3]))
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            Xva = row_shuffle(Xva)
            # draw some independent random samples from the model
            samp_count = 200
            model_samps = MSM.sample_from_prior(samp_count)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count): 
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MZ_SAMPLES_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # visualize some important weights in the model
            file_name = "MZ_INF_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_1_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_INF_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_2_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_1_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_2_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_INF_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            # compute information about posterior KLds on validation set
            post_klds = MSM.compute_post_klds(Xva[0:5000])
            file_name = "MZ_H0_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[0].shape[1]), \
                    np.mean(post_klds[0], axis=0), file_name)
            file_name = "MZ_HI_COND_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[1].shape[1]), \
                    np.mean(post_klds[1], axis=0), file_name)
            file_name = "MZ_HI_GLOB_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[2].shape[1]), \
                    np.mean(post_klds[2], axis=0), file_name)
            # compute information about free-energy on validation set
            file_name = "MZ_FREE_ENERGY_b{0:d}.png".format(i)
            fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            print("    nll_bound : {0:.4f}".format(fe_mean))
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
                    x_label='Posterior KLd', y_label='Negative Log-likelihood')
    return
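Both loops above initialize the model's output bias from the data before training: the per-pixel training means are squashed away from 0 and 1 and passed through a logit, so a sigmoid output unit with that bias starts out near the mean image. A small sketch of that computation, assuming only numpy; the function name is hypothetical.

import numpy as np

def mean_logit_bias(X, squash=0.9, offset=0.05):
    # squash per-pixel means into (0.05, 0.95) so the logit stays finite
    obs_mean = (squash * np.mean(X, axis=0)) + offset
    # inverse sigmoid: a sigmoid unit with this bias outputs roughly obs_mean
    return np.log(obs_mean / (1.0 - obs_mean))

# the loops above scale this by 0.1 before calling MSM.set_obs_bias(...)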
Example #31
def pretrain_osm(lam_kld=0.0):
    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),
                                                      str(Xva.shape)))

    # get and set some basic dataset information
    Xtr_mean = np.mean(Xtr, axis=0)
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 100
    batch_reps = 5

    # setup some symbolic variables and stuff
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_sigma = 1.0

    ##########################
    # NETWORK CONFIGURATIONS #
    ##########################
    gn_params = {}
    shared_config = [PRIOR_DIM, 1000, 1000]
    top_config = [shared_config[-1], data_dim]
    gn_params['shared_config'] = shared_config
    gn_params['mu_config'] = top_config
    gn_params['sigma_config'] = top_config
    gn_params['activation'] = relu_actfun
    gn_params['init_scale'] = 1.4
    gn_params['lam_l2a'] = 0.0
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.0
    gn_params['input_noise'] = 0.0
    # choose some parameters for the continuous inferencer
    in_params = {}
    shared_config = [data_dim, 1000, 1000]
    top_config = [shared_config[-1], PRIOR_DIM]
    in_params['shared_config'] = shared_config
    in_params['mu_config'] = top_config
    in_params['sigma_config'] = top_config
    in_params['activation'] = relu_actfun
    in_params['init_scale'] = 1.4
    in_params['lam_l2a'] = 0.0
    in_params['vis_drop'] = 0.0
    in_params['hid_drop'] = 0.0
    in_params['bias_noise'] = 0.0
    in_params['input_noise'] = 0.0
    # Initialize the base networks for this OneStageModel
    IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \
            params=in_params, shared_param_dicts=None)
    GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \
            params=gn_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN.init_biases(0.2)
    GN.init_biases(0.2)

    #########################
    # INITIALIZE THE GIPAIR #
    #########################
    osm_params = {}
    osm_params['x_type'] = 'bernoulli'
    osm_params['xt_transform'] = 'sigmoid'
    osm_params['logvar_bound'] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \
            p_x_given_z=GN, q_z_given_x=IN, \
            x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params)
    OSM.set_lam_l2w(1e-5)
    safe_mean = (0.9 * Xtr_mean) + 0.05
    safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean))
    OSM.set_output_bias(safe_mean_logit)
    OSM.set_input_bias(-Xtr_mean)

    ######################
    # BASIC VAE TRAINING #
    ######################
    out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb')
    # Set initial learning rate and basic SGD hyper parameters
    obs_costs = np.zeros((batch_size, ))
    costs = [0. for i in range(10)]
    learn_rate = 0.0005
    for i in range(150000):
        scale = min(1.0, float(i) / 10000.0)
        if ((i > 1) and ((i % 20000) == 0)):
            learn_rate = learn_rate * 0.9
        # sample a random minibatch of training examples
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, ))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # do a minibatch update of the model, and compute some costs
        OSM.set_sgd_params(lr_1=(scale * learn_rate), mom_1=0.5, mom_2=0.98)
        OSM.set_lam_nll(1.0)
        OSM.set_lam_kld(lam_kld_1=(1.0 + (scale * (lam_kld - 1.0))),
                        lam_kld_2=0.0)
        result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 1000) == 0):
            # record and then reset the cost trackers
            costs = [(v / 1000.0) for v in costs]
            str_1 = "-- batch {0:d} --".format(i)
            str_2 = "    joint_cost: {0:.4f}".format(costs[0])
            str_3 = "    nll_cost  : {0:.4f}".format(costs[1])
            str_4 = "    kld_cost  : {0:.4f}".format(costs[2])
            str_5 = "    reg_cost  : {0:.4f}".format(costs[3])
            costs = [0.0 for v in costs]
            # print out some diagnostic information
            joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
        if ((i % 2000) == 0):
            Xva = row_shuffle(Xva)
            model_samps = OSM.sample_from_prior(500)
            file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i)
            utils.visualize_samples(model_samps, file_name, num_rows=20)
            # compute information about free-energy on validation set
            file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i)
            fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            fe_str = "    nll_bound : {0:.4f}".format(fe_mean)
            print(fe_str)
            out_file.write(fe_str + "\n")
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
                    x_label='Posterior KLd', y_label='Negative Log-likelihood')
            # compute information about posterior KLds on validation set
            file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i)
            post_klds = OSM.compute_post_klds(Xva[0:2500])
            post_dim_klds = np.mean(post_klds, axis=0)
            utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \
                    file_name)
        if ((i % 5000) == 0):
            IN.save_to_file(f_name=RESULT_PATH +
                            "pt_osm_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH +
                            "pt_osm_params_b{0:d}_GN.pkl".format(i))
    IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl")
    GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl")
    return
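The cost bookkeeping in pretrain_osm follows the same pattern as the other loops: accumulate the raw outputs of train_joint across batches, then average, report, and reset every fixed number of updates. A minimal stand-alone sketch of that pattern; the result list here is only a stand-in for the model's actual outputs.

report_every = 1000
costs = [0.0 for _ in range(4)]
for i in range(10000):
    result = [1.0, 0.5, 0.3, 0.2]  # stand-in for OSM.train_joint(...) outputs
    costs = [(c + r) for (c, r) in zip(costs, result)]
    if ((i % report_every) == 0) and (i > 0):
        avg = [(c / float(report_every)) for c in costs]
        print("-- batch {0:d} -- joint_cost: {1:.4f}".format(i, avg[0]))
        costs = [0.0 for _ in costs]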
Example #32
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = to_fX(datasets[0][0])
    Xva = to_fX(datasets[1][0])
    Ytr = datasets[0][1]
    Yva = datasets[1][1]
    Xtr_class_groups = make_class_groups(Xtr, Ytr)

    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 300

    BD = lambda ary: binarize_data(ary)

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 32
    h_dim = 100
    ir_steps = 2
    init_scale = 1.0
    
    x_type = 'bernoulli'

    # some InfNet instances to build the TwoStageModel from
    x_in = T.matrix('x_in')
    x_pos = T.matrix('x_pos')
    y_in = T.lvector('y_in')

    #################
    # p_hi_given_si #
    #################
    params = {}
    shared_config = [obs_dim, 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_hi_given_si = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    p_hi_given_si.init_biases(0.2)
    ######################
    # p_sip1_given_si_hi #
    ######################
    params = {}
    shared_config = [(h_dim + obs_dim), 500, 500]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_sip1_given_si_hi = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    p_sip1_given_si_hi.init_biases(0.2)
    ################
    # p_s0_given_z #
    ################
    params = {}
    shared_config = [z_dim, 500, 500]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_s0_given_z = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    p_s0_given_z.init_biases(0.2)
    ###############
    # q_z_given_x #
    ###############
    params = {}
    shared_config = [obs_dim, (500, 4), (500, 4)]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.2
    params['hid_drop'] = 0.5
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.0)
    ###################
    # q_hi_given_x_si #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 800, 800]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_hi_given_x_si = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    q_hi_given_x_si.init_biases(0.2)


    ################################################################
    # Define parameters for the MultiStageModel, and initialize it #
    ################################################################
    print("Building the MultiStageModel...")
    msm_params = {}
    msm_params['x_type'] = x_type
    msm_params['obs_transform'] = 'sigmoid'
    MSM = MultiStageModelSS(rng=rng, \
            x_in=x_in, x_pos=x_pos, y_in=y_in, \
            p_s0_given_z=p_s0_given_z, \
            p_hi_given_si=p_hi_given_si, \
            p_sip1_given_si_hi=p_sip1_given_si_hi, \
            q_z_given_x=q_z_given_x, \
            q_hi_given_x_si=q_hi_given_x_si, \
            class_count=10, \
            obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \
            ir_steps=ir_steps, params=msm_params)
    MSM.set_lam_class(lam_class=20.0)
    MSM.set_lam_nll(lam_nll=1.0)
    MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.9, \
                    lam_kld_p2q=0.1)
    MSM.set_lam_l2w(1e-4)
    MSM.set_drop_rate(0.0)
    MSM.q_hi_given_x_si.set_bias_noise(0.0)
    MSM.p_hi_given_si.set_bias_noise(0.0)
    MSM.p_sip1_given_si_hi.set_bias_noise(0.0)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    out_file = open("MSS_A_RESULTS.txt", 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 2000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 20000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                           mom_1=scale*momentum, mom_2=0.99)
        MSM.set_train_switch(1.0)
        # perform a minibatch update and record the cost for this batch
        Xi_tr = Xtr.take(batch_idx, axis=0)
        Yi_tr = Ytr.take(batch_idx, axis=0)
        Xp_tr, Xn_tr = sample_class_groups(Yi_tr, Xtr_class_groups)
        result = MSM.train_joint(BD(Xi_tr), BD(Xp_tr), Yi_tr)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        # output useful information about training progress
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost  : {0:.4f}".format(costs[0])
            str3 = "    class_cost  : {0:.4f}".format(costs[1])
            str4 = "    nll_cost    : {0:.4f}".format(costs[2])
            str5 = "    kld_cost    : {0:.4f}".format(costs[3])
            str6 = "    reg_cost    : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            # Get some validation samples for computing diagnostics
            Xva, Yva = row_shuffle(Xva, Yva)
            Xb_va = Xva[0:2500]
            Yb_va = Yva[0:2500]
            # draw some independent random samples from the model
            samp_count = 200
            model_samps = MSM.sample_from_prior(samp_count)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MSS_A_SAMPLES_IND_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # draw some conditional random samples from the model
            Xs = Xb_va[0:50] # only use validation set samples
            Xs = np.repeat(Xs, 4, axis=0)
            samp_count = Xs.shape[0]
            model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False)
            model_samps.append(Xs)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count): 
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MSS_A_SAMPLES_CND_UD_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # compute information about posterior KLds on validation set
            raw_costs = MSM.compute_raw_costs(BD(Xb_va), BD(Xb_va))
            init_nll, init_kld, q2p_kld, p2q_kld, step_nll, step_kld = raw_costs
            file_name = "MSS_A_H0_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(init_kld.shape[1]), \
                    np.mean(init_kld, axis=0), file_name)
            file_name = "MSS_A_HI_Q2P_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(q2p_kld.shape[1]), \
                    np.mean(q2p_kld, axis=0), file_name)
            file_name = "MSS_A_HI_P2Q_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(p2q_kld.shape[1]), \
                    np.mean(p2q_kld, axis=0), file_name)
            # draw weights for the initial encoder/classifier
            file_name = "MSS_A_QZX_WEIGHTS_b{0:d}.png".format(i)
            W = q_z_given_x.shared_layers[0].W.get_value(borrow=False).T
            utils.visualize_samples(W, file_name, num_rows=20)
            # compute free-energy terms on training samples
            fe_terms = MSM.compute_fe_terms(BD(Xtr[0:2500]), BD(Xtr[0:2500]), 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # compute free-energy terms on validation samples
            fe_terms = MSM.compute_fe_terms(BD(Xb_va), BD(Xb_va), 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # compute multi-sample estimate of classification error
            err_rate, err_idx, y_preds = MSM.class_error(Xb_va, Yb_va, \
                    samples=30, prep_func=BD)
            joint_str = "    va-class-error: {0:.4f}".format(err_rate)
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # draw some conditional random samples from the model
            Xs = Xb_va[err_idx] # use validation samples with class errors
            if (Xs.shape[0] > 50):
                Xs = Xs[:50]
            Xs = np.repeat(Xs, 4, axis=0)
            if ((Xs.shape[0] % 20) != 0):
                # round-off the number of error examples, for nice display
                remainder = Xs.shape[0] % 20
                Xs = Xs[:-remainder]
            samp_count = Xs.shape[0]
            # draw some conditional random samples from the model
            model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False)
            model_samps.append(Xs)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count): 
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MSS_A_SAMPLES_CND_ERR_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
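The minibatch indexing in this example walks a fixed-size window through the training set and triggers a reshuffle whenever the window runs past the end; starting the window at tr_samples forces a shuffle on the very first update. A hedged sketch of that pattern as a generator; the helper name is ours, and row_shuffle is assumed to permute its arguments jointly.

import numpy as np

def minibatch_indices(tr_samples, batch_size):
    # start past the end so the first call immediately wraps around
    batch_idx = np.arange(batch_size) + tr_samples
    while True:
        batch_idx = batch_idx + batch_size
        if np.max(batch_idx) >= tr_samples:
            # an "epoch" finished; the caller should reshuffle Xtr/Ytr here
            batch_idx = np.arange(batch_size)
        yield batch_idx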
Example #33
def test_gi_pair():
    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0].get_value(borrow=False).astype(theano.config.floatX)
    tr_samples = Xtr.shape[0]

    # Construct a GenNet and an InfNet, then test constructor for GIPair.
    # Do basic testing, to make sure classes aren't completely broken.
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_dim = 64
    prior_sigma = 1.0
    # Choose some parameters for the generator network
    gn_params = {}
    gn_config = [prior_dim, 1000, 1000, data_dim]
    gn_params['mlp_config'] = gn_config
    gn_params['activation'] = relu_actfun
    gn_params['out_type'] = 'bernoulli'
    gn_params['init_scale'] = 2.0
    gn_params['lam_l2a'] = 1e-2
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in_params = {}
    shared_config = [data_dim, (250, 4), (250, 4)]
    top_config = [shared_config[-1], (125, 4), prior_dim]
    in_params['shared_config'] = shared_config
    in_params['mu_config'] = top_config
    in_params['sigma_config'] = top_config
    in_params['activation'] = relu_actfun
    in_params['init_scale'] = 2.0
    in_params['lam_l2a'] = 1e-2
    in_params['vis_drop'] = 0.0
    in_params['hid_drop'] = 0.0
    in_params['bias_noise'] = 0.1
    in_params['input_noise'] = 0.1
    # Initialize the base networks for this GIPair
    IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
            params=in_params, shared_param_dicts=None)
    GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
            params=gn_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN.init_biases(0.0)
    GN.init_biases(0.1)
    # Initialize the GIPair
    GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \
            data_dim=data_dim, prior_dim=prior_dim, params=None)
    GIP.set_lam_l2w(1e-4)

    # Set initial learning rate and basic SGD hyper parameters
    learn_rate = 0.001
    for i in range(750000):
        scale = min(1.0, float(i) / 25000.0)
        if (((i+1) % 100000) == 0):
            learn_rate = learn_rate * 0.75
        GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.95)
        GIP.set_lam_nll(lam_nll=1.0)
        GIP.set_lam_kld(lam_kld=(1.0 * scale))
        # get some data to train with
        tr_idx = npr.randint(low=0,high=tr_samples,size=(100,))
        Xd_batch = Xtr.take(tr_idx, axis=0) #binarize_data(Xtr.take(tr_idx, axis=0))
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # do a minibatch update of the model, and compute some costs
        outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch)
        joint_cost = 1.0 * outputs[0]
        data_nll_cost = 1.0 * outputs[1]
        post_kld_cost = 1.0 * outputs[2]
        other_reg_cost = 1.0 * outputs[3]
        if ((i % 1000) == 0):
            print("batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \
                    i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost))
        if ((i % 5000) == 0):
            file_name = "GIP_CHAIN_SAMPLES_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0)
            sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw inference net first layer weights
            file_name = "GIP_INF_WEIGHTS_b{0:d}.png".format(i)
            utils.visualize_net_layer(GIP.IN.shared_layers[0], file_name)
            # draw generator net final layer weights
            file_name = "GIP_GEN_WEIGHTS_b{0:d}.png".format(i)
            utils.visualize_net_layer(GIP.GN.mlp_layers[-1], file_name, use_transpose=True)
    print("TESTING COMPLETE!")
    return
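Note the parenthesization in the learning-rate decay test above: because % binds tighter than +, the unparenthesized form (i+1 % 100000) evaluates to i + 1 and never equals zero for i >= 0, so the decay would never fire. A two-line check:

i = 99999
print((i + 1 % 100000) == 0)     # False: parses as (i + (1 % 100000)) == 0, i.e. (i + 1) == 0
print(((i + 1) % 100000) == 0)   # True exactly when i + 1 is a multiple of 100000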
Example #34
if __name__ == "__main__":
    # TEST CODE FOR MODEL SAVING AND LOADING
    from load_data import load_udm, load_udm_ss, load_mnist
    from NetLayers import relu_actfun, softplus_actfun, \
                          safe_softmax, safe_log

    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[1][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),
                                                      str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    data_dim = Xtr.shape[1]
    batch_size = 128
    prior_dim = 50
    prior_sigma = 1.0
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)
def train_walk_from_pretrained_osm(lam_kld=0.0):
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    data_dim = Xtr.shape[1]
    batch_size = 100
    batch_reps = 5
    prior_sigma = 1.0
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr,axis=1))
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    ###############################
    # Setup discriminator network #
    ###############################
    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [data_dim, (300, 4), (300, 4), 10]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': True}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['init_scale'] = 0.75
    dn_params['lam_l2a'] = 1e-2
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5
    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd, params=dn_params)
    DN.init_biases(0.0)

    #######################################################
    # Load inferencer and generator from saved parameters #
    #######################################################
    gn_fname = RESULT_PATH+"pt_osm_params_b80000_GN.pkl"
    in_fname = RESULT_PATH+"pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)

    ########################################################
    # Define parameters for the VCGLoop, and initialize it #
    ########################################################
    print("Building the VCGLoop...")
    vcgl_params = {}
    vcgl_params['x_type'] = 'bernoulli'
    vcgl_params['xt_transform'] = 'sigmoid'
    vcgl_params['logvar_bound'] = LOGVAR_BOUND
    vcgl_params['cost_decay'] = 0.0
    vcgl_params['chain_type'] = 'walkout'
    vcgl_params['lam_l2d'] = 5e-2
    VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, \
                 i_net=IN, g_net=GN, d_net=DN, chain_len=6, \
                 data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params)

    out_file = open(RESULT_PATH+"pt_walk_results.txt", 'wb')
    ####################################################
    # Train the VCGLoop by unrolling and applying BPTT #
    ####################################################
    learn_rate = 0.0004
    cost_1 = [0. for i in range(10)]
    for i in range(50000):
        scale = float(min((i+1), 5000)) / 5000.0
        if (((i+1) % 25000) == 0):
            learn_rate = learn_rate * 0.9
        ########################################
        # TRAIN THE CHAIN IN FREE-RUNNING MODE #
        ########################################
        VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \
                                mom_1=0.5, mom_2=0.99)
        VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0)
        VCGL.set_lam_chain_nll(1.0)
        VCGL.set_lam_chain_kld(lam_kld)
        # get some data to train with
        tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # examples from the target distribution, to train discriminator
        tr_idx = npr.randint(low=0,high=tr_samples,size=(2*batch_size,))
        Xt_batch = Xtr.take(tr_idx, axis=0)
        # do a minibatch update of the model, and compute some costs
        outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch, batch_reps)
        cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))]
        if ((i % 500) == 0):
            cost_1 = [(v / 500.0) for v in cost_1]
            o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \
                    i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6])
            print(o_str_1)
            cost_1 = [0. for v in cost_1]
        if ((i % 1000) == 0):
            tr_idx = npr.randint(low=0,high=Xtr.shape[0],size=(5,))
            va_idx = npr.randint(low=0,high=Xva.shape[0],size=(5,))
            Xd_batch = np.vstack([Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)])
            # draw some chains of samples from the VAE loop
            file_name = RESULT_PATH+"pt_walk_chain_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xd_batch, 3, axis=0)
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some masked chains of samples from the VAE loop
            file_name = RESULT_PATH+"pt_walk_mask_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0],:], 3, axis=0)
            Xc_samps = np.repeat(Xd_batch, 3, axis=0)
            Xm_rand = sample_masks(Xc_samps, drop_prob=0.0)
            Xm_patch = sample_patch_masks(Xc_samps, (28,28), (16,16))
            Xm_samps = Xm_rand * Xm_patch
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, \
                    X_c=Xc_samps, X_m=Xm_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some samples independently from the GenNet's prior
            file_name = RESULT_PATH+"pt_walk_prior_samples_b{0:d}.png".format(i)
            Xs = VCGL.sample_from_prior(20*20)
            utils.visualize_samples(Xs, file_name, num_rows=20)
        # DUMP PARAMETERS FROM TIME-TO-TIME
        if (i % 10000 == 0):
            DN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_DN.pkl".format(i))
            IN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_GN.pkl".format(i))
    return
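The masked-chain visualization above builds its occlusion masks by multiplying a per-pixel random-drop mask with a square patch mask. The helpers below are our own stand-ins for sample_masks and sample_patch_masks, whose exact semantics we are assuming (1 = observed, 0 = hidden); they need only numpy.

import numpy as np
import numpy.random as npr

def random_drop_mask(X, drop_prob=0.0):
    # keep each pixel independently with probability (1 - drop_prob)
    return (npr.uniform(size=X.shape) > drop_prob).astype('float32')

def random_patch_mask(X, im_shape=(28, 28), patch_shape=(16, 16)):
    # hide one randomly placed patch_shape rectangle per image
    H, W = im_shape
    ph, pw = patch_shape
    M = np.ones((X.shape[0], H, W), dtype='float32')
    for n in range(X.shape[0]):
        r = npr.randint(0, (H - ph) + 1)
        c = npr.randint(0, (W - pw) + 1)
        M[n, r:(r + ph), c:(c + pw)] = 0.0
    return M.reshape((X.shape[0], H * W))

# combined occlusion mask, analogous to `Xm_samps = Xm_rand * Xm_patch` above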
def test_imocld_imp_mnist(step_type='add', occ_dim=14, drop_prob=0.0, attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 16
    dp_int = int(100.0 * drop_prob)
    
    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    att_tag = "NA" # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2*x_dim # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)
    
    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim)], \
                      name="mix_dec_mlp", **inits)
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [               z_dim, 4*dec_dim], \
                     name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # the LSTM modules themselves
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
                n_iter,
                step_type=step_type, # step_type can be 'add' or 'jump'
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                mix_enc_mlp=mix_enc_mlp,
                mix_dec_mlp=mix_dec_mlp,
                mix_var_mlp=mix_var_mlp,
                enc_mlp_in=enc_mlp_in,
                enc_mlp_out=enc_mlp_out,
                enc_rnn=enc_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_rnn=dec_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    draw.initialize()

    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    #draw.load_model_params(f_name="TBCLM_IMP_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("TBCLM_IMP_RESULTS_OD{}_DP{}_{}_{}.txt".format(occ_dim, dp_int, step_type, att_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 1000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        zero_ary = np.zeros((1,))
        draw.lr.set_value(to_fX(zero_ary + learn_rate))
        draw.mom_1.set_value(to_fX(zero_ary + momentum))
        draw.mom_2.set_value(to_fX(zero_ary + 0.99))

        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=None)
        result = draw.train_joint(Xb, Mb)

        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 200) == 0):
            costs = [(v / 200.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            draw.save_model_params("TBCLM_IMP_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva = row_shuffle(Xva)
            Xb = to_fX(Xva[:5000])
            _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=None)
            va_costs = draw.compute_nll_bound(Xb, Mb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
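In test_imocld_imp_mnist the SGD hyperparameters are pushed into the model through Theano shared variables (draw.lr, draw.mom_1, draw.mom_2), so the compiled training function can be reused while the values change every update. A minimal sketch of that pattern, assuming Theano is available; the variable and function names here are ours.

import numpy as np
import theano

floatX = theano.config.floatX
lr = theano.shared(np.zeros((1,), dtype=floatX), name='lr')
mom_1 = theano.shared(np.zeros((1,), dtype=floatX), name='mom_1')
mom_2 = theano.shared(np.zeros((1,), dtype=floatX), name='mom_2')

def set_sgd_values(new_lr, new_mom_1, new_mom_2=0.99):
    # overwrite the shared values in place; compiled functions see the change
    zero_ary = np.zeros((1,), dtype=floatX)
    lr.set_value(zero_ary + new_lr)
    mom_1.set_value(zero_ary + new_mom_1)
    mom_2.set_value(zero_ary + new_mom_2)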
def test_seq_cond_gen_copy(step_type='add', res_tag="AAA"):
    ##############################
    # File tag, for output stuff #
    ##############################
    result_tag = "{}TEST_{}".format(RESULT_PATH, res_tag)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # merge validation set and training set, and test on test set.
    #Xtr = np.concatenate((Xtr, Xva), axis=0)
    #Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))

    # basic params
    batch_size = 128
    traj_len = 20
    im_dim = 28
    obs_dim = im_dim * im_dim

    def sample_batch(np_ary, bs=100):
        row_count = np_ary.shape[0]
        samp_idx = npr.randint(low=0, high=row_count, size=(bs, ))
        xb = np_ary.take(samp_idx, axis=0)
        return xb

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    total_steps = traj_len
    init_steps = 5
    exit_rate = 0.1
    nll_weight = 0.0
    x_dim = obs_dim
    y_dim = obs_dim
    z_dim = 128
    att_spec_dim = 5
    rnn_dim = 512
    mlp_dim = 512

    def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
        seq_len = result[0].shape[0]
        samp_count = result[0].shape[1]
        # get generated predictions
        x_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                x_samps[idx] = result[0][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(x_samps, file_name, num_rows=samp_count)
        # get sequential attention maps
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[1][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get sequential attention maps (read out values)
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[2][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get original input sequences
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[3][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_xs_in_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        return

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # module for doing local 2d read defined by an attention specification
    img_scale = 1.0  # image coords will range over [-img_scale...img_scale]
    read_N = 2  # use NxN grid for reader
    reader_mlp = FovAttentionReader2d(x_dim=obs_dim,
                                      width=im_dim,
                                      height=im_dim,
                                      N=read_N,
                                      img_scale=img_scale,
                                      att_scale=0.5,
                                      **inits)
    read_dim = reader_mlp.read_dim  # total number of "pixels" read by reader

    # MLP for updating belief state based on con_rnn
    writer_mlp = MLP([None, None], [rnn_dim, mlp_dim, obs_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], \
                     [                       z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], \
                     [(read_dim + read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], \
                     [        (read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)

    # mlps for turning LSTM outputs into conditionals over z_gen
    con_mlp_out = CondNet([], [rnn_dim, att_spec_dim], \
                          name="con_mlp_out", **inits)
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)

    # the LSTM modules themselves
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)

    SCG = SeqCondGenRAM(x_and_y_are_seqs=False,
                        total_steps=total_steps,
                        init_steps=init_steps,
                        exit_rate=exit_rate,
                        nll_weight=nll_weight,
                        step_type=step_type,
                        x_dim=obs_dim,
                        y_dim=obs_dim,
                        reader_mlp=reader_mlp,
                        writer_mlp=writer_mlp,
                        con_mlp_in=con_mlp_in,
                        con_mlp_out=con_mlp_out,
                        con_rnn=con_rnn,
                        gen_mlp_in=gen_mlp_in,
                        gen_mlp_out=gen_mlp_out,
                        gen_rnn=gen_rnn,
                        var_mlp_in=var_mlp_in,
                        var_mlp_out=var_mlp_out,
                        var_rnn=var_rnn)
    SCG.initialize()

    compile_start_time = time.time()

    # build the attention trajectory sampler
    SCG.build_attention_funcs()

    # quick test of attention trajectory sampler
    Xb = sample_batch(Xtr, bs=32)
    result = SCG.sample_attention(Xb, Xb)
    visualize_attention(result, pre_tag=result_tag, post_tag="b0")

    # build the main model functions (i.e. training and cost functions)
    SCG.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    # TEST SAVE/LOAD FUNCTIONALITY
    param_save_file = "{}_params.pkl".format(result_tag)
    SCG.save_model_params(param_save_file)
    SCG.load_model_params(param_save_file)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(result_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.95
    for i in range(250000):
        lr_scale = min(1.0, ((i + 1) / 5000.0))
        mom_scale = min(1.0, ((i + 1) / 10000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # set sgd and objective function hyperparams for this update
        SCG.set_sgd_params(lr=lr_scale * learn_rate,
                           mom_1=mom_scale * momentum,
                           mom_2=0.99)
        SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, \
                        lam_kld_amu=0.0, lam_kld_alv=0.1)
        # perform a minibatch update and record the cost for this batch
        Xb = sample_batch(Xtr, bs=batch_size)
        result = SCG.train_joint(Xb, Xb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        # output diagnostic information and checkpoint parameters, etc.
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_term  : {0:.4f}".format(costs[1])
            str4 = "    kld_q2p   : {0:.4f}".format(costs[2])
            str5 = "    kld_p2q   : {0:.4f}".format(costs[3])
            str6 = "    kld_amu   : {0:.4f}".format(costs[4])
            str7 = "    kld_alv   : {0:.4f}".format(costs[5])
            str8 = "    reg_term  : {0:.4f}".format(costs[6])
            joint_str = "\n".join(
                [str1, str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 500) == 0):
            SCG.save_model_params("{}_params.pkl".format(result_tag))
            #############################################
            # check model performance on validation set #
            #############################################
            Xb = sample_batch(Xva, bs=500)
            result = SCG.compute_nll_bound(Xb, Xb)
            str2 = "    va_total_cost: {0:.4f}".format(float(result[0]))
            str3 = "    va_nll_term  : {0:.4f}".format(float(result[1]))
            str4 = "    va_kld_q2p   : {0:.4f}".format(float(result[2]))
            str5 = "    va_kld_p2q   : {0:.4f}".format(float(result[3]))
            str6 = "    va_kld_amu   : {0:.4f}".format(float(result[4]))
            str7 = "    va_kld_alv   : {0:.4f}".format(float(result[5]))
            str8 = "    va_reg_term  : {0:.4f}".format(float(result[6]))
            joint_str = "\n".join([str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            ###########################################
            # sample and draw attention trajectories. #
            ###########################################
            Xb = sample_batch(Xva, bs=32)
            result = SCG.sample_attention(Xb, Xb)
            post_tag = "b{0:d}".format(i)
            visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
Example #38
def test_gc_pair():
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    tr_samples = Xtr.get_value(borrow=True).shape[0]
    data_dim = Xtr.get_value(borrow=True).shape[1]
    mm_proj_dim = 250

    # Do moment matching in some transformed space
    #P = np.identity(data_dim)
    P = npr.randn(data_dim, mm_proj_dim) / np.sqrt(float(mm_proj_dim))
    P = theano.shared(value=P.astype(theano.config.floatX), name='P_proj')

    target_mean, target_cov = projected_moments(Xtr, P, ary_type='theano')
    P = P.get_value(borrow=False).astype(theano.config.floatX)

    ###########################
    # Setup generator network #
    ###########################

    # Choose some parameters for the generative network
    gn_params = {}
    gn_config = [200, 1000, 1000, 28*28]
    gn_params['mlp_config'] = gn_config
    gn_params['lam_l2a'] = 1e-3
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.1
    gn_params['out_type'] = 'bernoulli'
    gn_params['activation'] = relu_actfun
    gn_params['init_scale'] = 4.0

    # Symbolic input matrix to generator network
    Xp_sym = T.matrix(name='Xp_sym')
    Xd_sym = T.matrix(name='Xd_sym')

    # Initialize a generator network object
    GN = GenNet(rng=rng, Xp=Xp_sym, prior_sigma=1.0, params=gn_params)

    ###############################
    # Setup discriminator network #
    ###############################

    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [28*28, (250, 4), (250, 4), 11]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['ear_type'] = 2
    dn_params['ear_lam'] = 0.0
    dn_params['lam_l2a'] = 1e-3
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5

    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd_sym, params=dn_params)

    ########################################################################
    # Initialize the joint controller for the generator/discriminator pair #
    ########################################################################

    gcp_params = {}
    gcp_params['d_net'] = DN
    gcp_params['g_net'] = GN
    gcp_params['lam_l2d'] = 1e-2
    gcp_params['mom_mix_rate'] = 0.05
    gcp_params['mom_match_weight'] = 0.05
    gcp_params['mom_match_proj'] = P
    gcp_params['target_mean'] = target_mean
    gcp_params['target_cov'] = target_cov

    # Initialize a GCPair instance using the previously constructed generator and
    # discriminator networks.
    GCP = GCPair(rng=rng, Xd=Xd_sym, Xp=Xp_sym, d_net=DN, g_net=GN, \
            data_dim=28*28, params=gcp_params)

    gn_learn_rate = 0.02
    dn_learn_rate = 0.01
    GCP.set_gn_sgd_params(learn_rate=gn_learn_rate, momentum=0.98)
    GCP.set_dn_sgd_params(learn_rate=dn_learn_rate, momentum=0.98)
    # Init generator's mean and covariance estimates with many samples
    GCP.init_moments(10000)

    batch_idx = T.lvector('batch_idx')
    batch_sample = theano.function(inputs=[ batch_idx ], \
            outputs=Xtr.take(batch_idx, axis=0))

    for i in range(750000):
        tr_idx = npr.randint(low=0,high=tr_samples,size=(100,)).astype(np.int32)
        Xn_np = GN.sample_from_prior(100)
        Xd_batch = batch_sample(tr_idx)
        Xd_batch = Xd_batch.astype(theano.config.floatX)
        Xn_batch = Xn_np.astype(theano.config.floatX)
        all_idx = np.arange(200)
        data_idx = all_idx[:100]
        noise_idx = all_idx[100:]
        scale = min(1.0, float(i+1)/10000.0)
        GCP.set_disc_weights(dweight_gn=scale, dweight_dn=scale)
        outputs = GCP.train_joint(Xd_batch, Xn_batch, data_idx, noise_idx)
        mom_match_cost = 1.0 * outputs[0]
        disc_cost_gn = 1.0 * outputs[1]
        disc_cost_dn = 1.0 * outputs[2]
        if (((i + 1) % 100000) == 0):
            gn_learn_rate = gn_learn_rate * 0.7
            dn_learn_rate = dn_learn_rate * 0.7
            GCP.set_gn_sgd_params(learn_rate=gn_learn_rate, momentum=0.98)
            GCP.set_dn_sgd_params(learn_rate=dn_learn_rate, momentum=0.98)
        if ((i % 1000) == 0):
            print("batch: {0:d}, mom_match_cost: {1:.4f}, disc_cost_gn: {2:.4f}, disc_cost_dn: {3:.4f}".format( \
                    i, mom_match_cost, disc_cost_gn, disc_cost_dn))
        if ((i % 5000) == 0):
            file_name = "GCP_SAMPLES_b{0:d}.png".format(i)
            Xs = GCP.sample_from_gn(200)
            utils.visualize_samples(Xs, file_name)
            file_name = "GCP_WEIGHTS_b{0:d}.png".format(i)
            utils.visualize(GCP.DN, 0, 0, file_name)
    print("TESTING COMPLETE!")
    return
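# The moment-matching target above is computed in a randomly projected space
# (the `P = npr.randn(data_dim, mm_proj_dim) / np.sqrt(...)` block). Below is a
# minimal NumPy-only sketch of that idea, assuming a data matrix `X` with one
# example per row; the code above keeps P as a Theano shared variable instead.
import numpy as np

def projected_moments_np(X, proj_dim, seed=1234):
    """Estimate mean/covariance of X after a random linear projection."""
    rng = np.random.RandomState(seed)
    data_dim = X.shape[1]
    # scale the projection so projected features keep roughly unit magnitude
    P = rng.randn(data_dim, proj_dim) / np.sqrt(float(proj_dim))
    Xp = X.dot(P)
    proj_mean = np.mean(Xp, axis=0)
    proj_cov = np.cov(Xp, rowvar=False)
    return P, proj_mean, proj_cov
# e.g.: P_np, target_mean_np, target_cov_np = projected_moments_np(X_np, 250)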
def test_mnist_results(step_type='add',
                       imp_steps=6,
                       occ_dim=15,
                       drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) )

    # Load parameters from a previously trained model
    print("Testing model load from file...")
    GPSI = load_gpsimputer_from_file(f_name="{}_PARAMS.pkl".format(result_tag), \
                                     rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_FINAL_RESULTS_NEW.txt".format(result_tag)
    out_file = open(log_name, 'wb')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    str0 = "GUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    vfe = np.mean(nll) + np.mean(kld)
    str1 = "    va_nll_bound : {}".format(vfe)
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str+"\n")
    out_file.flush()
    # record an estimate of performance on the test set
    str0 = "UNGUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    str1 = "    va_nll_bound : {}".format(np.mean(nll))
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str+"\n")
    out_file.flush()
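# `construct_masked_data` is used throughout these tests but is not shown here.
# The sketch below is only a guess at the kind of masking it performs (random
# pixel dropout at rate `drop_prob` plus a square occlusion of side `occ_dim`,
# with missing pixels filled from the mean image); the real helper may differ.
import numpy as np

def make_masked_batch(X, drop_prob, occ_dim, data_mean, im_dim=28, seed=1234):
    rng = np.random.RandomState(seed)
    xo = X.copy()                              # complete target observations
    xm = np.ones_like(X)                       # mask: 1 = observed, 0 = missing
    xm *= (rng.rand(*X.shape) > drop_prob)     # random pixel dropout
    if occ_dim > 0:
        for n in range(X.shape[0]):            # one square occlusion per image
            r = rng.randint(0, im_dim - occ_dim + 1)
            c = rng.randint(0, im_dim - occ_dim + 1)
            m = xm[n].reshape(im_dim, im_dim)
            m[r:r+occ_dim, c:c+occ_dim] = 0.0
            xm[n] = m.reshape(-1)
    xi = (xm * xo) + ((1.0 - xm) * data_mean)  # fill missing pixels with mean
    return xi, xo, xm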
Example #40
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = to_fX(datasets[0][0])
    Xva = to_fX(datasets[1][0])
    Ytr = datasets[0][1]
    Yva = datasets[1][1]

    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    BD = lambda ary: binarize_data(ary)

    #######################################
    # Setup some parameters for the model #
    #######################################
    obs_dim = Xtr.shape[1]
    z_dim = 64
    init_scale = 0.2

    # some InfNet instances to build the TwoStageModel from
    x_in = T.matrix('x_in')
    y_in = T.lvector('y_in')

    ###############
    # q_z_given_x #
    ###############
    print("Building q_z_given_x...")
    params = {}
    shared_config = [obs_dim, 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.2
    params['hid_drop'] = 0.5
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=x_in, \
            params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)


    ###########################################################
    # Define parameters for the ClassModel, and initialize it #
    ###########################################################
    print("Building the ClassModel...")
    CM = ClassModel(rng=rng, \
            x_in=x_in, y_in=y_in, \
            q_z_given_x=q_z_given_x, \
            class_count=10, \
            z_dim=z_dim, \
            use_samples=False)
    CM.set_drop_rate(0.5)
    CM.set_lam_nll(lam_nll=1.0)
    CM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    CM.set_lam_l2w(lam_l2w=1e-5)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    out_file = open("CM_RESULTS.txt", 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 1000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        CM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                          mom_1=scale*momentum, mom_2=0.99)
        # perform a minibatch update and record the cost for this batch
        Xi_tr = Xtr.take(batch_idx, axis=0)
        Yi_tr = Ytr.take(batch_idx, axis=0)
        result = CM.train_joint(Xi_tr, Yi_tr)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        # output useful information about training progress
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost  : {0:.4f}".format(costs[0])
            str3 = "    nll_cost    : {0:.4f}".format(costs[1])
            str4 = "    kld_cost    : {0:.4f}".format(costs[2])
            str5 = "    reg_cost    : {0:.4f}".format(costs[3])
            joint_str = "\n".join([str1, str2, str3, str4, str5])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            #####################################################
            # compute multi-sample estimates of the free-energy #
            #####################################################
            # training set...
            fe_terms = CM.compute_fe_terms(Xtr[0:2500],Ytr[0:2500], 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # validation set...
            Xva, Yva = row_shuffle(Xva, Yva)
            fe_terms = CM.compute_fe_terms(Xva[0:2500], Yva[0:2500], 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            ##########################################################
            # compute multi-sample estimates of classification error #
            ##########################################################
            # training set...
            tr_error, tr_preds = CM.class_error(Xtr[:2500], Ytr[:2500], samples=30)
            joint_str = "    tr-class-error: {0:.4f}".format(tr_error)
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # validation set...
            va_error, va_preds = CM.class_error(Xva[:2500], Yva[:2500], samples=30)
            joint_str = "    va-class-error: {0:.4f}".format(va_error)
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
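# The "multi-sample" error estimates above average over repeated stochastic
# forward passes before picking a class. A minimal sketch of that idea, using
# a hypothetical `predict_proba(X)` that returns one stochastic (e.g. dropout
# noised) set of class probabilities per call; CM.class_error presumably does
# something along these lines internally.
import numpy as np

def multi_sample_class_error(predict_proba, X, Y, samples=30):
    mean_probs = None
    for _ in range(samples):
        p = predict_proba(X)                       # shape: (N, class_count)
        mean_probs = p if mean_probs is None else (mean_probs + p)
    mean_probs = mean_probs / float(samples)       # average over noisy passes
    preds = np.argmax(mean_probs, axis=1)
    error = np.mean(preds != Y)
    return error, preds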
Example #41
def batch_test_ss_mlp_pt(test_count=10, su_count=1000):
    """Setup basic test for semisupervised EAR-regularized MLP."""
    # Set some reasonable sgd parameters
    sgd_params = {}
    sgd_params['start_rate'] = 0.01
    sgd_params['decay_rate'] = 0.998
    sgd_params['wt_norm_bound'] = 3.5
    sgd_params['epochs'] = 1000
    sgd_params['batch_size'] = 100
    sgd_params['result_tag'] = '---'
    # Set some reasonable mlp parameters
    mlp_params = {}
    # Set up some proto-networks
    pc0 = [28 * 28, 800, 800, 11]
    mlp_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {
        'proto_key': 0,
        'input_noise': 0.1,
        'bias_noise': 0.1,
        'do_dropout': True
    }
    sc1 = {
        'proto_key': 0,
        'input_noise': 0.1,
        'bias_noise': 0.1,
        'do_dropout': True
    }
    mlp_params['spawn_configs'] = [sc0, sc1]
    mlp_params['spawn_weights'] = [0.0, 1.0]
    # Set remaining params
    mlp_params['ear_type'] = 5
    mlp_params['ear_lam'] = 1.0
    mlp_params['lam_l2a'] = 1e-2
    mlp_params['reg_all_obs'] = True

    for test_num in range(test_count):
        rng_seed = test_num
        sgd_params['result_tag'] = "test_{0:d}".format(test_num)

        # Initialize a random number generator for this test
        rng = np.random.RandomState(rng_seed)
        # Load some data to train/validate/test with
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, zero_mean=False)

        # Construct the EAR_NET object that we will be training
        x_in = T.matrix('x_in')
        NET = EAR_NET(rng=rng, input=x_in, params=mlp_params)
        init_biases(NET, b_init=0.05)

        ##########################################
        # First, pretrain each layer in the mlp. #
        ##########################################
        sgd_params['result_tag'] = "ss_ear_pt_s{0:d}_t{1:d}".format(
            su_count, test_num)
        sgd_params['batch_size'] = 25
        sgd_params['start_rate'] = 0.02
        sgd_params['epochs'] = 40
        for i in range(len(NET.dae_costs)):
            print("==================================================")
            print("Pretraining hidden layer(s) at depth {0:d}".format(i + 1))
            print("==================================================")
            train_dae(NET, i, sgd_params, datasets)

        # Load some data to train/validate/test with
        rng = np.random.RandomState(rng_seed)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm_ss(dataset, su_count, rng, zero_mean=False)
        # Run semisupervised training on the given MLP
        sgd_params['batch_size'] = 100
        sgd_params['start_rate'] = 0.04
        # Train with weak EAR regularization
        sgd_params['top_only'] = True
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        COMMENT = """
        # Train with no EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 100
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        """
        # Train with weak EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with weak EAR regularization
        sgd_params['epochs'] = 10
        NET.set_ear_lam(1.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 15
        NET.set_ear_lam(1.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 20
        NET.set_ear_lam(2.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 100
        NET.set_ear_lam(3.0)
        train_ss_mlp(NET, sgd_params, datasets)
    return
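# The staged EAR schedule above repeats the same train-then-tighten block with
# growing `ear_lam` and epoch counts. A compact equivalent, as a sketch (same
# schedule values as above, factored into a helper the test loop could call
# after pretraining and the top-only warm-up phase):
def run_ear_schedule(NET, sgd_params, datasets,
                     schedule=((5, 0.5), (10, 1.0), (15, 1.5), (20, 2.0), (100, 3.0))):
    for n_epochs, ear_lam in schedule:
        sgd_params['epochs'] = n_epochs
        NET.set_ear_lam(ear_lam)
        train_ss_mlp(NET, sgd_params, datasets)
    return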
def test_mnist(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}VAE_OD{}_DP{}".format(RESULT_PATH, occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 100
    imp_steps = 15 # we'll check for the best step count (found oracularly)
    init_scale = 1.0

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = [obs_dim, 500, 500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.2)
    ###################
    # p_xip1_given_zi #
    ###################
    params = {}
    shared_config = [z_dim, 500, 500]
    output_config = [obs_dim, obs_dim]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    p_xip1_given_zi.init_biases(0.2)
    ###################
    # q_zi_given_x_xi #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 500, 500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \
            params=params, shared_param_dicts=None)
    q_zi_given_x_xi.init_biases(0.2)


    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['obs_dim'] = obs_dim
    gpsi_params['z_dim'] = z_dim
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = 'jump'
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    gpsi_params['use_osm_mode'] = True
    GPSI = GPSImputer(rng=rng, 
            x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \
            p_zi_given_xi=p_zi_given_xi, \
            p_xip1_given_zi=p_xip1_given_zi, \
            q_zi_given_x_xi=q_zi_given_x_xi, \
            params=gpsi_params, \
            shared_param_dicts=None)
    #########################################################################
    # Define parameters for the underlying OneStageModel, and initialize it #
    #########################################################################
    print("Building the OneStageModel...")
    osm_params = {}
    osm_params['x_type'] = 'bernoulli'
    osm_params['xt_transform'] = 'sigmoid'
    OSM = OneStageModel(rng=rng, \
            x_in=x_in_sym, \
            p_x_given_z=p_xip1_given_zi, \
            q_z_given_x=p_zi_given_xi, \
            x_dim=obs_dim, z_dim=z_dim, \
            params=osm_params)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        OSM.set_sgd_params(lr=scale*learn_rate, \
                           mom_1=scale*momentum, mom_2=0.99)
        OSM.set_lam_nll(lam_nll=1.0)
        OSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
        OSM.set_lam_l2w(1e-4)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX( Xtr.take(batch_idx, axis=0) )
        result = OSM.train_joint(xb, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_cost  : {0:.4f}".format(costs[1])
            str4 = "    kld_cost  : {0:.4f}".format(costs[2])
            str5 = "    reg_cost  : {0:.4f}".format(costs[3])
            joint_str = "\n".join([str1, str2, str3, str4, str5])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            step_nll, step_kld = GPSI.compute_per_step_cost(xi, xo, xm, sample_count=10)
            min_nll = np.min(step_nll)
            str1 = "    va_nll_bound : {}".format(min_nll)
            str2 = "    va_nll_min  : {}".format(min_nll)
            str3 = "    va_nll_final : {}".format(step_nll[-1])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
        if ((i % 5000) == 0):
            # Get some validation samples for evaluating model performance
            xb = to_fX( Xva[0:100] )
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                    occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{}_samples_ng_b{}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # get visualizations of policy parameters
            file_name = "{}_gen_gen_weights_b{}.png".format(result_tag, i)
            W = GPSI.gen_gen_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "{}_gen_inf_weights_b{}.png".format(result_tag, i)
            W = GPSI.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
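# The loop above advances `batch_idx` by `batch_size` each update and reshuffles
# the training set whenever the window runs off the end ("epoch" boundary). A
# self-contained NumPy sketch of that indexing pattern:
import numpy as np

def minibatch_indices(n_samples, batch_size, n_updates, seed=1234):
    """Yield index arrays that sweep the data in order, reshuffling each epoch."""
    rng = np.random.RandomState(seed)
    order = rng.permutation(n_samples)
    batch_idx = np.arange(batch_size)
    for _ in range(n_updates):
        if np.max(batch_idx) >= n_samples:
            order = rng.permutation(n_samples)  # finished an epoch: reshuffle
            batch_idx = np.arange(batch_size)
        yield order[batch_idx]
        batch_idx = batch_idx + batch_size
# e.g.: for idx in minibatch_indices(Xtr.shape[0], 200, 1000): xb = Xtr[idx]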
def test_seq_cond_gen_copy(step_type='add', res_tag="AAA"):
    ##############################
    # File tag, for output stuff #
    ##############################
    result_tag = "{}TEST_{}".format(RESULT_PATH, res_tag)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # merge validation set and training set, and test on test set.
    #Xtr = np.concatenate((Xtr, Xva), axis=0)
    #Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))

    # basic params
    batch_size = 128
    traj_len = 20
    im_dim = 28
    obs_dim = im_dim*im_dim

    def sample_batch(np_ary, bs=100):
        row_count = np_ary.shape[0]
        samp_idx = npr.randint(low=0,high=row_count,size=(bs,))
        xb = np_ary.take(samp_idx, axis=0)
        return xb

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    total_steps = traj_len
    init_steps = 5
    exit_rate = 0.1
    nll_weight = 0.0
    x_dim = obs_dim
    y_dim = obs_dim
    z_dim = 128
    att_spec_dim = 5
    rnn_dim = 512
    mlp_dim = 512

    def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
        seq_len = result[0].shape[0]
        samp_count = result[0].shape[1]
        # get generated predictions
        x_samps = np.zeros((seq_len*samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                x_samps[idx] = result[0][s2,s1,:]
                idx += 1
        file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(x_samps, file_name, num_rows=samp_count)
        # get sequential attention maps
        seq_samps = np.zeros((seq_len*samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[1][s2,s1,:]
                idx += 1
        file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get sequential attention maps (read out values)
        seq_samps = np.zeros((seq_len*samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[2][s2,s1,:]
                idx += 1
        file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get original input sequences
        seq_samps = np.zeros((seq_len*samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[3][s2,s1,:]
                idx += 1
        file_name = "{0:s}_traj_xs_in_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        return

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # module for doing local 2d read defined by an attention specification
    img_scale = 1.0 # image coords will range over [-img_scale...img_scale]
    read_N = 2      # use NxN grid for reader
    reader_mlp = FovAttentionReader2d(x_dim=obs_dim,
                                      width=im_dim, height=im_dim, N=read_N,
                                      img_scale=img_scale, att_scale=0.5,
                                      **inits)
    read_dim = reader_mlp.read_dim # total number of "pixels" read by reader

    # MLP for updating belief state based on con_rnn
    writer_mlp = MLP([None, None], [rnn_dim, mlp_dim, obs_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], \
                     [                       z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], \
                     [(read_dim + read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], \
                     [        (read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)

    # mlps for turning LSTM outputs into conditionals over z_gen
    con_mlp_out = CondNet([], [rnn_dim, att_spec_dim], \
                          name="con_mlp_out", **inits)
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)

    # the LSTMs themselves: controller, generator, and variational (inference) modules
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)

    SCG = SeqCondGenRAM(
                x_and_y_are_seqs=False,
                total_steps=total_steps,
                init_steps=init_steps,
                exit_rate=exit_rate,
                nll_weight=nll_weight,
                step_type=step_type,
                x_dim=obs_dim,
                y_dim=obs_dim,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                con_mlp_in=con_mlp_in,
                con_mlp_out=con_mlp_out,
                con_rnn=con_rnn,
                gen_mlp_in=gen_mlp_in,
                gen_mlp_out=gen_mlp_out,
                gen_rnn=gen_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    SCG.initialize()

    compile_start_time = time.time()

    # build the attention trajectory sampler
    SCG.build_attention_funcs()

    # quick test of attention trajectory sampler
    Xb = sample_batch(Xtr, bs=32)
    result = SCG.sample_attention(Xb, Xb)
    visualize_attention(result, pre_tag=result_tag, post_tag="b0")

    # build the main model functions (i.e. training and cost functions)
    SCG.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    # TEST SAVE/LOAD FUNCTIONALITY
    param_save_file = "{}_params.pkl".format(result_tag)
    SCG.save_model_params(param_save_file)
    SCG.load_model_params(param_save_file)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(result_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.95
    for i in range(250000):
        lr_scale = min(1.0, ((i+1) / 5000.0))
        mom_scale = min(1.0, ((i+1) / 10000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # set sgd and objective function hyperparams for this update
        SCG.set_sgd_params(lr=lr_scale*learn_rate, mom_1=mom_scale*momentum, mom_2=0.99)
        SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, \
                        lam_kld_amu=0.0, lam_kld_alv=0.1)
        # perform a minibatch update and record the cost for this batch
        Xb = sample_batch(Xtr, bs=batch_size)
        result = SCG.train_joint(Xb, Xb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        # output diagnostic information and checkpoint parameters, etc.
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_term  : {0:.4f}".format(costs[1])
            str4 = "    kld_q2p   : {0:.4f}".format(costs[2])
            str5 = "    kld_p2q   : {0:.4f}".format(costs[3])
            str6 = "    kld_amu   : {0:.4f}".format(costs[4])
            str7 = "    kld_alv   : {0:.4f}".format(costs[5])
            str8 = "    reg_term  : {0:.4f}".format(costs[6])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 500) == 0):
            SCG.save_model_params("{}_params.pkl".format(result_tag))
            #############################################
            # check model performance on validation set #
            #############################################
            Xb = sample_batch(Xva, bs=500)
            result = SCG.compute_nll_bound(Xb, Xb)
            str2 = "    va_total_cost: {0:.4f}".format(float(result[0]))
            str3 = "    va_nll_term  : {0:.4f}".format(float(result[1]))
            str4 = "    va_kld_q2p   : {0:.4f}".format(float(result[2]))
            str5 = "    va_kld_p2q   : {0:.4f}".format(float(result[3]))
            str6 = "    va_kld_amu   : {0:.4f}".format(float(result[4]))
            str7 = "    va_kld_alv   : {0:.4f}".format(float(result[5]))
            str8 = "    va_reg_term  : {0:.4f}".format(float(result[6]))
            joint_str = "\n".join([str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            ###########################################
            # sample and draw attention trajectories. #
            ###########################################
            Xb = sample_batch(Xva, bs=32)
            result = SCG.sample_attention(Xb, Xb)
            post_tag = "b{0:d}".format(i)
            visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
Example #44
def test_dA(learning_rate=0.1, training_epochs=30,
            dataset='./data/mnist.pkl.gz',
            batch_size=25, output_folder='dA_plots'):

    """
    Blargh!
    """
    datasets = load_udm(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(inputs=[index], outputs=[cost], updates=updates,
            givens={ x: train_set_x[index*batch_size:(index+1)*batch_size,:] })

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()

        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

        image = PIL.Image.fromarray(
            tile_raster_images(X=da.W.get_value(borrow=True).T,
                               img_shape=(28, 28), tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_corruption_00.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
         givens={x: train_set_x[index * batch_size:
                                  (index + 1) * batch_size]})

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()

        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

        image = PIL.Image.fromarray(
            tile_raster_images(X=da.W.get_value(borrow=True).T,
                               img_shape=(28, 28), tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_corruption_30.png')

    os.chdir('../')
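# The two runs above differ only in the corruption level passed to
# get_cost_updates. A minimal NumPy sketch of the underlying denoising
# autoencoder computation (binomial input corruption, tied weights W, and a
# cross-entropy reconstruction cost against the *clean* input), assuming
# sigmoid units as in the dA above:
import numpy

def dae_cost_np(X, W, b_hid, b_vis, corruption_level, rng):
    sigmoid = lambda a: 1.0 / (1.0 + numpy.exp(-a))
    # corrupt the input by zeroing a random subset of visible units
    keep = rng.binomial(n=1, p=(1.0 - corruption_level), size=X.shape)
    X_tilde = X * keep
    H = sigmoid(numpy.dot(X_tilde, W) + b_hid)        # encode
    X_hat = sigmoid(numpy.dot(H, W.T) + b_vis)        # decode with tied weights
    eps = 1e-8                                        # avoid log(0)
    ce = -numpy.sum(X * numpy.log(X_hat + eps) +
                    (1.0 - X) * numpy.log(1.0 - X_hat + eps), axis=1)
    return numpy.mean(ce)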