Example #1
def search_model_adam(state, channel, reload_model=False):

    pp.pprint(state)
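
    # Both noisy activations below add Gaussian noise to the pre-activation
    # before applying the rectifier Trect: NReLU scales the noise std by
    # Sigmoid(x), while NRect uses a fixed std. Trect and Sigmoid are assumed
    # to be elementwise ops provided by the surrounding codebase.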

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True,
                 vgen=None,
                 use_bow_out=False,
                 debug=False,
                 output_map=None):
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), TT.fmatrix("mask"), \
                    TT.fmatrix("cost_mask")
            qmask = TT.fmatrix("qmask")
            bow_out = TT.ftensor3("bow_out")

            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
                qmask.tag.test_value = batch["qmask"].astype("float32")
                if use_bow_out:
                    bow_out.tag.test_value = batch['bow_out'].astype("float32")

            if output_map:
                outs = {}
                outs["X"] = X
                outs["y"] = y
                outs["mask"] = mask
                outs["cmask"] = cmask
                if use_bow_out:
                    outs["bow_out"] = bow_out
                outs["qmask"] = qmask
            else:
                outs = [X, y, mask, cmask]
                if use_bow_out:
                    outs += [bow_out]
            return outs
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]
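
    # With debug on, compute_test_value = "warn" makes Theano propagate the
    # test values attached above through each op as the graph is constructed,
    # so shape mismatches surface at definition time instead of at the first
    # call of the compiled function.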

    lr = state.lr
    batch_size = state.batch_size
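
    # `state` is assumed to be a dict-like experiment configuration that also
    # supports attribute access (e.g. a jobman-style DD): this example reads
    # state.lr, while later examples read state['lr'] and state.get(...).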

    seed = state.get("seed", 3)
    seed_path = "{0}/seed_{1}.txt".format(state.save_path, str(seed))
    replace_seed(seed_path, seed)
    seed_setter = SEEDSetter(seed_path)
    print "seed is", seed_setter

    # Number of elements in the columns of the memory content
    mem_size = state.mem_size

    # Number of rows in the memory matrix M
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    # Number of hidden units in the controller
    n_hids = state.n_hids

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state.use_ff_controller

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', True)
    use_reinforce = state.get('use_reinforce', False)

    max_seq_len = state.max_seq_len
    max_fact_len = state.max_fact_len

    n_read_heads = state.n_read_heads
    n_write_heads = 1
    n_reading_steps = state.n_reading_steps

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address vectors in memory
    address_size = state.address_size
    w2v_embed_scale = 0.05
    use_layer_norm = state.get('use_layer_norm', False)

    rng = np.random.RandomState(int(seed))
    trng = RandomStreams(int(seed))
    # Capture the function before rebinding the name, otherwise the lambda
    # would recurse into itself.
    nrect_fn = NRect
    NRect = lambda x, use_noise=False: nrect_fn(
        x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False
    emb_scale = state.get('emb_scale', 0.32)

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    import sys
    sys.setrecursionlimit(50000)
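    # The limit is raised because pickling/compiling deep scan-based Theano
    # graphs can exceed the default recursion limit of ~1000.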

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))
    task_id = state.task_id
    print "Task id is, ", task_id
    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', True)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state.use_reinforce_baseline
    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', 6e-4)
    anticorr = state.get('anticorr', None)
    path = state.path
    prfx = (
        "ntm_on_fb_BABI_task_%(task_id)d_seed_%(seed)s_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()
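
    # %-formatting against locals() bakes the key hyperparameters into the
    # prefix used for checkpoint and log filenames.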

    prfx = state.save_path + prfx
    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True,
        max_seq_len=max_seq_len,
        max_fact_len=max_fact_len,
        task_id=task_id,
        task_path=path,
        mode='train',
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    tst_data_gen = FBbABIDataIteratorSingleQ(
        task_file='../all_tasks_test_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    use_mask = True
    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen,
                    debug=debug,
                    use_bow_out=bowout,
                    output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())
    ntm = NTMModel(n_in=len(tdata_gen.vocab.items()),
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=len(tdata_gen.vocab.items()),
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   emb_scale=emb_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_layer_norm=use_layer_norm,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_fact_len,
                   softmax=True,
                   use_mask=use_mask,
                   batch_size=batch_size)

    bow_weight_stop = state.get('bow_weight_stop', 1.2 * 1e-1)
    bow_weight_anneal_start = state.get('bow_weight_anneal_start', 320)
    bow_weight_start = state.get("bow_weight_start", 0.74)
    bow_out_anneal_rate = state.get("bow_out_anneal_rate", 2 * 1e-4)
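
    # The auxiliary bag-of-words loss weight is annealed from bow_weight_start
    # toward bow_weight_stop at bow_out_anneal_rate once bow_weight_anneal_start
    # updates have passed; the schedule itself presumably lives inside
    # FBaBIMainLoop, which receives these values below.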
    save_freq = state.get("save_freq", 1000)

    main_loop = FBaBIMainLoop(ntm,
                              print_every=50,
                              checkpoint_every=save_freq,
                              validate_every=500,
                              bow_out_anneal_rate=bow_out_anneal_rate,
                              bow_weight_start=bow_weight_start,
                              bow_weight_stop=bow_weight_stop,
                              bow_weight_anneal_start=bow_weight_anneal_start,
                              train_data_gen=tdata_gen,
                              valid_data_gen=vdata_gen,
                              test_data_gen=tst_data_gen,
                              learning_rate=lr,
                              reload_model=reload_model,
                              valid_iters=None,
                              linear_start=False,
                              use_qmask=True,
                              max_iters=state.max_iters,
                              state=state,
                              prefix=prfx)
    main_loop.run()

    if channel is None:
        return None
    return channel.COMPLETE
Example #2
def search_model_adam(state, channel, reload_model=False):

    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False, output_map=None):

        if use_mask:
            X, y, mask, cmask = TT.ftensor3("X"), TT.ftensor3("y"), \
                                    TT.fmatrix("mask"), TT.ftensor3("cost_mask")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("float32")
                y.tag.test_value = batch['y'].astype("float32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")

            if output_map:
                outs = {}
                outs["X"] = X
                outs["y"] = y
                outs["mask"] = mask
                outs["cmask"] = cmask
            else:
                outs = [X, y, mask, cmask]

            return outs
        else:
            X, y = TT.tensor3("X"), TT.tensor3("y")

            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state['lr']
    batch_size = state['batch_size']

    # Number of elements in the columns of the memory content
    mem_size = state['mem_size']

    # Number of rows in the memory matrix M
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    max_len = 10
    inp_size = 10

    # Number of hidden units in the controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)

    seed = 7

    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address vectors in memory
    address_size = state['address_size']
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Capture the function before rebinding the name, otherwise the lambda
    # would recurse into itself.
    nrect_fn = NRect
    NRect = lambda x, use_noise=False: nrect_fn(x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)
    mode = state.get('theano_function_mode', None)

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']
    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)

    anticorr = state.get('anticorr', None)

    prfx = ("ntm_copy_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d_rs_%(renormalization_scale)f"
            "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)d_lr_%(lr)f_%(address_size)d") % locals()

    save_path = state.get("save_path", ".")
    prfx = save_path + prfx

    tdata_gen = CopyDataGen(batch_size,
                            max_len,
                            inp_size,
                            rng=rng,
                            seed=seed,
                            rnd_len=True)

    vdata_gen = CopyDataGen(batch_size,
                            max_len,
                            inp_size,
                            rng=rng,
                            seed=2,
                            rnd_len=False)

    tst_data_gen = CopyDataGen(batch_size,
                               max_len,
                               inp_size,
                               rng=rng,
                               seed=3,
                               rnd_len=False)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=inp_size,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_len,
                   softmax=False,
                   batch_size=batch_size)

    save_freq = state.get("save_freq", 1000)

    main_loop = NTMToyMainLoop(ntm,
                               print_every=50,
                               checkpoint_every=save_freq,
                               validate_every=500,
                               train_data_gen=tdata_gen,
                               valid_data_gen=vdata_gen,
                               test_data_gen=tst_data_gen,
                               learning_rate=lr,
                               reload_model=reload_model,
                               valid_iters=200,
                               max_iters=state['max_iters'],
                               state=state,
                               prefix=prfx)
    main_loop.run()
Example #3
def search_model_adam(state, channel, reload_model=False):

    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(vgen=None, debug=False, output_map=None):

        X, y = TT.fmatrix("X"), TT.vector("y", dtype="uint8")

        if debug:
            theano.config.compute_test_value = "warn"
            batch = vgen.get_epoch_iterator().next()
            X.tag.test_value = batch[0].reshape((batch[0].shape[0], -1))
            y.tag.test_value = batch[1].flatten()
        return [X, y]
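
    # Images arrive flattened to (batch, 784); with inp_size = 1 below, the
    # model consumes them one pixel per step (sequential MNIST).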

    lr = state['lr']
    batch_size = state['batch_size']

    # Number of elements in the columns of the memory content
    mem_size = state['mem_size']

    # Number of rows in the memory matrix M
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', False)

    max_len = 784
    inp_size = 1

    # Number of hidden units in the controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)
    permute_order = state.get('permute_order', True)

    seed = 7
    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address vectors in memory
    address_size = state["address_size"]
    w2v_embed_scale = 0.05
    n_out = 10

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Capture the function before rebinding the name, otherwise the lambda
    # would recurse into itself.
    nrect_fn = NRect
    NRect = lambda x, use_noise=False: nrect_fn(
        x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)
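
    # idxs below is a fixed random permutation of the 784 pixel positions;
    # together with permute_order this presumably turns the task into
    # permuted sequential MNIST.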

    idxs = np.arange(max_len)
    # Use the seeded rng rather than the global numpy RNG, so the
    # permutation is reproducible across runs.
    rng.shuffle(idxs)
    use_batch_norm = state.get("use_batch_norm", False)
    anticorr = state.get('anticorr', None)
    prfx = (
        "ntm_on_fb_copy_task_all_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f_use_bn_%(use_batch_norm)d_hard2"
    ) % locals()

    tdata_gen = get_stream(which_set="train", batch_size=batch_size)

    vdata_gen = get_stream(which_set="valid", batch_size=batch_size)

    tst_data_gen = get_stream(which_set="test", batch_size=batch_size)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=1e-3,
                         rng=rng,
                         init_method=BiasInitMethods.Random,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=n_out,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   use_batch_norm=use_batch_norm,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=True,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   use_inp_content=False,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=False,
                   use_noise=use_noise,
                   rnd_indxs=idxs,
                   permute_order=permute_order,
                   max_fact_len=max_len,
                   softmax=True,
                   batch_size=None)

    save_freq = state.get("save_freq", 1000)

    main_loop = SeqMNISTMainLoop(ntm,
                                 print_every=50,
                                 checkpoint_every=save_freq,
                                 validate_every=500,
                                 train_data_gen=tdata_gen,
                                 valid_data_gen=vdata_gen,
                                 test_data_gen=tst_data_gen,
                                 learning_rate=lr,
                                 reload_model=reload_model,
                                 num_epochs=250,
                                 state=state,
                                 prefix=prfx)
    main_loop.run()
Example #4
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)
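
    # locals().copy() above snapshots every keyword argument as the model
    # config; entries that would not pickle cleanly (eyem, cost_mask) are
    # removed next.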

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open('%s.pkl' % mpath_best, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open('%s.pkl' % mpath, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g)
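
    # The switch above implements global-norm clipping: when the squared
    # global norm g2 exceeds clip_c**2, each gradient is rescaled by
    # clip_c / sqrt(g2), bounding the update magnitude without changing its
    # direction. The pop below drops the last input (ent_mask), presumably
    # because the training functions are compiled without it.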
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best,
                                history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_count = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
Example #5
def search_model_adam_gru_soft(state, channel):
    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False):
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), TT.fmatrix(
                "mask"), TT.fmatrix("cost_mask")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
            return [X, y, mask, cmask]
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state.lr
    batch_size = state.batch_size
    seed = state.get("seed", 3)

    # Number of elements in the columns of the memory content
    mem_size = state.mem_size

    # Number of rows in the memory matrix M
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size

    # Number of hidden units in the controller
    n_hids = state.n_hids

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.n_hids

    # ff controller
    use_ff_controller = True

    # For RNN controller:
    learn_h0 = True
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = False

    max_seq_len = 100
    max_fact_len = 12

    n_read_heads = 1
    n_write_heads = 1
    n_reading_steps = 1

    lambda1_rein = 4e-5
    lambda2_rein = 1e-5
    base_reg = 3e-5

    # Size of the address vectors in memory
    address_size = 20
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Capture the function before rebinding the name, otherwise the lambda
    # would recurse into itself.
    nrect_fn = NRect
    NRect = lambda x, use_noise=False: nrect_fn(
        x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False

    use_quad_interactions = True
    mode = None
    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=10)
    task_id = state.task_id

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    w2v_embed_path = None

    use_reinforce_baseline = False

    l1_pen = 7e-4
    l2_pen = 9e-4
    debug = False

    path = "/data/lisatmp3/gulcehrc/data/tasks_1-20_v1-2/en-10k/splitted_trainval/"
    prfx = (
        "ntm_on_fb_BABI_task_all__learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()

    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True,
        max_seq_len=max_seq_len,
        max_fact_len=max_fact_len,
        task_id=task_id,
        task_path=path,
        mode='train',
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    inps = get_inps(vgen=vdata_gen, debug=debug)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())

    ntm = NTMModel(n_in=len(tdata_gen.vocab.items()),
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=len(tdata_gen.vocab.items()),
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_gru_inp_rep=False,
                   use_bow_input=True,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=False,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_fact_len,
                   softmax=True,
                   batch_size=batch_size)

    main_loop = FBaBIMainLoop(ntm,
                              print_every=40,
                              checkpoint_every=400,
                              validate_every=100,
                              train_data_gen=tdata_gen,
                              valid_data_gen=vdata_gen,
                              learning_rate=lr,
                              reload_model=False,
                              valid_iters=None,
                              linear_start=False,
                              max_iters=state.max_iters,
                              prefix=prfx)
    main_loop.run()

    return channel.COMPLETE