# Experiment launchers for NTM models (Theano / Python 2).
import sys
import pprint as pp
from functools import partial

import numpy as np

import theano
import theano.tensor as TT
# The MRG sampler is assumed here; Theano's shared_randomstreams.RandomStreams
# exposes the same normal(size, avg, std, dtype) interface.
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# The remaining dependencies are project-local. Their exact module paths are
# an assumption and must be adjusted to the repository layout:
#   Adam (learning rule); Sigmoid, Tanh, Trect (activations);
#   NTMModel; WeightInitializer, BiasInitializer, InitMethods,
#   BiasInitMethods; SEEDSetter, replace_seed; FBbABIDataIteratorSingleQ,
#   CopyDataGen, get_stream; FBaBIMainLoop, SeqMNISTMainLoop, NTMToyMainLoop.


# ---------------------------------------------------------------------------
# NTM on the Facebook bABI tasks, trained with Adam. Each runner below comes
# from a separate experiment script; the name search_model_adam is reused
# across scripts.
# ---------------------------------------------------------------------------
def search_model_adam(state, channel, reload_model=False):
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        # Noisy rectifier: additive Gaussian noise whose std is a sigmoid
        # of the pre-activation.
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier with a fixed noise std.
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, use_bow_out=False,
                 debug=False, output_map=None):
        # Build the symbolic inputs of the model; in debug mode, attach
        # test values taken from a validation batch.
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), \
                TT.fmatrix("mask"), TT.fmatrix("cost_mask")
            qmask = TT.fmatrix("qmask")
            bow_out = TT.ftensor3("bow_out")

            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
                qmask.tag.test_value = batch["qmask"].astype("float32")
                if use_bow_out:
                    bow_out.tag.test_value = batch['bow_out'].astype("float32")

            if output_map:
                outs = {}
                outs["X"] = X
                outs["y"] = y
                outs["mask"] = mask
                outs["cmask"] = cmask
                if use_bow_out:
                    outs["bow_out"] = bow_out
                outs["qmask"] = qmask
            else:
                outs = [X, y, mask, cmask]
                if use_bow_out:
                    outs += [bow_out]
            return outs
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state.lr
    batch_size = state.batch_size

    seed = state.get("seed", 3)
    seed_path = "{0}/seed_{1}.txt".format(state.save_path, str(seed))
    replace_seed(seed_path, seed)
    seed_setter = SEEDSetter(seed_path)
    print "seed is", seed_setter

    # Number of elements in each column of the memory's content matrix.
    mem_size = state.mem_size

    # Number of rows in the memory matrix M.
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    # Number of hidden units of the controller.
    n_hids = state.n_hids

    # Deep output is not used; kept for the constructor's signature.
    deep_out_size = 100

    # Size of the bag-of-words embeddings.
    bow_size = state.get('bow_size', 80)

    # Whether to use a feedforward controller.
    use_ff_controller = state.use_ff_controller

    # For an RNN controller: whether to learn the initial hidden state.
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Whether to use location-based addressing.
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', True)
    use_reinforce = state.get('use_reinforce', False)

    max_seq_len = state.max_seq_len
    max_fact_len = state.max_fact_len

    n_read_heads = state.n_read_heads
    n_write_heads = 1
    n_reading_steps = state.n_reading_steps

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address portion of each memory row.
    address_size = state.address_size
    w2v_embed_scale = 0.05
    use_layer_norm = state.get('use_layer_norm', False)

    rng = np.random.RandomState(int(seed))
    trng = RandomStreams(int(seed))
    # Bind the Theano RNG and the noise std into NRect. partial avoids the
    # infinite recursion the original self-shadowing lambda
    # (NRect = lambda x: NRect(...)) would cause.
    NRect = partial(NRect, rng=trng, std=std)
    use_noise = False

    emb_scale = state.get('emb_scale', 0.32)
    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    task_id = state.task_id
    print "Task id is, ", task_id

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', True)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state.use_reinforce_baseline

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)

    correlation_ws = state.get('correlation_ws', 6e-4)
    anticorr = state.get('anticorr', None)

    path = state.path
    prfx = (
        "ntm_on_fb_BABI_task_%(task_id)d_seed_%(seed)s_learn_h0_l1_no_n_hids_"
        "%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()
    prfx = state.save_path + prfx

    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True,
        max_seq_len=max_seq_len,
        max_fact_len=max_fact_len,
        task_id=task_id,
        task_path=path,
        mode='train',
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    tst_data_gen = FBbABIDataIteratorSingleQ(
        task_file='../all_tasks_test_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    use_mask = True
    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug,
                    use_bow_out=bowout, output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())

    ntm = NTMModel(n_in=len(tdata_gen.vocab.items()),
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=len(tdata_gen.vocab.items()),
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   emb_scale=emb_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_layer_norm=use_layer_norm,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_fact_len,
                   softmax=True,
                   use_mask=use_mask,
                   batch_size=batch_size)

    # Annealing schedule for the auxiliary bag-of-words prediction cost.
    bow_weight_stop = state.get('bow_weight_stop', 1.2 * 1e-1)
    bow_weight_anneal_start = state.get('bow_weight_anneal_start', 320)
    bow_weight_start = state.get("bow_weight_start", 0.74)
    bow_out_anneal_rate = state.get("bow_out_anneal_rate", 2 * 1e-4)
    save_freq = state.get("save_freq", 1000)

    main_loop = FBaBIMainLoop(ntm,
                              print_every=50,
                              checkpoint_every=save_freq,
                              validate_every=500,
                              bow_out_anneal_rate=bow_out_anneal_rate,
                              bow_weight_start=bow_weight_start,
                              bow_weight_stop=bow_weight_stop,
                              bow_weight_anneal_start=bow_weight_anneal_start,
                              train_data_gen=tdata_gen,
                              valid_data_gen=vdata_gen,
                              test_data_gen=tst_data_gen,
                              learning_rate=lr,
                              reload_model=reload_model,
                              valid_iters=None,
                              linear_start=False,
                              use_qmask=True,
                              max_iters=state.max_iters,
                              state=state,
                              prefix=prfx)
    main_loop.run()

    if channel is None:
        return None
    return channel.COMPLETE
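# A minimal invocation sketch for the bABI runner above (illustrative, not a
# tuned configuration). The runner reads `state` both via attribute access
# (state.lr) and via .get(...), so a plain dict is not enough; AttrDict is a
# hypothetical stand-in for whatever attribute-dict the surrounding
# experiment framework provides.
class AttrDict(dict):
    __getattr__ = dict.__getitem__

    def __setattr__(self, key, value):
        self[key] = value


def example_babi_run():
    # All field values below are placeholders, not tuned hyper-parameters.
    state = AttrDict(lr=1e-3, batch_size=160, seed=3,
                     save_path="./", path="./data/",
                     mem_size=28, mem_nel=128, std=0.05,
                     renormalization_scale=5.0, sub_mb_size=160,
                     n_hids=180, use_ff_controller=True,
                     max_seq_len=100, max_fact_len=12,
                     n_read_heads=1, n_reading_steps=1,
                     address_size=20, use_reinforce_baseline=False,
                     task_id=1, max_iters=80000)
    search_model_adam(state, channel=None)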
# ---------------------------------------------------------------------------
# NTM on sequential / permuted MNIST, trained with Adam (separate script;
# the runner name is reused).
# ---------------------------------------------------------------------------
def search_model_adam(state, channel, reload_model=False):
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        # Noisy rectifier: additive Gaussian noise whose std is a sigmoid
        # of the pre-activation.
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier with a fixed noise std. This helper mirrors the
        # one in the bABI script above; the original omitted it, which left
        # the binding below self-referential.
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(vgen=None, debug=False, output_map=None):
        # MNIST inputs: a flattened image matrix and uint8 labels.
        # output_map is accepted for API symmetry with the other runners
        # but is unused here.
        X, y = TT.fmatrix("X"), TT.vector("y", dtype="uint8")
        if debug:
            theano.config.compute_test_value = "warn"
            batch = vgen.get_epoch_iterator().next()
            X.tag.test_value = batch[0].reshape((batch[0].shape[0], -1))
            y.tag.test_value = batch[1].flatten()
        return [X, y]

    lr = state['lr']
    batch_size = state['batch_size']

    # Number of elements in each column of the memory's content matrix.
    mem_size = state['mem_size']

    # Number of rows in the memory matrix M.
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', False)

    # One pixel per step over a 28x28 image.
    max_len = 784
    inp_size = 1

    # Number of hidden units of the controller.
    n_hids = state['n_hids']

    # Deep output is not used; kept for the constructor's signature.
    deep_out_size = 100

    # Size of the bag-of-words embeddings.
    bow_size = state.get('bow_size', 80)

    # Whether to use a feedforward controller.
    use_ff_controller = state['use_ff_controller']

    # For an RNN controller: whether to learn the initial hidden state.
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Whether to use location-based addressing.
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)
    permute_order = state.get('permute_order', True)

    seed = 7

    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address portion of each memory row.
    address_size = state["address_size"]
    w2v_embed_scale = 0.05
    n_out = 10

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Bind the Theano RNG and noise std into NRect (partial avoids the
    # self-referential lambda of the original).
    NRect = partial(NRect, rng=trng, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)

    # Fixed pixel permutation for the permuted-MNIST setting.
    idxs = np.arange(max_len)
    np.random.shuffle(idxs)

    use_batch_norm = state.get("use_batch_norm", False)
    anticorr = state.get('anticorr', None)

    prfx = (
        "ntm_on_fb_copy_task_all_learn_h0_l1_no_n_hids_%(n_hids)s"
        "_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
        "_use_bn_%(use_batch_norm)d_hard2"
    ) % locals()

    tdata_gen = get_stream(which_set="train", batch_size=batch_size)
    vdata_gen = get_stream(which_set="valid", batch_size=batch_size)
    tst_data_gen = get_stream(which_set="test", batch_size=batch_size)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=1e-3,
                         rng=rng,
                         init_method=BiasInitMethods.Random,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=n_out,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   use_batch_norm=use_batch_norm,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=True,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   use_inp_content=False,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=False,
                   use_noise=use_noise,
                   rnd_indxs=idxs,
                   permute_order=permute_order,
                   max_fact_len=max_len,
                   softmax=True,
                   batch_size=None)

    save_freq = state.get("save_freq", 1000)

    main_loop = SeqMNISTMainLoop(ntm,
                                 print_every=50,
                                 checkpoint_every=save_freq,
                                 validate_every=500,
                                 train_data_gen=tdata_gen,
                                 valid_data_gen=vdata_gen,
                                 test_data_gen=tst_data_gen,
                                 learning_rate=lr,
                                 reload_model=reload_model,
                                 num_epochs=250,
                                 state=state,
                                 prefix=prfx)
    main_loop.run()
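# Sketch of a driver for the sequential-MNIST runner above. This variant
# reads `state` only with dict indexing and .get(), so a plain dict works;
# the values are placeholders rather than tuned settings.
def example_seq_mnist_run():
    state = {"lr": 1e-3, "batch_size": 100,
             "mem_size": 28, "mem_nel": 102, "std": 0.05,
             "renormalization_scale": 5.0, "sub_mb_size": 100,
             "n_hids": 100, "use_ff_controller": False,
             "n_read_heads": 1, "n_reading_steps": 1,
             "address_size": 20, "use_reinforce_baseline": False}
    search_model_adam(state, channel=None)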
# ---------------------------------------------------------------------------
# NTM on the copy toy task, trained with Adam (separate script; the runner
# name is reused).
# ---------------------------------------------------------------------------
def search_model_adam(state, channel, reload_model=False):
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        # Noisy rectifier: additive Gaussian noise whose std is a sigmoid
        # of the pre-activation.
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier with a fixed noise std; mirrors the helper from
        # the bABI script, which the binding below needs (the original
        # omitted it).
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False, output_map=None):
        # Symbolic inputs for the copy task; in debug mode, attach test
        # values from a validation batch.
        if use_mask:
            X, y, mask, cmask = TT.ftensor3("X"), TT.ftensor3("y"), \
                TT.fmatrix("mask"), TT.ftensor3("cost_mask")

            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("float32")
                y.tag.test_value = batch['y'].astype("float32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")

            if output_map:
                outs = {}
                outs["X"] = X
                outs["y"] = y
                outs["mask"] = mask
                outs["cmask"] = cmask
            else:
                outs = [X, y, mask, cmask]
            return outs
        else:
            X, y = TT.tensor3("X"), TT.tensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state['lr']
    batch_size = state['batch_size']

    # Number of elements in each column of the memory's content matrix.
    mem_size = state['mem_size']

    # Number of rows in the memory matrix M.
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    # Copy sequences of up to 10 steps, 10 input channels each.
    max_len = 10
    inp_size = 10

    # Number of hidden units of the controller.
    n_hids = state['n_hids']

    # Deep output is not used; kept for the constructor's signature.
    deep_out_size = 100

    # Size of the bag-of-words embeddings.
    bow_size = state.get('bow_size', 80)

    # Whether to use a feedforward controller.
    use_ff_controller = state['use_ff_controller']

    # For an RNN controller: whether to learn the initial hidden state.
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Whether to use location-based addressing.
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)

    seed = 7

    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address portion of each memory row.
    address_size = state['address_size']
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Bind the Theano RNG and noise std into NRect (partial avoids the
    # self-referential lambda of the original).
    NRect = partial(NRect, rng=trng, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)
    anticorr = state.get('anticorr', None)

    prfx = (
        "ntm_copy_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_rs_%(renormalization_scale)f"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)d_lr_%(lr)f"
        "_%(address_size)d"
    ) % locals()
    save_path = state.get("save_path", ".")
    prfx = save_path + prfx

    tdata_gen = CopyDataGen(batch_size,
                            max_len,
                            inp_size,
                            rng=rng,
                            seed=seed,
                            rnd_len=True)
    vdata_gen = CopyDataGen(batch_size,
                            max_len,
                            inp_size,
                            rng=rng,
                            seed=2,
                            rnd_len=False)
    tst_data_gen = CopyDataGen(batch_size,
                               max_len,
                               inp_size,
                               rng=rng,
                               seed=3,
                               rnd_len=False)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=inp_size,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   anticorr=anticorr,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_len,
                   softmax=False,
                   batch_size=batch_size)

    save_freq = state.get("save_freq", 1000)

    main_loop = NTMToyMainLoop(ntm,
                               print_every=50,
                               checkpoint_every=save_freq,
                               validate_every=500,
                               train_data_gen=tdata_gen,
                               valid_data_gen=vdata_gen,
                               test_data_gen=tst_data_gen,
                               learning_rate=lr,
                               reload_model=reload_model,
                               valid_iters=200,
                               max_iters=state['max_iters'],
                               state=state,
                               prefix=prfx)
    main_loop.run()
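# Illustrative driver for the copy-task runner; a plain dict again suffices.
# max_iters must be present (the main loop reads state['max_iters']) and
# save_path is optional (the prefix falls back to "."). Values are
# placeholders, not tuned hyper-parameters.
def example_copy_run():
    state = {"lr": 1e-3, "batch_size": 128,
             "mem_size": 28, "mem_nel": 20, "std": 0.01,
             "renormalization_scale": 5.0, "sub_mb_size": 128,
             "n_hids": 100, "use_ff_controller": False,
             "n_read_heads": 1, "n_reading_steps": 1,
             "address_size": 20, "use_reinforce_baseline": False,
             "max_iters": 20000, "save_path": "./"}
    search_model_adam(state, channel=None)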
# ---------------------------------------------------------------------------
# Soft-attention bABI variant with mostly hard-coded hyper-parameters
# (no REINFORCE; separate script).
# ---------------------------------------------------------------------------
def search_model_adam_gru_soft(state, channel):

    def NReLU(x, rng=None, use_noise=False):
        # Noisy rectifier: additive Gaussian noise whose std is a sigmoid
        # of the pre-activation.
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier with a fixed noise std.
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False):
        # Symbolic inputs; in debug mode, attach test values from a
        # validation batch.
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), \
                TT.fmatrix("mask"), TT.fmatrix("cost_mask")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
            return [X, y, mask, cmask]
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state.lr
    batch_size = state.batch_size
    # Note: this value is overridden by the hard-coded seed = 7 below.
    seed = state.get("seed", 3)

    # Number of elements in each column of the memory's content matrix.
    mem_size = state.mem_size

    # Number of rows in the memory matrix M.
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size

    # Number of hidden units of the controller.
    n_hids = state.n_hids

    # Deep output is not used; kept for the constructor's signature.
    deep_out_size = 100

    # Size of the bag-of-words embeddings.
    bow_size = state.n_hids

    # Use a feedforward controller.
    use_ff_controller = True

    # For an RNN controller: learn the initial hidden state.
    learn_h0 = True
    use_nogru_mem2q = False

    # Location-based addressing is disabled in this variant.
    use_loc_based_addressing = False

    seed = 7
    max_seq_len = 100
    max_fact_len = 12

    n_read_heads = 1
    n_write_heads = 1
    n_reading_steps = 1

    lambda1_rein = 4e-5
    lambda2_rein = 1e-5
    base_reg = 3e-5

    # Size of the address portion of each memory row.
    address_size = 20
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # Bind the Theano RNG and noise std into NRect (partial avoids the
    # self-referential lambda of the original).
    NRect = partial(NRect, rng=trng, std=std)
    use_noise = False

    use_quad_interactions = True
    mode = None
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=10)

    task_id = state.task_id

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh

    w2v_embed_path = None
    use_reinforce_baseline = False

    l1_pen = 7e-4
    l2_pen = 9e-4
    debug = False

    path = "/data/lisatmp3/gulcehrc/data/tasks_1-20_v1-2/en-10k/splitted_trainval/"
    prfx = (
        "ntm_on_fb_BABI_task_all__learn_h0_l1_no_n_hids_%(n_hids)s"
        "_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()

    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True,
        max_seq_len=max_seq_len,
        max_fact_len=max_fact_len,
        task_id=task_id,
        task_path=path,
        mode='train',
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len,
        max_seq_len=max_seq_len,
        randomize=False,
        task_id=task_id,
        mode="valid",
        task_path=path,
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    inps = get_inps(vgen=vdata_gen, debug=debug)

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)

    bi = BiasInitializer(sparsity=-1,
                         scale=std,
                         rng=rng,
                         init_method=BiasInitMethods.Constant,
                         center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())

    ntm = NTMModel(n_in=len(tdata_gen.vocab.items()),
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=len(tdata_gen.vocab.items()),
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=False,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_gru_inp_rep=False,
                   use_bow_input=True,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=False,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=True,
                   use_noise=use_noise,
                   max_fact_len=max_fact_len,
                   softmax=True,
                   batch_size=batch_size)

    main_loop = FBaBIMainLoop(ntm,
                              print_every=40,
                              checkpoint_every=400,
                              validate_every=100,
                              train_data_gen=tdata_gen,
                              valid_data_gen=vdata_gen,
                              learning_rate=lr,
                              reload_model=False,
                              valid_iters=None,
                              linear_start=False,
                              max_iters=state.max_iters,
                              prefix=prfx)
    main_loop.run()
    return channel.COMPLETE
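# The soft-attention bABI runner hard-codes most hyper-parameters, so its
# `state` only needs the handful of fields below (attribute access plus
# .get(), hence the AttrDict sketch from earlier). Unlike the first runner,
# this function dereferences channel.COMPLETE unconditionally, so a real
# experiment channel must be supplied. Values are placeholders.
def example_gru_soft_run(channel):
    state = AttrDict(lr=1e-3, batch_size=160,
                     mem_size=28, mem_nel=128, std=0.05,
                     renormalization_scale=5.0, sub_mb_size=160,
                     n_hids=180, task_id=1, max_iters=80000)
    return search_model_adam_gru_soft(state, channel)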