def main(_):
    pp.pprint(flags.FLAGS.__flags)

    with tf.device('/cpu:0'), tf.Session() as sess:
        if FLAGS.task == 'copy':
            if FLAGS.is_train:
                cell, ntm = copy_train(FLAGS, sess)
            else:
                cell = NTMCell(input_dim=FLAGS.input_dim,
                               output_dim=FLAGS.output_dim,
                               controller_layer_size=FLAGS.controller_layer_size,
                               write_head_size=FLAGS.write_head_size,
                               read_head_size=FLAGS.read_head_size)
                ntm = NTM(cell, sess, 1, FLAGS.max_length,
                          test_max_length=FLAGS.test_max_length, forward_only=True)
                ntm.load(FLAGS.checkpoint_dir, 'copy')

            copy(ntm, int(FLAGS.test_max_length*1/3), sess)
            print
            copy(ntm, int(FLAGS.test_max_length*2/3), sess)
            print
            copy(ntm, int(FLAGS.test_max_length*3/3), sess)
        elif FLAGS.task == 'recall':
            pass
def compareFixed():
    t = Tasks()
    x_test, y_test = t.sequence_type_1(100)

    add_params, mul_params = torch.load('program_memory/add.pt'), torch.load('program_memory/mul.pt')
    hnm = HNM(10, 20, add_params, mul_params)
    hnm.load_state_dict(torch.load("learned_params/hnm_arch_2.pt"))

    ntm = NTM(10, 20)
    ntm.load_state_dict(torch.load("learned_params/ntm.pt"))

    lstm = LSTM(14, 256, 325, 1)
    lstm.load_state_dict(torch.load("learned_params/lstm.pt"))

    hnm_diff, lstm_diff, ntm_diff = 0, 0, 0
    for i in range(len(x_test)):
        hnm_out = hnm.recurrent_forward(x_test[i:i + 1])
        ntm_out = ntm.recurrent_forward(x_test[i:i + 1])
        lstm_out = lstm.recurrent_forward(x_test[i:i + 1])

        answer = np.argmax(y_test[i:i + 1].detach().numpy())
        hnm_diff += abs(answer - np.argmax(hnm_out.detach().numpy()))
        ntm_diff += abs(answer - np.argmax(ntm_out.detach().numpy()))
        lstm_diff += abs(answer - np.argmax(lstm_out.detach().numpy()))

    print(hnm_diff / len(y_test), ntm_diff / len(y_test), lstm_diff / len(y_test))
def create_ntm(config, sess, **ntm_args):
    if config.rand_hyper:
        hyper_params = {}
        if config.is_test:
            hyper_params = load_hyperparamters(config)
        else:
            hyper_params = generate_hyperparams(config)
        print(" [*] Hyperparameters: {}".format(hyper_params))

        cell = NTMCell(input_dim=config.input_dim,
                       output_dim=config.output_dim,
                       controller_layer_size=hyper_params["c_layer"],
                       controller_dim=hyper_params["c_dim"],
                       mem_size=hyper_params["mem_size"],
                       write_head_size=config.write_head_size,
                       read_head_size=config.read_head_size,
                       is_LSTM_mode=config.is_LSTM_mode)
        scope = ntm_args.pop('scope', 'NTM-%s' % config.task)

        # Description + query + plan + answer
        min_length = (config.min_size - 1) + 1 + config.plan_length + (config.min_size - 1)
        max_length = int((config.max_size * (config.max_size - 1) / 2) + 1
                         + config.plan_length + (config.max_size - 1))
        ntm = NTM(cell, sess, min_length, max_length,
                  config.min_size, config.max_size, scope=scope, **ntm_args,
                  lr=hyper_params["lr"], momentum=hyper_params["momentum"],
                  decay=hyper_params["decay"], beta=hyper_params["l2"])
    else:
        cell = NTMCell(input_dim=config.input_dim,
                       output_dim=config.output_dim,
                       controller_layer_size=config.controller_layer_size,
                       controller_dim=config.controller_dim,
                       write_head_size=config.write_head_size,
                       read_head_size=config.read_head_size,
                       is_LSTM_mode=config.is_LSTM_mode)
        scope = ntm_args.pop('scope', 'NTM-%s' % config.task)

        # Description + query + plan + answer
        min_length = (config.min_size - 1) + 1 + config.plan_length + (config.min_size - 1)
        max_length = int((config.max_size * (config.max_size - 1) / 2) + 1
                         + config.plan_length + (config.max_size - 1))
        ntm = NTM(cell, sess, min_length, max_length,
                  config.min_size, config.max_size, scope=scope, **ntm_args)
    return cell, ntm
def __init__(self, d_vocab, d_emb, d_dec, max_len, bos_idx,
             num_heads=8, N=64, M=32, seg_size=20):
    super().__init__()
    self.d_vocab = d_vocab
    self.seg_size = seg_size
    self.embs = nn.Embedding(d_vocab, d_emb)
    self.rnn = nn.GRU(d_emb, d_dec, batch_first=True)
    self.ntm_scale = nn.Parameter(torch.zeros([1, d_dec]), requires_grad=True)
    self.ntm = NTM('mem-aug', embedding_size=d_dec, hidden_size=d_dec,
                   memory_size=M, head_num=num_heads,
                   memory_feature_size=N, output_size=d_dec)
    self.init = nn.Parameter(torch.zeros(1, d_dec), requires_grad=True)
    self.bos_idx = nn.Parameter(torch.tensor([bos_idx]), requires_grad=False)
    self.out_layer = nn.Linear(d_dec, d_vocab)
    self.max_len = max_len
def trainNTM():
    t = Tasks()
    x_train, y_train = t.sequence_type_1(2000)
    ntm = NTM(10, 20)
    ntm.train(x_train, y_train, 1, maxEpoch=25, learning_rate=0.0006)
def predict_train(config, sess):
    """Train an NTM on the sequence-prediction (completion) task, given a TensorFlow session."""
    if not os.path.isdir(config.checkpoint_dir):
        raise Exception(" [!] Directory %s not found" % config.checkpoint_dir)

    # Delimiter flag vectors marking the start and end of a sequence
    # (these are the flag rows visible in the figure examples in the README).
    start_symbol = np.zeros([config.input_dim], dtype=np.float32)
    start_symbol[0] = 1
    end_symbol = np.zeros([config.input_dim], dtype=np.float32)
    end_symbol[1] = 1

    # Initialise the NTM cell (memory, heads and controller) and the NTM graph around it.
    cell = NTMCell(input_dim=config.input_dim,
                   output_dim=config.output_dim,
                   controller_layer_size=config.controller_layer_size,
                   write_head_size=config.write_head_size,
                   read_head_size=config.read_head_size)
    ntm = NTM(cell, sess, config.min_length, config.max_length*3)

    print(" [*] Initialize all variables")
    tf.initialize_all_variables().run()
    print(" [*] Initialization finished")

    start_time = time.time()
    for idx in xrange(config.epoch):
        # Generate a training example of random length.
        seq_length = randint(config.min_length, config.max_length) * 4
        inc_seq, comp_seq = generate_predict_sequence(seq_length, config.input_dim - 2)

        # Bind the incomplete sequence to the NTM inputs and the completed
        # sequence to its target outputs.
        feed_dict = {input_: vec for vec, input_ in zip(inc_seq, ntm.inputs)}
        feed_dict.update(
            {true_output: vec for vec, true_output in zip(comp_seq, ntm.true_outputs)})
        feed_dict.update({
            ntm.start_symbol: start_symbol,
            ntm.end_symbol: end_symbol
        })

        # Run one optimisation step and fetch the current loss and global step.
        _, cost, step = sess.run([ntm.optims[seq_length],
                                  ntm.get_loss(seq_length),
                                  ntm.global_step], feed_dict=feed_dict)

        # Periodically save a checkpoint (restored later via ntm.load()).
        if idx % 100 == 0:
            ntm.save(config.checkpoint_dir, 'copy', step)

        if idx % print_interval == 0:
            print("[%5d] %2d: %.2f (%.1fs)"
                  % (idx, seq_length, cost, time.time() - start_time))

    print("Training predict task finished")
    return cell, ntm
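# Illustration only (not part of the original training code): how the start/end delimiter
# framing built in predict_train() looks for a toy input_dim of 4, i.e. two flag bits plus
# two content bits per timestep.
import numpy as np

input_dim = 4
start_symbol = np.zeros(input_dim, dtype=np.float32)
start_symbol[0] = 1
end_symbol = np.zeros(input_dim, dtype=np.float32)
end_symbol[1] = 1

content = (np.random.rand(3, input_dim - 2) > 0.5).astype(np.float32)  # three random 2-bit vectors
padded = np.pad(content, ((0, 0), (2, 0)))                             # flag columns stay zero for content rows
framed = np.vstack([start_symbol, padded, end_symbol])                 # start row, content rows, end row
print(framed)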
def trainNTM():
    ntm = NTM(10, 14)
    X, y = [], []
    for i in range(10):
        tempX, tempY = getData("data/observations_" + str(i*500) + ".npy",
                               "data/actions_" + str(i*500) + ".npy")
        X.extend(tempX)
        y.extend(tempY)
    print(len(X), len(y))
    ntm.train(X, y, 1)
def copy_train(config):
    sess = config.sess

    if not os.path.isdir(config.checkpoint_dir):
        raise Exception(" [!] Directory %s not found" % config.checkpoint_dir)

    # delimiter flag for start and end
    start_symbol = np.zeros([config.input_dim], dtype=np.float32)
    start_symbol[0] = 1
    end_symbol = np.zeros([config.input_dim], dtype=np.float32)
    end_symbol[1] = 1

    cell = NTMCell(input_dim=config.input_dim,
                   output_dim=config.output_dim,
                   controller_layer_size=config.controller_layer_size,
                   write_head_size=config.write_head_size,
                   read_head_size=config.read_head_size)
    ntm = NTM(cell, sess, config.min_length, config.max_length)

    print(" [*] Initialize all variables")
    tf.initialize_all_variables().run()
    print(" [*] Initialization finished")

    start_time = time.time()
    for idx in xrange(config.epoch):
        seq_length = randint(config.min_length, config.max_length)
        seq = generate_copy_sequence(seq_length, config.input_dim - 2)

        feed_dict = {input_: vec for vec, input_ in zip(seq, ntm.inputs)}
        feed_dict.update({
            true_output: vec for vec, true_output in zip(seq, ntm.true_outputs)
        })
        feed_dict.update({
            ntm.start_symbol: start_symbol,
            ntm.end_symbol: end_symbol
        })

        _, cost, step = sess.run([
            ntm.optims[seq_length],
            ntm.get_loss(seq_length),
            ntm.global_step
        ], feed_dict=feed_dict)

        if idx % 100 == 0:
            ntm.save(config.checkpoint_dir, 'copy', step)

        if idx % print_interval == 0:
            print("[%5d] %2d: %.2f (%.1fs)"
                  % (idx, seq_length, cost, time.time() - start_time))

    print("Training Copy task finished")
    return cell, ntm
def copy_train(config):
    sess = config.sess

    if not os.path.isdir(config.checkpoint_dir):
        raise Exception(" [!] Directory %s not found" % config.checkpoint_dir)

    # delimiter flag for start and end
    start_symbol = np.zeros([config.input_dim], dtype=np.float32)
    start_symbol[0] = 1
    end_symbol = np.zeros([config.input_dim], dtype=np.float32)
    end_symbol[1] = 1

    cell = NTMCell(input_dim=config.input_dim,
                   output_dim=config.output_dim,
                   controller_layer_size=config.controller_layer_size,
                   write_head_size=config.write_head_size,
                   read_head_size=config.read_head_size)
    ntm = NTM(cell, sess, config.min_length, config.max_length)

    print(" [*] Initialize all variables")
    tf.initialize_all_variables().run()
    print(" [*] Initialization finished")

    start_time = time.time()
    for idx in xrange(config.epoch):
        seq_length = randint(config.min_length, config.max_length)
        seq = generate_copy_sequence(seq_length, config.input_dim - 2)

        feed_dict = {input_: vec for vec, input_ in zip(seq, ntm.inputs)}
        feed_dict.update(
            {true_output: vec for vec, true_output in zip(seq, ntm.true_outputs)})
        feed_dict.update({
            ntm.start_symbol: start_symbol,
            ntm.end_symbol: end_symbol
        })

        _, cost, step = sess.run([ntm.optims[seq_length],
                                  ntm.get_loss(seq_length),
                                  ntm.global_step], feed_dict=feed_dict)

        if idx % 100 == 0:
            ntm.save(config.checkpoint_dir, 'copy', step)

        if idx % print_interval == 0:
            print("[%5d] %2d: %.2f (%.1fs)"
                  % (idx, seq_length, cost, time.time() - start_time))

    print("Training Copy task finished")
    return cell, ntm
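# A plausible stand-in for generate_copy_sequence(), added only so the copy_train() loop
# above can be read end-to-end; the real helper in the original repository may differ.
# It returns seq_length vectors of width bits + 2, with the two leading flag positions
# left at zero (those are driven by the separate start/end symbols).
import numpy as np

def generate_copy_sequence(length, bits):
    seq = np.zeros([length, bits + 2], dtype=np.float32)
    seq[:, 2:] = (np.random.rand(length, bits) > 0.5).astype(np.float32)  # random binary payload
    return list(seq)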
def __init__(self, num_inputs, num_outputs,
             controller_size, controller_layers, num_heads, N, M,
             controller_type='lstm'):
    """Initialize an EncapsulatedNTM.

    :param num_inputs: External number of inputs.
    :param num_outputs: External number of outputs.
    :param controller_size: The size of the internal representation.
    :param controller_layers: Controller number of layers.
    :param num_heads: Number of heads.
    :param N: Number of rows in the memory bank.
    :param M: Number of cols/features in the memory bank.
    """
    super(EncapsulatedNTM, self).__init__()

    # Save args
    self.num_inputs = num_inputs
    self.num_outputs = num_outputs
    self.controller_size = controller_size
    self.controller_layers = controller_layers
    self.num_heads = num_heads
    self.N = N
    self.M = M

    # Create the NTM components
    memory = NTMMemory(N, M)
    if controller_type == 'lstm':
        controller = LSTMController(num_inputs + M * num_heads,
                                    controller_size, controller_layers)
    else:
        controller = MLPController(num_inputs + M * num_heads,
                                   controller_size, controller_layers)
    heads = nn.ModuleList([])
    for i in range(num_heads):
        heads += [
            NTMReadHead(memory, controller_size),
            NTMWriteHead(memory, controller_size)
        ]

    self.ntm = NTM(num_inputs, num_outputs, controller, memory, heads)
    self.memory = memory
def create_ntm(FLAGS, sess, **ntm_args):
    cell = NTMCell(
        input_dim=FLAGS.input_dim,
        output_dim=FLAGS.output_dim,
        controller_layer_size=FLAGS.controller_layer_size,
        write_head_size=FLAGS.write_head_size,
        read_head_size=FLAGS.read_head_size)
    ntm = NTM(
        cell, sess, FLAGS.min_length, FLAGS.max_length,
        test_max_length=FLAGS.test_max_length,
        scope='NTM-%s' % FLAGS.task, **ntm_args)
    return cell, ntm
def create_ntm(config, sess, **ntm_args):
    cell = NTMCell(
        input_dim=config.input_dim,
        output_dim=config.output_dim,
        controller_layer_size=config.controller_layer_size,
        controller_dim=config.controller_dim,
        write_head_size=config.write_head_size,
        read_head_size=config.read_head_size)
    scope = ntm_args.pop('scope', 'NTM-%s' % config.task)
    ntm = NTM(
        cell, sess, config.min_length, config.max_length,
        test_max_length=config.test_max_length,
        scope=scope, **ntm_args)
    return cell, ntm
def compare():
    obstacle, wall_cw, wall_awc = Obstacle(), WallCW(), WallACW()
    obstacle_params = torch.load('program_memory/move.pt')
    wall_cw_params = torch.load('program_memory/cw.pt')
    wall_acw_params = torch.load('program_memory/acw.pt')
    networks = [obstacle, wall_cw, wall_awc]
    params = [obstacle_params, wall_cw_params, wall_acw_params]

    hnm = HNM(10, 14, networks, params)
    hnm.load_state_dict(torch.load('learned_params/hnm.pt'))
    ntm = NTM(10, 14)
    ntm.load_state_dict(torch.load('learned_params/ntm.pt'))
    lstm = LSTM(14, 64, 3, 1)
    lstm.load_state_dict(torch.load('learned_params/lstm.pt'))

    testX, testY = getTestData()
    hnm_correct, ntm_correct, lstm_correct = 0, 0, 0
    totSamples = 0

    for i in range(0, 25):
        s = torch.from_numpy(np.array(testX[i:i + 1][0])).float().unsqueeze(0)
        s_lstm = s.view(s.size()[0], s.size()[2], -1)
        l = np.array(testY[i:i + 1][0])
        print(i)

        (hnm_read_weights, hnm_write_weights) = hnm._initialise()
        (ntm_read_weights, ntm_write_weights) = ntm._initialise()
        lstm_h = lstm.h0.expand(s_lstm.size()[0], 64)
        lstm_c = lstm.c0.expand(s_lstm.size()[0], 64)

        for j in range(s.size()[1]):
            (hnm_out, hnm_read_weights, hnm_write_weights) = hnm.forward(
                s[:, j, :], hnm_read_weights, hnm_write_weights)
            (ntm_out, ntm_read_weights, ntm_write_weights) = ntm.forward(
                s[:, j, :], ntm_read_weights, ntm_write_weights)
            lstm_h, lstm_c, lstm_out = lstm.forward(s_lstm[:, :, j], lstm_h, lstm_c)

            if np.argmax(hnm_out.detach().numpy()) == np.argmax(l[j]):
                hnm_correct += 1
            if np.argmax(ntm_out.detach().numpy()) == np.argmax(l[j]):
                ntm_correct += 1
            if np.argmax(lstm_out.detach().numpy()) == np.argmax(l[j]):
                lstm_correct += 1
            totSamples += 1

    print(hnm_correct, ntm_correct, lstm_correct)
    print(totSamples)
def gen_model(input_dim, batch_size, output_dim,
              n_slots=n_slots, m_depth=m_depth,
              controller_model=None, activation="sigmoid",
              read_heads=1, write_heads=1):
    model = Sequential()
    model.name = "NTM_-_" + controller_model.name
    model.batch_size = batch_size
    model.input_dim = input_dim
    model.output_dim = output_dim

    ntm = NTM(output_dim, n_slots=n_slots, m_depth=m_depth, shift_range=3,
              controller_model=controller_model,
              activation=activation,
              read_heads=read_heads,
              write_heads=write_heads,
              return_sequences=True,
              input_shape=(None, input_dim),
              batch_size=batch_size)
    model.add(ntm)

    sgd = Adam(lr=learning_rate, clipnorm=clipnorm)
    model.compile(loss='binary_crossentropy', optimizer=sgd,
                  metrics=['binary_accuracy'],
                  sample_weight_mode="temporal")
    return model
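# Hypothetical usage sketch for gen_model() above. The single Dense controller with linear
# activation follows the restrictions described in the inline NTM-layer comments elsewhere
# in this collection; the 148-unit input and the other sizes are placeholders (a real setup
# would compute them, e.g. with a controller_shape() style helper), and the surrounding
# script's n_slots, m_depth, learning_rate and clipnorm globals are assumed to be defined.
from keras.models import Sequential
from keras.layers import Dense

controller = Sequential(name="dense_controller")
controller.add(Dense(100, activation="linear", input_dim=148))

copy_model = gen_model(input_dim=8, batch_size=16, output_dim=8,
                       controller_model=controller)
copy_model.summary()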
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_arguments()
    writer = SummaryWriter()

    dataset = BinaySeqDataset(args)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)

    model = NTM(M=args.memory_capacity,
                N=args.memory_vector_size,
                input_size=args.token_size,
                output_size=args.token_size,
                controller_out_dim=args.controller_output_dim,
                controller_hid_dim=args.controller_hidden_dim)
    print(model)

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.learning_rate)

    print("--------- Number of parameters -----------")
    print(model.calculate_num_params())
    print("--------- Start training -----------")

    losses = []
class NTMAugmentedDecoder(nn.Module):
    def __init__(self, d_vocab, d_emb, d_dec, max_len, bos_idx,
                 num_heads=8, N=64, M=32, seg_size=20):
        super().__init__()
        self.d_vocab = d_vocab
        self.seg_size = seg_size
        self.embs = nn.Embedding(d_vocab, d_emb)
        self.rnn = nn.GRU(d_emb, d_dec, batch_first=True)
        self.ntm_scale = nn.Parameter(torch.zeros([1, d_dec]), requires_grad=True)
        self.ntm = NTM('mem-aug', embedding_size=d_dec, hidden_size=d_dec,
                       memory_size=M, head_num=num_heads,
                       memory_feature_size=N, output_size=d_dec)
        self.init = nn.Parameter(torch.zeros(1, d_dec), requires_grad=True)
        self.bos_idx = nn.Parameter(torch.tensor([bos_idx]), requires_grad=False)
        self.out_layer = nn.Linear(d_dec, d_vocab)
        self.max_len = max_len

    def forward(self, labels, state=None):
        """
        Run the decoder over the label sequence in a next-token prediction setup.

        :param labels: token indices conditioned on at each timestep (used during training)
        :param state: initial state to begin decoding with; could be the output of an encoder
        :return: logits tensor of shape (batch_size, num_steps, vocab_size)
        """
        batch_size = labels.shape[0]
        self.ntm.reset(batch_size, device=labels.device)
        if state is None:
            state = self.init.expand(batch_size, -1).contiguous()  # b x h
        init = self.embs(self.bos_idx.expand(batch_size).unsqueeze(1))  # b x 1 x h

        # initialize ntm state, which keeps track of reads and writes
        ntm_state = torch.zeros_like(state).to(state.device)
        labels_no_last = labels[:, :-1]  # b x (t-1); we don't take the last word as input

        # break input into slices, reading and writing between slices
        num_slices = labels.shape[1] // self.seg_size
        all_logit_slices = []
        for slice in range(num_slices):
            # grab slice of input
            labels_slice = labels_no_last[:, slice * self.seg_size:slice * self.seg_size + self.seg_size]
            in_embs = self.embs(labels_slice)  # b x (t-1) x w
            if slice == 0:
                # add bos index on first iteration
                in_embs = torch.cat([init, in_embs], dim=1)  # b x t x w

            # give ntm state as input to all time steps of the next slice; the ntm state is
            # multiplied by learned scalars before reaching the RNN, so it is not used at the
            # beginning of training
            #scaled_ntm_state = ntm_state * self.ntm_scale
            #exp_ntm_state = scaled_ntm_state.unsqueeze(1).expand([-1, in_embs.shape[1], -1])  # b x t x h
            rnn_input = in_embs  # torch.cat([in_embs, exp_ntm_state], dim=-1)

            # read a slice of conversation history, with access to ntm state
            outputs, _ = self.rnn(rnn_input, state.unsqueeze(0))  # b x (t-1) x h OR b x t x h

            # grab the last state and use it to read and write from the ntm
            state = outputs[:, -1, :]
            #ntm_state = self.ntm(state)

            # predict outputs for this slice
            logits = self.out_layer(outputs)  # b x t x v
            all_logit_slices.append(logits)

        # concatenate predictions for all slices
        logits = torch.cat(all_logit_slices, dim=1)
        return logits

    def complete(self, x, state=None, sample_func=None):
        """
        Given a tensor x of token indices, fill in all padding-token (zero) elements with
        predictions from the NTM decoder.

        :param x: (batch_size, num_steps) tensor containing token indices
        :return: logits of shape (batch_size, num_steps, vocab_size) and a tensor of the
            same shape as x where zeros have been filled with decoder predictions
        """
        batch_size, num_steps = x.shape
        self.ntm.reset(batch_size, device=x.device)
        if state is None:
            state = self.init.expand(batch_size, -1).contiguous()  # b x h
        if sample_func is None:
            sample_func = partial(torch.argmax, dim=-1)
        ntm_state = torch.zeros_like(state).to(state.device)

        all_logits = []
        all_preds = []
        init = self.embs(self.bos_idx.expand(batch_size).unsqueeze(1))  # b x 1 x h
        word = init.squeeze(1)  # b x w
        for step in range(num_steps):
            # run RNN over the current input word
            rnn_input = word.unsqueeze(1)  # torch.cat([word, ntm_state], dim=-1).unsqueeze(1)
            _, state = self.rnn(rnn_input, state.unsqueeze(0))  # 1 x b x h
            state = state.squeeze(0)

            # produce a prediction at each time step
            logits = self.out_layer(state)  # b x v
            all_logits.append(logits)
            pred = sample_func(logits)  # b

            # # at the end of each segment, read and write from the NTM
            # if step % self.seg_size == (self.seg_size - 1):
            #     ntm_state = self.ntm(state)

            # grab the word from x if it exists, otherwise use the prediction
            mask = (x[:, step] != 0).long()  # b
            word_index = x[:, step] * mask + pred * (1 - mask)  # label or prediction, whichever is available
            word = self.embs(word_index)  # b x w
            all_preds.append(word_index)

        logits = torch.stack(all_logits, dim=1)
        return logits, torch.stack(all_preds, dim=1)
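# Hypothetical usage sketch for NTMAugmentedDecoder above; it assumes the same NTM module
# imported by the class is available and that label tensors hold token ids with 0 used as
# padding. All sizes are illustrative only.
import torch

decoder = NTMAugmentedDecoder(d_vocab=1000, d_emb=64, d_dec=128, max_len=40, bos_idx=1)

labels = torch.randint(1, 1000, (8, 40))      # batch of 8 sequences, 40 tokens each
logits = decoder(labels)                      # (8, 40, 1000) next-token logits for training

partial = labels.clone()
partial[:, 20:] = 0                           # zero out the second half to be completed
logits, preds = decoder.complete(partial)     # zeros are filled with sampled predictions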
cur_dir = os.getcwd()
PATH = os.path.join(cur_dir, args.saved_model)
# PATH = os.path.join(cur_dir, 'saved_models/saved_model_copy_500000.pt')
# ntm = torch.load(PATH)

"""
For the Copy task, input_size: seq_width + 2, output_size: seq_width
For the RepeatCopy task, input_size: seq_width + 2, output_size: seq_width + 1
For the Associative task, input_size: seq_width + 2, output_size: seq_width
For the NGram task, input_size: 1, output_size: 1
For the Priority Sort task, input_size: seq_width + 1, output_size: seq_width
"""
ntm = NTM(input_size=task_params['seq_width'] + 1,
          output_size=task_params['seq_width'],
          controller_size=task_params['controller_size'],
          memory_units=task_params['memory_units'],
          memory_unit_size=task_params['memory_unit_size'],
          num_heads=task_params['num_heads'])

ntm.load_state_dict(torch.load(PATH))

# -----------------------------------------------------------------------------
# --- evaluation
# -----------------------------------------------------------------------------
ntm.reset()
data = dataset[0]  # 0 is a dummy index
input, target = data['input'], data['target']
out = torch.zeros(target.size())

# -----------------------------------------------------------------------------
# loop for other tasks
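# The task-to-size mapping quoted in the docstring above, written out as a small lookup
# helper (illustrative only; the task-name keys are not part of the original code).
def task_io_sizes(task, seq_width):
    return {
        'copy':          (seq_width + 2, seq_width),
        'repeat_copy':   (seq_width + 2, seq_width + 1),
        'associative':   (seq_width + 2, seq_width),
        'ngram':         (1, 1),
        'priority_sort': (seq_width + 1, seq_width),
    }[task]

# e.g. for the Priority Sort evaluation above:
# input_size, output_size = task_io_sizes('priority_sort', task_params['seq_width'])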
        task_params['num_heads'], task_params['uniform'],
        task_params['random_distr'], task_params['multi_layer_controller'])

# Output directory for tensorboard
configure(args.tb_dir + "/" + saved_model_name)

"""
For the Copy task, input_size: seq_width + 2, output_size: seq_width
For the RepeatCopy task, input_size: seq_width + 2, output_size: seq_width + 1
For the Associative task, input_size: seq_width + 2, output_size: seq_width
For the NGram task, input_size: 1, output_size: 1
For the Priority Sort task, input_size: seq_width + 1, output_size: seq_width
"""
ntm = NTM(input_size=task_params['seq_width'] + 1,
          output_size=task_params['seq_width'],
          controller_size=task_params['controller_size'],
          memory_units=task_params['memory_units'],
          memory_unit_size=task_params['memory_unit_size'],
          num_heads=task_params['num_heads'],
          multi_layer_controller=task_params['multi_layer_controller'])

if args.load_model != "":
    ntm.load_state_dict(torch.load(args.load_model))

criterion = nn.BCELoss()
# As the learning rate is task specific, the argument can be moved to json file
optimizer = optim.RMSprop(ntm.parameters(),
                          lr=args.lr,
                          alpha=args.alpha,
                          momentum=args.momentum)
'''
optimizer = optim.Adam(ntm.parameters(), lr=args.lr,
controller_input_dim, controller_output_dim = controller_shape(
    num_encoder_tokens, layer_dim, m_depth, n_slots,
    shift_range, read_heads, write_heads)

encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = NTM(layer_dim, n_slots=n_slots, m_depth=m_depth,
              shift_range=shift_range,
              controller_model=None,
              activation="sigmoid",
              read_heads=read_heads,
              write_heads=write_heads,
              return_sequences=True,
              return_state=True)
saidas = encoder(encoder_inputs)
print(saidas[1])
def generate_target_original_plots(iteration, task_params, model_path, image_output):
    dataset = PrioritySort(task_params)
    criterion = nn.BCELoss()

    ntm = NTM(input_size=task_params['seq_width'] + 1,
              output_size=task_params['seq_width'],
              controller_size=task_params['controller_size'],
              memory_units=task_params['memory_units'],
              memory_unit_size=task_params['memory_unit_size'],
              num_heads=task_params['num_heads'],
              save_weigths=True,
              multi_layer_controller=task_params['multi_layer_controller'])

    ntm.load_state_dict(torch.load(model_path))

    # -----------------------------------------------------------------------------
    # --- evaluation
    # -----------------------------------------------------------------------------
    ntm.reset()
    data = dataset[0]  # 0 is a dummy index
    input, target = data['input'], data['target']
    out = torch.zeros(target.size())

    # -----------------------------------------------------------------------------
    # loop for other tasks
    # -----------------------------------------------------------------------------
    for i in range(input.size()[0]):
        # to maintain consistency in dimensions, as torch.cat was throwing an error
        in_data = torch.unsqueeze(input[i], 0)
        ntm(in_data)

    # pass a zero vector as the input while generating the target sequence
    in_data = torch.unsqueeze(torch.zeros(input.size()[1]), 0)
    for i in range(target.size()[0]):
        out[i] = ntm(in_data)

    loss = criterion(out, target)
    binary_output = out.clone()
    binary_output = binary_output.detach().apply_(lambda x: 0 if x < 0.5 else 1)

    # sequence prediction error is calculated in bits per sequence
    error = torch.sum(torch.abs(binary_output - target))

    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)  # was 221, which would overlap the first panel
    ax1.set_title("Result")
    ax2.set_title("Target")
    sns.heatmap(binary_output, ax=ax1, vmin=0, vmax=1, linewidths=.5, cbar=False, square=True)
    sns.heatmap(target, ax=ax2, vmin=0, vmax=1, linewidths=.5, cbar=False, square=True)
    plt.savefig(
        image_output + "/priority_sort_{}_{}_{}_{}_{}_{}_{}_{}_{}_image_{}.png".format(
            task_params['seq_width'] + 1, task_params['seq_width'],
            task_params['controller_size'], task_params['memory_units'],
            task_params['memory_unit_size'], task_params['num_heads'],
            task_params['uniform'], task_params['random_distr'],
            task_params['multi_layer_controller'], iteration))

    fig = plt.figure(figsize=(15, 6))
    ax1_2 = fig.add_subplot(211)
    ax2_2 = fig.add_subplot(212)
    ax1_2.set_title("Read Weights")
    ax2_2.set_title("Write Weights")
    sns.heatmap(ntm.all_read_w, ax=ax1_2, linewidths=.01, square=True)
    sns.heatmap(ntm.all_write_w, ax=ax2_2, linewidths=.01, square=True)
    plt.tight_layout()
    plt.savefig(
        image_output + "/priority_sort_{}_{}_{}_{}_{}_{}_{}_{}_{}_weigths_{}.png".format(
            task_params['seq_width'] + 1, task_params['seq_width'],
            task_params['controller_size'], task_params['memory_units'],
            task_params['memory_unit_size'], task_params['num_heads'],
            task_params['uniform'], task_params['random_distr'],
            task_params['multi_layer_controller'], iteration),
        dpi=250)

    # ---logging---
    print('[*] Checkpoint Loss: %.2f\tError in bits per sequence: %.2f' % (loss, error))
def get_ntm_model():
    from keras.models import Model
    import keras.backend as K

    assert permute_layer is not None
    num_read = 1
    num_write = 1
    mem_length = 40
    n_slots = 128

    model_input = Input(
        (WINDOW_LENGTH, 1) + INPUT_SHAPE,
        #batch_shape=(batch_size,) + (WINDOW_LENGTH, 1) + INPUT_SHAPE
    )
    per = permute_layer(model_input)
    x = TimeDistributed(
        Conv2D(32, (8, 8), name='conv1', activation='relu', subsample=(4, 4)))(per)
    x = TimeDistributed(
        Conv2D(64, (4, 4), name='conv2', activation='relu', subsample=(2, 2)))(x)
    x = TimeDistributed(
        Conv2D(64, (3, 3), name='conv3', activation='relu', subsample=(1, 1)))(x)
    #x = TimeDistributed(Conv2D(64, (4, 4), name='conv2', activation='relu', subsample=(2, 2)))(x)
    #x = TimeDistributed(Conv2D(64, (3, 3), name='conv3', activation='relu', subsample=(1, 1)))(x)
    x = TimeDistributed(Flatten(name="Flatten1"))(x)  # (batch_size, WINDOW_LENGTH, 3176)
    x_shape = K.int_shape(x)
    print('x has shape:', x_shape)

    # controller construction
    controller_inp = Input((x_shape[-1],), name="controller_input")  # (batch_size, 3176)
    read_inp = Input((num_read, mem_length), name="read_inp")        # (batch_size, n_read, n_write)
    read_inp_flatten = Flatten(name="read_inp_flatten")(read_inp)    # (batch_size, n_read * n_write)
    print('controller_inp shape:', controller_inp.shape)
    #print('read_inp_flatten shape:', K.int_shape(read_inp_flatten))
    #print('read_inp_flatten_repeat shape:', K.int_shape(read_inp_flatten_repeat))

    #hidden_int = Dense(512, activation='relu')(controller_inp)
    hidden = Concatenate(name="ctrl_inp_read_inp_concat")(
        [controller_inp, read_inp_flatten])  # (batch_size, 3176 + num_read * mem_length)
    #hidden = Dense(512, activation='relu')(concat)
    controller_output = Dense(nb_actions, activation='linear')(hidden)
    controller = Model([controller_inp, read_inp], [controller_output, controller_inp])
    controller.summary()

    # ntm construction
    #TODO: reset the state in on_batch_end!!
    ntm_cell = NTM(
        controller,              # custom controller, should output a vector
        n_slots, mem_length,     # memory config
        num_shift=3,             # shifting
        batch_size=batch_size,
        #controller_instr_output_dim=controller_instr_output_dim,
        return_sequences=False,
        is_controller_recurrent=True,
        num_read=num_read,
        num_write=num_write)(x)  # (batch_size, 512)
    ntm_cell_output_shape = K.int_shape(ntm_cell)
    print('ntm_cell output:', ntm_cell_output_shape)
    ntm_cell_output_shape = ntm_cell_output_shape[1:]

    #model_output = Dense(nb_actions, activation='linear')(ntm_cell)
    model = Model(model_input, ntm_cell)
    model.summary()
    return model
                  n_slots=n_slots,
                  m_depth=m_depth,
                  controller_model=None,
                  activation="sigmoid",
                  read_heads=1,   # was read_head/write_head, which the body never used
                  write_heads=1):
    model = Sequential()
    model.name = "NTM_-_" + controller_model.name
    model.batch_size = batch_size
    model.input_dim = input_dim
    model.output_dim = output_dim   # was model.ouput_dim

    ntm = NTM(output_dim,
              n_slots=n_slots,
              m_depth=m_depth,
              shift_range=3,
              controller_model=controller_model,
              activation=activation,
              read_heads=read_heads,
              write_heads=write_heads,
              # return_sequences=True,
              input_shape=(None, input_dim),
              batch_size=batch_size)
    model.add(ntm)   # was model.add(NTM), which would add the class instead of the layer instance

    sgd = Adam(lr=learning_rate, clipnorm=clipnorm)
    model.compile(loss='binary_crossentropy', optimizer=sgd,
                  metrics=['binary_accuracy'],
                  sample_weight_mode="temporal")   # was sample_weight_model
    return model
from keras.models import Sequential
from keras.optimizers import Adam
from ntm import NeuralTuringMachine as NTM

model = Sequential()
ntm = NTM([625], n_slots=50, m_depth=20, shift_range=3,
          controller_model=None,
          return_sequences=True,
          input_shape=(None, 625),
          batch_size=100)
model.add(ntm)

# sgd = Adam(lr=learning_rate, clipnorm=clipnorm)
model.compile(loss='binary_crossentropy', optimizer='Adam',
              metrics=['binary_accuracy'],
              sample_weight_mode="temporal")

print(model.summary())
def trainNTM():
    ntm = NTM(10, 14)
    X, y = getData()
    ntm.train(X, y, 1)
dataset = PrioritySort(task_params)
input_size = task_params['seq_width'] + 1
output_size = task_params['seq_width']

"""
For the Copy task, input_size: seq_width + 2, output_size: seq_width
For the RepeatCopy task, input_size: seq_width + 2, output_size: seq_width + 1
For the Associative task, input_size: seq_width + 2, output_size: seq_width
For the NGram task, input_size: 1, output_size: 1
For the Priority Sort task, input_size: seq_width + 1, output_size: seq_width
"""
has_tau = 0
if args.model == 'ntm':
    model = NTM(input_size=input_size,
                output_size=output_size,
                controller_size=args.lstm_size,
                memory_units=128,
                memory_unit_size=20,
                num_heads=1)  # task_params['num_heads']
elif args.model == 'dnc':
    model = DNC(input_size=input_size,
                output_size=output_size,
                hidden_size=args.lstm_size,
                nr_cells=128,
                cell_size=20,
                read_heads=1)  # task_params['num_heads']
    model.init_param()
elif args.model == 'sam':
    model = SAM(input_size=input_size,
                output_size=output_size,
                hidden_size=args.lstm_size,
                nr_cells=128,
    'length': 5,
    'controller_layer_size': 1,
    'write_head_size': 1,
    'read_head_size': 1,
    'checkpoint_dir': 'checkpoint'
}

if __name__ == "__main__":
    with tf.device('/cpu:0'), tf.Session() as sess:
        cell = NTMCell(input_dim=config['input_dim'],
                       output_dim=config['output_dim'],
                       controller_layer_size=config['controller_layer_size'],
                       write_head_size=config['write_head_size'],
                       read_head_size=config['read_head_size'],
                       controller_dim=32)
        ntm = NTM(cell, sess, config['length'] * 2 + 2)

        if not os.path.isdir(config['checkpoint_dir'] + '/copy_' + str(config['length'] * 2 + 2)):
            print(" [*] Initialize all variables")
            tf.global_variables_initializer().run()
            print(" [*] Initialization finished")
        else:
            ntm.load(config['checkpoint_dir'], 'copy')

        start_time = time.time()
        print('')
        for idx in range(config['epoch']):
            seq_length = np.random.randint(2, config['length'] + 1)
            X, Y, masks = build_seq_batch(seq_length, config['length'], config['input_dim'] - 2)
class EncapsulatedNTM(nn.Module):

    def __init__(self, num_inputs, num_outputs,
                 controller_size, controller_layers, num_heads, N, M,
                 controller_type='lstm'):
        """Initialize an EncapsulatedNTM.

        :param num_inputs: External number of inputs.
        :param num_outputs: External number of outputs.
        :param controller_size: The size of the internal representation.
        :param controller_layers: Controller number of layers.
        :param num_heads: Number of heads.
        :param N: Number of rows in the memory bank.
        :param M: Number of cols/features in the memory bank.
        """
        super(EncapsulatedNTM, self).__init__()

        # Save args
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.controller_size = controller_size
        self.controller_layers = controller_layers
        self.num_heads = num_heads
        self.N = N
        self.M = M

        # Create the NTM components
        memory = NTMMemory(N, M)
        if controller_type == 'lstm':
            controller = LSTMController(num_inputs + M * num_heads,
                                        controller_size, controller_layers)
        else:
            controller = MLPController(num_inputs + M * num_heads,
                                       controller_size, controller_layers)
        heads = nn.ModuleList([])
        for i in range(num_heads):
            heads += [
                NTMReadHead(memory, controller_size),
                NTMWriteHead(memory, controller_size)
            ]

        self.ntm = NTM(num_inputs, num_outputs, controller, memory, heads)
        self.memory = memory

    def init_sequence(self, batch_size):
        """Initializing the state."""
        self.batch_size = batch_size
        self.memory.reset(batch_size)
        self.previous_state = self.ntm.create_new_state(batch_size)

    def forward(self, x=None):
        if x is None:
            x = Variable(torch.zeros(self.batch_size, self.num_inputs))
        o, self.previous_state = self.ntm(x, self.previous_state)
        return o, self.previous_state

    def calculate_num_params(self):
        """Returns the total number of parameters."""
        num_params = 0
        for p in self.parameters():
            num_params += p.data.view(-1).size(0)
        return num_params
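# Usage sketch for EncapsulatedNTM (assumes the NTMMemory, controller and head classes it
# imports are available). The sizes follow the copy-task convention used elsewhere in this
# collection: 8 content bits plus one delimiter channel on the input side.
import torch

ntm = EncapsulatedNTM(num_inputs=9, num_outputs=8,
                      controller_size=100, controller_layers=1,
                      num_heads=1, N=128, M=20)
ntm.init_sequence(batch_size=4)

seq = torch.rand(10, 4, 9)          # (seq_len, batch, num_inputs)
for t in range(seq.size(0)):
    out, state = ntm(seq[t])        # write phase: feed the sequence in
for t in range(seq.size(0)):
    out, state = ntm()              # read phase: null input, collect the outputs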
dataset = RepeatCopyDataset(task_params)
dataset = AssociativeDataset(task_params)
dataset = NGram(task_params)
dataset = PrioritySort(task_params)
'''

"""
For the Copy task, input_size: seq_width + 2, output_size: seq_width
For the RepeatCopy task, input_size: seq_width + 2, output_size: seq_width + 1
For the Associative task, input_size: seq_width + 2, output_size: seq_width
For the NGram task, input_size: 1, output_size: 1
For the Priority Sort task, input_size: seq_width + 1, output_size: seq_width
"""
ntm = NTM(input_size=task_params['seq_width'] + 2,
          output_size=task_params['seq_width'],
          controller_size=task_params['controller_size'],
          memory_units=task_params['memory_units'],
          memory_unit_size=task_params['memory_unit_size'],
          num_heads=task_params['num_heads'])

criterion = nn.BCELoss()
# As the learning rate is task specific, the argument can be moved to json file
optimizer = optim.RMSprop(ntm.parameters(),
                          lr=args.lr,
                          alpha=args.alpha,
                          momentum=args.momentum)
'''
optimizer = optim.Adam(ntm.parameters(), lr=args.lr,
                       betas=(args.beta1, args.beta2))
'''

args.saved_model = 'saved_model_copy.pt'
task_params['max_seq_len'] = data[d][1]
dataset = ReverseDataset(task_params)
dataset2 = ReverseDataset(task_params)

task_params['min_seq_len'] = data[d][2]
task_params['max_seq_len'] = data[d][3]
dataset3 = ReverseDataset(task_params)

# 4. save model
args.saved_model = 'saved_model/' + 'saved_model_reverse_' + args.config + '.pt'
cur_dir = os.getcwd()
PATH = os.path.join(cur_dir, args.saved_model)

"""
For the Reverse task, input_size: seq_width + 2, output_size: seq_width
"""
ntm = NTM(input_size=task_params['seq_width'] + 2,
          output_size=task_params['seq_width'],
          controller_size=task_params['controller_size'],
          memory_units=task_params['memory_units'],
          memory_unit_size=task_params['memory_unit_size'],
          num_heads=task_params['num_heads'])

criterion = nn.BCELoss()
# As the learning rate is task specific, the argument can be moved to json file
# optimizer = optim.RMSprop(ntm.parameters(),
#                           lr=args.lr,
#                           alpha=args.alpha,
#                           momentum=args.momentum)
optimizer = optim.Adam(ntm.parameters(), lr=args.lr,
                       betas=(args.beta1, args.beta2))

# ----------------------------------------------------------------------------
# -- basic training loop
model = Sequential()
model.name = "NTM_-_None"   # controller_model is None below, so there is no controller name to append
model.batch_size = batch_size
model.input_dim = input_dim
model.output_dim = 1

ntm = NTM(1,                       # output_dim
          n_slots=n_slots,         # n_slots: memory width
          m_depth=m_depth,         # m_depth: memory depth at each location
          shift_range=3,           # shift_range: number of available shifts; if 3, the available shifts are (-1, 0, 1)
          controller_model=None,   # controller_model: a Keras model, with the required restrictions, used as the controller.
                                   # The requirements are appropriate shape, linear activation and stateful=True if recurrent.
                                   # Default: one dense layer.
          activation="sigmoid",    # activation applied to the layer output; either a Keras activation
                                   # or a string like "tanh", "sigmoid", "linear" etc. Default is linear.
          read_heads=1,
          write_heads=1,
          return_sequences=True,
          input_shape=(None, input_dim),
          batch_size=batch_size)
model.add(ntm)

model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=learning_rate, clipnorm=clipnorm),
              metrics=['binary_accuracy'],
              sample_weight_mode="temporal")
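# Sketch of the batch shapes this compiled model expects (an assumption based on
# return_sequences=True, output_dim=1 and sample_weight_mode="temporal" above; it reuses
# the script's batch_size and input_dim variables and picks an arbitrary 12 timesteps).
import numpy as np

timesteps = 12
x = (np.random.rand(batch_size, timesteps, input_dim) > 0.5).astype("float32")
y = (np.random.rand(batch_size, timesteps, 1) > 0.5).astype("float32")
w = np.ones((batch_size, timesteps), dtype="float32")   # per-timestep sample weights

model.train_on_batch(x, y, sample_weight=w)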
def main():
    train = True
    weight_path = ""  # "model-constant-mem_4500.save"

    if train:
        plt.ion()
        fig = plt.figure()

        # COPY TASK: copy a 20-step tensor of random bits. Each timestep contains 5 bits;
        # the first four carry random content and the fifth is reserved for the end flag.
        # We input the 20 steps, then an end token (the fifth bit set to 1), then another
        # 20 steps of zeros. This signals to the NTM that it should start emitting the copy.
        class Controller:
            def __init__(self):
                self.size = 128
                # We have 1 read head, which produces a single read vector of size 10.
                # We also feed in the input, which is of size 5 (the five bits),
                # so our total input size is 15.
                self.fc_1 = init_weight(15, 128)
                self.fc_2 = init_weight(128, 128)
                # This is our controller output
                self.fc_3 = init_weight(128, 128)

            def get_weights(self):
                return [self.fc_1, self.fc_2, self.fc_3]

            def forward(self, inp):
                fc1 = T.nnet.relu(T.dot(inp, self.fc_1))
                fc2 = T.nnet.relu(T.dot(fc1, self.fc_2))
                # The output is not ReLU'd here because the NTM implementation already does it
                fc3 = T.dot(fc2, self.fc_3)
                return fc3

        # output size is 5, for the 5 copy bits
        ntm = NTM(controller=Controller(), output_size=5, memory_slots=20, slot_size=10,
                  read_heads=1, batch_size=10)

        data = T.tensor3()
        target = T.tensor3()

        #r = theano.shared(np.random.randn(10, 10, 20))
        r = theano.shared(1.)
        r_ = theano.shared(np.zeros([10, 10, 20])) + r

        if weight_path != '':
            print("loading weights")
            # Load weights, but just the NTM weights, not the memory, since we may extend it
            checkpoint = open(weight_path, 'rb')
            all_weights = ntm.weights
            for w in all_weights:
                w.set_value(cPickle.load(checkpoint).get_value())
            checkpoint.close()

        memory_states, _, weightings, ntm_outputs = ntm.process(data, r_)

        # Average the loss across batches so there is a single loss per timestep,
        # then combine these per-timestep losses.
        # (ntm_outputs - target) ** 2 -> ts x batchsize x bits
        loss = T.sum(T.mean(T.sum(5 * (T.nnet.sigmoid(ntm_outputs) - target) ** 2, axis=2), axis=1), axis=0)
        updates = RMSprop(cost=loss, params=ntm.weights + [r], lr=1e-3)
        train = theano.function(inputs=[data, target],
                                outputs=[memory_states, weightings, weightings,
                                         ntm_outputs, loss, updates[2][1]],
                                updates=updates)

        for example in range(5000):
            # Build a training example: ts x batchsize x bits
            end = np.zeros([1, 10, 5])
            for batch in range(10):
                end[0, batch, -1] = 1  # make the last bit in each batch a 1

            first_half = (np.random.randn(10, 10, 5) > 0).astype(np.float32) * 1
            for batch in range(10):
                first_half[:, batch, -1] = 0  # make sure the last (end) bit of each batch is 0

            # The second half is just zeros
            second_half = np.zeros([10, 10, 5])

            data = np.concatenate([first_half, end, second_half], axis=0)
            target = np.concatenate([second_half, end, first_half], axis=0)  # extra timestep for the end bit

            outputs = train(data, target)
            print("LOSS " + str(outputs[-2]) + ", " + str(example))

            read = outputs[2]
            read = read[:, 0, 0, :]
            write = outputs[2]
            write = write[:, 1, 0, :]
            outputs = outputs[3]
            outputs = outputs[:, 0]  #.transpose([1, 0])

            if (example % 20 == 0 and example != 0):
                cmap = 'jet'
                fig.add_subplot(2, 2, 1)
                plt.imshow(sigmoid(outputs), cmap=cmap)
                fig.add_subplot(2, 2, 2)
                plt.imshow(target[:, 0], cmap=cmap)
                fig.add_subplot(2, 2, 3)
                plt.imshow(read, cmap=cmap)
                fig.add_subplot(2, 2, 4)
                plt.imshow(write, cmap=cmap)
                plt.pause(0.1)

            if (example % 500 == 0):
                print("SAVING WEIGHTS")
                f = open('model-constant-mem_' + str(example) + '.save', 'wb')
                for w in ntm.weights + [r]:
                    cPickle.dump(w, f, protocol=cPickle.HIGHEST_PROTOCOL)
                f.close()

        """
        fig = plt.figure()
        fig.add_subplot(2, 2, 1)
        plt.imshow(data[:, 0, :], origin=[0, 0])
        fig.add_subplot(2, 2, 2)
        plt.imshow(target[:, 0, :], origin=[10, 0])
        plt.show()
        """
    else:
        # Test time
        plt.ion()
        fig = plt.figure()

        # COPY TASK: see how well the trained Neural Turing Machine extends to longer sequences
        class Controller:
            def __init__(self):
                self.size = 128
                # We have 1 read head, which produces a single read vector of size 10.
                # We also feed in the input, which is of size 5 (the five bits),
                # so our total input size is 15.
                self.fc_1 = init_weight(15, 128)
                self.fc_2 = init_weight(128, 128)
                # This is our controller output
                self.fc_3 = init_weight(128, 128)

            def get_weights(self):
                return [self.fc_1, self.fc_2, self.fc_3]

            def forward(self, inp):
                fc1 = T.nnet.relu(T.dot(inp, self.fc_1))
                fc2 = T.nnet.relu(T.dot(fc1, self.fc_2))
                # The output is not ReLU'd here because the NTM implementation already does it
                fc3 = T.dot(fc2, self.fc_3)
                return fc3

        # output size is 5, for the 5 copy bits; memory is extended to 80 slots for longer sequences
        ntm = NTM(controller=Controller(), output_size=5, memory_slots=80, slot_size=10,
                  read_heads=1, batch_size=10)

        data = T.tensor3()

        # Load weights
        checkpoint = open('pretrained-models-copy/model-constant-mem_4500.save', 'rb')
        all_weights = ntm.weights
        for w in all_weights:
            w.set_value(cPickle.load(checkpoint).get_value())
        r = theano.shared(np.zeros(shape=[10, 10, 80]) + cPickle.load(checkpoint).get_value())
        checkpoint.close()

        memory_states, _, weightings, ntm_outputs = ntm.process(data, r)
        test = theano.function(inputs=[data],
                               outputs=[memory_states, weightings, weightings, ntm_outputs])

        for example in range(5000):
            print(r.get_value())

            # Build a test example: ts x batchsize x bits
            end = np.zeros([1, 10, 5])
            for batch in range(10):
                end[0, batch, -1] = 1  # make the last bit in each batch a 1

            first_half = (np.random.randn(60, 10, 5) > .7).astype(np.float32) * 1
            for batch in range(10):
                first_half[:, batch, -1] = 0  # make sure the last (end) bit of each batch is 0

            second_half = np.zeros([60, 10, 5])  # just zeros

            data = np.concatenate([first_half, end, second_half], axis=0)

            outputs = test(data)

            read = outputs[2]
            read = read[:, 0, 0, :]
            write = outputs[2]
            write = write[:, 1, 0, :]
            outputs = outputs[3]
            outputs = outputs[:, 0]  #.transpose([1, 0])

            cmap = 'jet'
            fig.add_subplot(2, 2, 1)
            plt.imshow(sigmoid(outputs), cmap=cmap)
            fig.add_subplot(2, 2, 2)
            plt.imshow(data[:, 0], cmap=cmap)
            fig.add_subplot(2, 2, 3)
            plt.imshow(read, cmap=cmap)
            fig.add_subplot(2, 2, 4)
            plt.imshow(write, cmap=cmap)
            plt.pause(0.1)
            input("")
        from_checkpoint = opt[1]
    elif opt[0] == '--iterations':
        iterations = int(opt[1])

graph = tf.Graph()
with graph.as_default():
    with tf.compat.v1.Session(graph=graph) as session:

        llprint("Building Computational Graph ... ")

        optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate, momentum=momentum)

        turing_machine = NTM(RecurrentController, input_size, output_size,
                             memory_size, word_size, read_heads, shift_range, batch_size)

        # squash the NTM output between 0 and 1
        output, _ = turing_machine.get_outputs()
        squashed_output = tf.clip_by_value(tf.sigmoid(output), 1e-6, 1. - 1e-6)

        loss = binary_cross_entropy(squashed_output, turing_machine.target_output)

        summaries = []

        gradients = optimizer.compute_gradients(loss)
        for i, (grad, var) in enumerate(gradients):
            if grad is not None:
                summaries.append(