import theano.tensor as T
import numpy as np
from theano_toolkit import utils as U
from theano_toolkit import hinton
from theano_toolkit import updates
from theano_toolkit.parameters import Parameters
import ctc
import font
import lstm
from ocr import *

if __name__ == "__main__":
    import sys
    # Word to run through the trained OCR model is the first CLI argument.
    test_word = sys.argv[1]

    P = Parameters()
    X = T.matrix('X')
    # Output classes: one per font character plus one extra CTC "blank" label.
    predict = build_model(P, 8, 512, len(font.chars) + 1)
    probs = predict(X)
    # NOTE(review): `theano.function` assumes `theano` is brought into scope by
    # `from ocr import *` — confirm; otherwise `import theano` is missing here.
    test = theano.function(inputs=[X], outputs=probs)
    P.load('model.pkl')

    # Render the word to a bitmap; transpose/flip so hinton.plot shows it upright.
    image = font.imagify(test_word)
    hinton.plot(image.astype(np.float32).T[::-1])

    y_seq = label_seq(test_word)
    # `probs` is rebound here from the symbolic output to the numeric
    # per-frame class distribution (frames x classes).
    probs = test(image)
    # Greedy per-frame decoding; "_" stands for the CTC blank class.
    print " ", ' '.join(font.chars[i] if i < len(font.chars) else "_"
                        for i in np.argmax(probs, axis=1))
    # Plot only the rows for the target label sequence (values already in [0,1]).
    hinton.plot(probs[:, y_seq].T, max_arr=1.)
class Model:
    """
    Simple predictive model for forecasting words from
    sequence using LSTMs. Choose how many LSTMs to stack
    what size their memory should be, and how many
    words can be predicted.

    Two prediction paths are built: a plain LSTM softmax path
    ("lstm") and a path through an external "turing machine" head
    ("final") built by `turing_model.build`.
    """

    def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):
        # core layer in RNN/LSTM
        self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)
        # add an embedding
        self.model.layers.insert(0, Embedding(vocab_size, input_size))
        # add a classifier:
        self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))
        self.turing_params = Parameters()
        #init turing machine model
        self.turing_updates , self.turing_predict = turing_model.build(self.turing_params , hidden_size , vocab_size)
        # inputs are matrices of indices,
        # each row is a sentence, each column a timestep
        self._stop_word = theano.shared(np.int32(999999999), name="stop word")
        self.for_how_long = T.ivector()
        self.input_mat = T.imatrix()
        self.priming_word = T.iscalar()
        self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
        # create symbolic variables for prediction:
        #change by darong
        #issue : what is greedy
        self.lstm_predictions = self.create_lstm_prediction()
        self.final_predictions = self.create_final_prediction()
        # create symbolic variable for greedy search:
        self.greedy_predictions = self.create_lstm_prediction(greedy=True)
        # create gradient training functions:
        self.create_cost_fun()#create 2 cost func(lstm final)
        self.lstm_lr = 0.01
        self.turing_lr = 0.01
        self.all_lr = 0.01
        self.create_training_function()#create 3 functions(lstm turing all)
        self.create_predict_function()#create 2 predictions(lstm final)
        # create ppl
        self.lstm_ppl = self.create_lstm_ppl()
        self.final_ppl = self.create_final_ppl()
        self.create_ppl_function()

    def save(self, save_file, vocab):
        """Serialise the stacked-cell network and the vocabulary to disk."""
        pickle.dump(self.model, open(save_file, "wb")) # pickle is for lambda function, cPickle cannot
        pickle.dump(vocab, open(save_file+'.vocab', "wb")) # pickle is for lambda function, cPickle cannot

    def save_turing(self, save_file):
        """Save only the turing-head parameters (as `<save_file>.turing`)."""
        self.turing_params.save(save_file + '.turing')

    def load(self, load_file, lr):
        """Load the LSTM (and, if present, turing) parameters and recompile.

        If no `.turing` file exists the turing head is warm-started from the
        LSTM classifier layer's weights.
        """
        self.model = pickle.load(open(load_file, "rb"))
        if os.path.isfile(load_file + '.turing') :
            self.turing_params.load(load_file + '.turing')
        else :
            print "no turing model!!!! pretrain with lstm param"
            self.turing_params['W_input_hidden'] = self.model.layers[-1].params[0].get_value().T #not sure
            self.turing_params['W_read_hidden'] = self.model.layers[-1].params[0].get_value().T
            self.turing_params['b_hidden_0'] = self.model.layers[-1].params[1].get_value()
            # NOTE(review): reads `self.hidden_size`, which __init__ never sets —
            # confirm it is assigned elsewhere before load() is called.
            temp = self.model.layers[1].initial_hidden_state.get_value()[self.hidden_size:]
            self.turing_params['memory_init'] = temp.reshape((1,)+temp.shape)
        # need to compile again for calculating predictions after loading lstm
        self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
        self.lstm_predictions = self.create_lstm_prediction()
        self.final_predictions = self.create_final_prediction()
        self.greedy_predictions = self.create_lstm_prediction(greedy=True)#can change to final
        self.create_cost_fun()#create 2 cost func(lstm final)
        self.lstm_lr = lr
        self.turing_lr = lr#change this
        self.all_lr = lr
        self.create_training_function()#create 3 functions(lstm turing all)
        self.create_predict_function()#create 2 predictions(lstm final)
        self.lstm_ppl = self.create_lstm_ppl()
        self.final_ppl = self.create_final_ppl()
        self.create_ppl_function()
        print "done loading model"
        # print "done compile"

    def stop_on(self, idx):
        """Set the vocabulary index that terminates greedy search."""
        self._stop_word.set_value(idx)

    @property
    def params(self):
        # Only the stacked-cell parameters; turing_params are tracked separately.
        return self.model.params

    def create_lstm_prediction(self, greedy=False):
        """Build the symbolic softmax predictions of the LSTM path.

        With greedy=True, builds a scan that feeds back its own argmax and
        stops at `_stop_word` (capped at 200 steps).
        """
        def step(idx, *states):
            # new hiddens are the states we need to pass to LSTMs
            # from past. Because the StackedCells also include
            # the embeddings, and those have no state, we pass
            # a "None" instead:
            new_hiddens = [None] + list(states)
            new_states = self.model.forward(idx, prev_hiddens = new_hiddens)
            if greedy:
                new_idxes = new_states[-1]
                new_idx = new_idxes.argmax()
                # provide a stopping condition for greedy search:
                return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
            else:
                return new_states[1:]

        # in sequence forecasting scenario we take everything
        # up to the before last step, and predict subsequent
        # steps ergo, 0 ... n - 1, hence:
        inputs = self.input_mat[:, 0:-1]
        num_examples = inputs.shape[0]
        # pass this to Theano's recurrence relation function:

        # choose what gets outputted at each timestep:
        if greedy:
            outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
            result, _ = theano.scan(fn=step,
                                    n_steps=200,
                                    outputs_info=outputs_info)
        else:
            outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
            result, _ = theano.scan(fn=step,
                                    sequences=[inputs.T],
                                    outputs_info=outputs_info)

        if greedy:
            return result[0]
        # softmaxes are the last layer of our network,
        # and are at the end of our results list:
        return result[-1].transpose((2,0,1))
        # we reorder the predictions to be:
        # 1. what row / example
        # 2. what timestep
        # 3. softmax dimension

    def create_final_prediction(self, greedy=False):
        """Build symbolic predictions of the turing-head ("final") path.

        Same recurrence as create_lstm_prediction, but the output is produced
        by `turing_predict` applied to the top LSTM's hidden half-state.
        """
        def step(idx, *states):
            # new hiddens are the states we need to pass to LSTMs
            # from past. Because the StackedCells also include
            # the embeddings, and those have no state, we pass
            # a "None" instead:
            new_hiddens = [None] + list(states)
            new_states = self.model.forward(idx, prev_hiddens = new_hiddens)
            if greedy:
                new_idxes = new_states[-1]
                new_idx = new_idxes.argmax()
                # provide a stopping condition for greedy search:
                return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
            else:
                return new_states[1:]

        # in sequence forecasting scenario we take everything
        # up to the before last step, and predict subsequent
        # steps ergo, 0 ... n - 1, hence:
        inputs = self.input_mat[:, 0:-1]
        num_examples = inputs.shape[0]
        # pass this to Theano's recurrence relation function:

        # choose what gets outputted at each timestep:
        if greedy:
            outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
            result, _ = theano.scan(fn=step,
                                    n_steps=200,
                                    outputs_info=outputs_info)
        else:
            outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
            result, _ = theano.scan(fn=step,
                                    sequences=[inputs.T],
                                    outputs_info=outputs_info)

        if greedy:
            return result[0]
        # softmaxes are the last layer of our network,
        # and are at the end of our results list:
        # result[-2] stacks cell+hidden state; the second half is the hidden part.
        hidden_size = result[-2].shape[2]/2
        turing_result = self.turing_predict(result[-2][:,:,hidden_size:])
        #the last layer do transpose before compute
        return turing_result.transpose((1,0,2))
        # we reorder the predictions to be:
        # 1. what row / example
        # 2. what timestep
        # 3. softmax dimension

    def create_cost_fun (self):
        """Build masked sequence-prediction costs for both output paths."""
        # create a cost function that
        # takes each prediction at every timestep
        # and guesses next timestep's value:
        what_to_predict = self.input_mat[:, 1:]
        # because some sentences are shorter, we
        # place masks where the sentences end:
        # (for how long is zero indexed, e.g. an example going from `[2,3)`)
        # has this value set 0 (here we substract by 1):
        for_how_long = self.for_how_long - 1
        # all sentences start at T=0:
        starting_when = T.zeros_like(self.for_how_long)

        self.lstm_cost = masked_loss(self.lstm_predictions,
                                     what_to_predict,
                                     for_how_long,
                                     starting_when).sum()
        self.final_cost = masked_loss(self.final_predictions,
                                      what_to_predict,
                                      for_how_long,
                                      starting_when).sum()

    def create_predict_function(self):
        """Compile the lstm / final / greedy prediction functions."""
        self.lstm_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.lstm_predictions,
            allow_input_downcast=True
        )
        self.final_pred_fun = theano.function(
            inputs=[self.input_mat],
            outputs=self.final_predictions,
            allow_input_downcast=True
        )
        # Greedy output is the priming word followed by the generated indices.
        self.greedy_fun = theano.function(
            inputs=[self.priming_word],
            outputs=T.concatenate([T.shape_padleft(self.priming_word), self.greedy_predictions]),
            allow_input_downcast=True
        )

    def create_training_function(self):
        """Compile SGD update functions: lstm-only, turing-only, and joint."""
        updates, _, _, _, _ = create_optimization_updates(self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
        # updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta", lr=self.lr)
        self.lstm_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.lstm_cost,
            updates=updates,
            allow_input_downcast=True)

        updates_turing = self.turing_updates(self.final_cost , lr=self.turing_lr)
        # updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta", lr=self.lr)
        # NanGuardMode makes NaN/Inf in the turing path fail fast during training.
        self.turing_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_cost,
            updates=updates_turing,
            mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
            allow_input_downcast=True)

        all_updates_lstm, _, _, _, _ = create_optimization_updates(self.final_cost, self.params, method="SGD", lr=self.all_lr,part=True)
        all_updates_turing_temp = self.turing_updates(self.final_cost , lr=self.all_lr)
        # Merge the turing (param, new_value) pairs into the lstm update dict.
        updates_all = all_updates_lstm
        for pair in all_updates_turing_temp :
            updates_all[pair[0]] = pair[1]
        self.all_update_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_cost,
            updates=updates_all,
            allow_input_downcast=True)

    def create_lstm_ppl(self):
        """Symbolic corpus perplexity of the LSTM path, ignoring OOV (label 0)."""
        def timestep(predictions, label, len_example, total_len_example):
            # Mask: 1 for in-vocabulary labels, 0 for OOV (index 0).
            label_binary = T.gt(label[0:len_example-1], 0)
            oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
            a = total_len_example
            return T.sum(T.log( 1./ predictions[T.arange(len_example-1), label[0:len_example-1]]) * label_binary ), oov_count

        result, _ = theano.scan(fn=timestep,
                                sequences=[ self.lstm_predictions, self.input_mat[:, 1:], self.for_how_long ],
                                non_sequences=T.sum(self.for_how_long))
        oov_count_total = T.sum(result[1])
        # ppl = exp(total NLL / number of in-vocabulary tokens)
        return T.exp(T.sum(result[0]).astype(theano.config.floatX)/(T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)).astype(theano.config.floatX)

    def create_final_ppl(self):
        """Symbolic corpus perplexity of the turing path, ignoring OOV (label 0)."""
        def timestep(predictions, label, len_example, total_len_example):
            # Mask: 1 for in-vocabulary labels, 0 for OOV (index 0).
            label_binary = T.gt(label[0:len_example-1], 0)
            oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
            a = total_len_example
            return T.sum(T.log( 1./ predictions[T.arange(len_example-1), label[0:len_example-1]]) * label_binary ), oov_count

        result, _ = theano.scan(fn=timestep,
                                sequences=[ self.final_predictions, self.input_mat[:, 1:], self.for_how_long ],
                                non_sequences=T.sum(self.for_how_long))
        oov_count_total = T.sum(result[1])
        # ppl = exp(total NLL / number of in-vocabulary tokens)
        return T.exp(T.sum(result[0]).astype(theano.config.floatX)/(T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)).astype(theano.config.floatX)

    def create_ppl_function(self):
        """Compile the perplexity evaluation functions for both paths."""
        self.lstm_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.lstm_ppl,
            allow_input_downcast=True)
        self.final_ppl_fun = theano.function(
            inputs=[self.input_mat, self.for_how_long],
            outputs=self.final_ppl,
            allow_input_downcast=True)

    def __call__(self, x):
        # NOTE(review): `self.pred_fun` is never assigned in this class
        # (only lstm_pred_fun / final_pred_fun exist) — this would raise
        # AttributeError if called; confirm intent.
        return self.pred_fun(x)#any problem??
# Negative log-likelihood of the true labels under the predicted distribution.
# `predicted`, `label`, `P` and `X` are defined earlier in this script
# (outside this chunk).
cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
params = P.values()
gradients = T.grad(cost, wrt=params)
update_methods = {
    # Plain SGD with a fixed step of 0.001.
    'standard': [(p, p - 0.001 * g) for p, g in zip(params, gradients)],
    # 'rmsprop' : updates.rmsprop(params,gradients),
    # 'adadelta': updates.rmsprop(params,gradients),
}
# Snapshot initial parameters so every update method starts from the same point.
P.save('init.pkl')
for update_method in update_methods:
    print "Using update method:", update_method
    with open('train.%s.smart_init.log' % update_method, 'w') as log:
        train = theano.function(
            inputs=[X],
            outputs=cost,
            updates=update_methods[update_method],
        )
        P.load('init.pkl')
        # Train on random length-20 index sequences until the cost converges.
        while True:
            cost_val = train(
                np.random.randint(0, 8, size=20).astype(np.int32))
            log.write("%0.5f\n" % cost_val)
            print cost_val
            if cost_val < 0.01:
                break
        P.save('lstm.%s.smart_init.pkl' % update_method)
import model from theano_toolkit.parameters import Parameters from theano_toolkit import updates if __name__ == '__main__': model_filename = sys.argv[1] test_filename = sys.argv[2] train_filename = sys.argv[3] P = Parameters() data_X, df = data.load_test(test_filename, train_filename) f = model.build(P, input_size=data_X.shape[1], hidden_sizes=[256, 128, 64, 32] ) X = T.matrix('X') predict = theano.function( inputs=[X], outputs=f(X, test=True) > 0.5, ) P.load(model_filename) output = predict(data_X) print data_X.shape print output.shape print df.values.shape df['probs'] = predict(data_X) df['Class'] = 'b' df['Class'][df.probs > 0.5] = 's' df['RankOrder'] = df.probs.rank(ascending=False,method='first').astype(int) df.to_csv('data/submission.csv', cols=['EventId','RankOrder','Class'], index=False)
import model
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime
    P = Parameters()
    # One extra output class beyond the vocabulary (e.g. end-of-sequence).
    sampler = model.build_sampler(P,
                                  character_count=len(char2id) + 1,
                                  embedding_size=20,
                                  hidden_size=100)
    P.load(model_file)
    temp = T.scalar('temp')
    char = T.iscalar('char')
    # Previous cell/hidden state placeholders for the two recurrent layers.
    # BUG FIX: the layer-1 hidden placeholder was created as
    # T.vector("p_hidden_2"), duplicating the layer-2 vector's debug name.
    p_cell_1 = T.vector("p_cell_1")
    p_hidden_1 = T.vector("p_hidden_1")
    p_cell_2 = T.vector("p_cell_2")
    p_hidden_2 = T.vector("p_hidden_2")
    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(
        temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)
    # One sampling step: next-char distribution plus the updated states.
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
        outputs=[output, cell_1, hidden_1, cell_2, hidden_2])
    # Initial states from the trained parameters (hidden = tanh(cell init)).
    orig_c1 = P.init_recurrent_1_cell.get_value()
    orig_h1 = T.tanh(P.init_recurrent_1_hidden).eval()
    orig_c2 = P.init_recurrent_2_cell.get_value()
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime
    P = Parameters()
    # One extra output class beyond the vocabulary (e.g. end-of-sequence).
    sampler = model.build_sampler(P,
                                  character_count=len(char2id) + 1,
                                  embedding_size=20,
                                  hidden_size=100
                                  )
    P.load(model_file)
    temp = T.scalar('temp')
    char = T.iscalar('char')
    # Previous cell/hidden state placeholders for the two recurrent layers.
    # BUG FIX: the layer-1 hidden placeholder was created as
    # T.vector("p_hidden_2"), duplicating the layer-2 vector's debug name.
    p_cell_1 = T.vector("p_cell_1")
    p_hidden_1 = T.vector("p_hidden_1")
    p_cell_2 = T.vector("p_cell_2")
    p_hidden_2 = T.vector("p_hidden_2")
    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)
    # One sampling step: next-char distribution plus the updated states.
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
        outputs=[output, cell_1, hidden_1, cell_2, hidden_2]
    )
    # Initial states from the trained parameters (hidden = tanh(cell init)).
    orig_c1 = P.init_recurrent_1_cell.get_value()
    orig_h1 = T.tanh(P.init_recurrent_1_hidden).eval()
    orig_c2 = P.init_recurrent_2_cell.get_value()
    orig_h2 = T.tanh(P.init_recurrent_2_hidden).eval()
import theano.tensor as T
import numpy as np
from theano_toolkit import utils as U
from theano_toolkit import hinton
from theano_toolkit import updates
from theano_toolkit.parameters import Parameters
import ctc
import font
import lstm
from ocr import *

if __name__ == "__main__":
    import sys
    # Word to run through the trained OCR model is the first CLI argument.
    test_word = sys.argv[1]

    P = Parameters()
    X = T.matrix('X')
    # Output classes: one per font character plus one extra CTC "blank" label.
    predict = build_model(P,8,512,len(font.chars)+1)
    probs = predict(X)
    # NOTE(review): `theano.function` assumes `theano` is brought into scope by
    # `from ocr import *` — confirm; otherwise `import theano` is missing here.
    test = theano.function(inputs=[X],outputs=probs)
    P.load('model.pkl')

    # Render the word to a bitmap; transpose/flip so hinton.plot shows it upright.
    image = font.imagify(test_word)
    hinton.plot(image.astype(np.float32).T[::-1])

    y_seq = label_seq(test_word)
    # `probs` is rebound here to the numeric per-frame class distribution.
    probs = test(image)
    # Greedy per-frame decoding; "_" stands for the CTC blank class.
    print " ", ' '.join(font.chars[i] if i < len(font.chars) else "_"
                        for i in np.argmax(probs,axis=1))
    # Plot only the rows for the target label sequence (values already in [0,1]).
    hinton.plot(probs[:,y_seq].T,max_arr=1.)
    # (This chunk continues an `update_methods = {` dict opened above it.)
    # Plain SGD with a fixed step of 0.001.
    'standard': [ (p, p - 0.001 * g) for p,g in zip(params,gradients) ],
    # 'rmsprop' : updates.rmsprop(params,gradients),
    # 'adadelta': updates.rmsprop(params,gradients),
}

# Snapshot initial parameters so every update method starts from the same point.
P.save('init.pkl')
for update_method in update_methods:
    print "Using update method:",update_method
    with open('train.%s.smart_init.log'%update_method,'w') as log:
        train = theano.function(
            inputs = [X],
            outputs = cost,
            updates = update_methods[update_method],
        )
        P.load('init.pkl')
        # Train on random length-20 index sequences until the cost converges.
        while True:
            cost_val = train(np.random.randint(0,8,size=20).astype(np.int32))
            log.write("%0.5f\n"%cost_val)
            print cost_val
            if cost_val < 0.01:
                break
        P.save('lstm.%s.smart_init.pkl'%update_method)
    # (Tail of a batching function whose def line is above this chunk.)
    # Shuffle batches within a small buffer so epochs are not in file order.
    batched_stream = data_io.buffered_random(batched_stream, buffer_items=4)
    return batched_stream


def validate():
    """Frame-weighted average cost over the validation shard (train.00)."""
    stream = data_io.stream_file('data/train.%02d.pklgz' % 0)
    # Sort by utterance length within a buffer so batch padding is minimised.
    stream = data_io.buffered_sort(stream, key=lambda x: x[1].shape[0], buffer_items=128)
    batched_stream = reader.batch_and_pad(stream, batch_size=32, mean=mean, std=std)
    total_cost = 0
    total_frames = 0
    for data, lengths in batched_stream:
        batch_avg_cost = test(data,lengths)
        batch_frames = np.sum(lengths)
        # test() returns a per-frame average; weight by frame count before summing.
        total_cost += batch_avg_cost * batch_frames
        total_frames += batch_frames
    return total_cost / total_frames


import train_loop
train_loop.run(
    data_iterator=stream,
    train_fun=lambda batch:train(batch[0],batch[1]),
    validation_score=validate,
    save_best_params=lambda:P.save('model.pkl'),
    load_best_params=lambda:P.load('model.pkl'),
    max_epochs=1000,
    patience=5000,
    patience_increase=2,
    improvement_threshold=0.999,
)
X = T.itensor4('X')
# Normalise uint8 pixels to [0, 1] floats before feeding the autoencoder.
X_hat, posteriors, priors = \
    autoencoder(T.cast(X, 'float32') / np.float32(255.))
# Per-layer KL divergence between posterior and prior, averaged over the batch.
latent_kls = [
    T.mean(vae.kl_divergence(po_m, po_s, pr_m, pr_s), axis=0)
    for (po_m, po_s), (pr_m, pr_s) in zip(posteriors, priors)
]
# Reconstruction loss is measured only against the 32x32 centre crop.
recon_loss = model.cost(X_hat, X[:, :, 16:-16, 16:-16])
val_loss = (recon_loss + sum(latent_kls)) / (32**2)
X_recon = inpaint(T.cast(X, 'float32') / np.float32(255.))
Y = model.predict(X_recon)
fill = theano.function(inputs=[X],
                       outputs=[Y, val_loss, recon_loss / (32**2)] + latent_kls)
P.load('unval_model.pkl')
stream = data_io.stream_file("data/val2014.pkl.gz")
stream = data_io.buffered_random(stream)
stream = data_io.chunks((x[0] for x in stream), buffer_items=10)
# Exhaust the stream; the loop variable deliberately leaks, leaving `chunk`
# bound to the final chunk for the plotting code below.
for chunk in stream:
    pass
fig = plt.figure(figsize=(20, 5))
fig.subplots_adjust(left=0, bottom=0, right=1, top=1,
                    wspace=None, hspace=None)


def plot(i):
    # (Body continues past this chunk; reads/writes the module-level `chunk`.)
    global chunk
    # (Body of validate(); the def line is above this chunk.)
    stream = data_io.stream_file('data/train.%02d.pklgz' % 0)
    # Sort by utterance length within a buffer so batch padding is minimised.
    stream = data_io.buffered_sort(stream, key=lambda x: x[1].shape[0], buffer_items=128)
    batched_stream = reader.batch_and_pad(stream, batch_size=32, mean=mean, std=std)
    total_cost = 0
    total_frames = 0
    for data, lengths in batched_stream:
        batch_avg_cost = test(data, lengths)
        batch_frames = np.sum(lengths)
        # test() returns a per-frame average; weight by frame count before summing.
        total_cost += batch_avg_cost * batch_frames
        total_frames += batch_frames
    return total_cost / total_frames


import train_loop
train_loop.run(
    data_iterator=stream,
    train_fun=lambda batch: train(batch[0], batch[1]),
    validation_score=validate,
    save_best_params=lambda: P.save('model.pkl'),
    load_best_params=lambda: P.load('model.pkl'),
    max_epochs=1000,
    patience=5000,
    patience_increase=2,
    improvement_threshold=0.999,
)