def random_search_gpu(modal_names, train_probs, val_probs, target_train,
                      target_val, numpy_rng, n_iter=400):
    n_modal = train_probs.shape[0]
    n_cls = train_probs.shape[2]
    # sample random weights and normalize so the modalities sum to 1
    # for each class
    weight_samples = T.ftensor3('weight_samples')
    probs = T.ftensor3('probs')
    targets = T.ivector('targets')
    preds = T.argmax(
        T.sum(probs.dimshuffle('x', 0, 1, 2) * weight_samples.dimshuffle(0, 1, 'x', 2), axis=1),
        axis=2)
    accs = T.mean(T.eq(preds, targets.dimshuffle('x', 0)), axis=1)
    best_index = T.argmax(accs)
    best_acc = accs[best_index]
    best_weights = weight_samples[best_index]
    print 'compiling function'
    fn = theano.function([weight_samples, probs, targets],
                         [best_weights, best_index, best_acc])
    print 'done'
    weight_samples_np = numpy_rng.rand(n_iter, n_modal, n_cls).astype(np.float32)
    weight_samples_np /= weight_samples_np.sum(1)[:, None, :]
    return fn(weight_samples_np, val_probs, target_val)
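# A minimal usage sketch for random_search_gpu (an illustration, not part of the
# original code): it assumes the probability arrays are stacked per modality as
# (n_modal, n_samples, n_cls) float32 and the targets are int32 class labels;
# the modality names below are placeholders.
import numpy as np
rng = np.random.RandomState(0)
val_probs = rng.rand(3, 100, 10).astype(np.float32)        # 3 modalities, 100 samples, 10 classes
target_val = rng.randint(0, 10, size=100).astype(np.int32)
best_weights, best_index, best_acc = random_search_gpu(
    ['audio', 'video', 'text'], val_probs, val_probs, target_val, target_val,
    rng, n_iter=50)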
def test_pycuda_elemwise_kernel():
    x = T.fmatrix('x')
    y = T.fmatrix('y')
    f = theano.function([x, y], x + y, mode=mode_with_gpu)
    print f.maker.env.toposort()
    f2 = theano.function([x, y], x + y,
                         mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f2.maker.env.toposort()
    assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
                for node in f.maker.env.toposort()])
    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
                for node in f2.maker.env.toposort()])
    val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
    val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
    #val1 = numpy.ones((5,5))
    #val2 = numpy.arange(25).reshape(5,5)
    assert (f(val1, val2) == f2(val1, val2)).all()
    print f(val1, val2)
    print f2(val1, val2)

    x3 = T.ftensor3('x')
    y3 = T.ftensor3('y')
    z3 = T.ftensor3('z')
    f4 = theano.function([x3, y3, z3], x3 * y3 + z3,
                         mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f4.maker.env.toposort()
    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
                for node in f4.maker.env.toposort()])
    val1 = numpy.random.rand(2, 2, 2).astype('float32')
    print val1
    print f4(val1, val1, val1)
    assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
def make_node(self, x, x2, x3, x4, x5):
    # check that the theano version has support for __props__.
    # This next line looks like it has a typo,
    # but it's actually a way to detect the theano version
    # is sufficiently recent to support the use of __props__.
    assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
    x = tensor.as_tensor_variable(x)
    x2 = tensor.as_tensor_variable(x2)
    x3 = tensor.as_tensor_variable(x3)
    x4 = tensor.as_tensor_variable(x4)
    x5 = tensor.as_tensor_variable(x5)
    if prm.att_doc:
        if prm.compute_emb:
            td = tensor.itensor4().type()
        else:
            td = tensor.ftensor4().type()
        tm = tensor.ftensor3().type()
    else:
        if prm.compute_emb:
            td = tensor.itensor3().type()
        else:
            td = tensor.ftensor3().type()
        tm = tensor.fmatrix().type()
    return theano.Apply(self, [x, x2, x3, x4, x5],
                        [td, tm, tensor.fmatrix().type(), tensor.ivector().type()])
def test_attention_dot_does_not_crash():
    Z = T.ftensor3('Z')
    B = T.ftensor3('B')  #base
    W_re = T.fmatrix('W_re')
    W_att_quadr = T.fmatrix("W_att_quadr")
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')  #initial state
    y0 = T.fmatrix('y0')  #initial activation
    i = T.matrix('i', dtype='int8')
    Y, H, d = LSTMCustomDotAttentionOpNoInplaceInstance(Z, c, y0, i, W_re, B, W_att_in, W_att_quadr)
    f = theano.function(inputs=[Z, B, c, y0, i, W_re, W_att_in, W_att_quadr], outputs=Y)

    n_B = 8
    n_T = 5
    n_batch = 4
    n_cells = 8
    numpy.random.seed(1234)
    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    B_val = numpy.random.ranf((n_B, n_batch, n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_quadr_val = numpy.eye(n_B).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    #i_val = numpy.ones((n_T, n_batch), dtype='int8')
    i_val = numpy.array([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1], [0, 0, 1, 1, 1], [0, 0, 1, 0, 0]],
                        dtype='int8').T
    Y_val = numpy.asarray(f(Z_val, B_val, c_val, y0_val, i_val, W_re_val, W_att_in_val, W_att_quadr_val))
    #print Y_val
    print("success")
def weighting():
    from theano.tensor import TensorType
    x = T.ftensor3()
    # w = TensorType('float32', (False, False, True))()
    w = T.ftensor3()
    # z = T.dot(w, x)
    # y = T.addbroadcast(w, 2)
    # y = w.reshape([w.shape[0], w.shape[1]])
    y = T.flatten(w, 2)
    z = x * y
    f = theano.function(inputs=[x, w], outputs=z)
    input1 = np.arange(8).reshape([2, 2, 2]).astype('float32')
    input2 = np.array([[[0.1], [0.2]], [[0.2], [0.4]]]).astype('float32')
    print input1, input1.shape
    print
    print input2, input2.shape
    print
    print f(input1, input2)
def _setup_vars(self, sparse_input):
    '''Setup Theano variables for our network.

    Parameters
    ----------
    sparse_input : bool
        Not used -- sparse inputs are not supported for recurrent networks.

    Returns
    -------
    vars : list of theano variables
        A list of the variables that this network requires as inputs.
    '''
    _warn_dimshuffle()
    assert not sparse_input, 'Theanets does not support sparse recurrent models!'
    self.src = TT.ftensor3('src')
    #self.src_mask = TT.imatrix('src_mask')
    self.src_mask = TT.matrix('src_mask')
    self.dst = TT.ftensor3('dst')
    self.labels = TT.imatrix('labels')
    self.weights = TT.matrix('weights')
    if self.weighted:
        return [self.src, self.src_mask, self.dst, self.labels, self.weights]
    return [self.src, self.dst]
def test_batched_dot():
    a = T.ftensor3('a')
    b = T.ftensor3('b')
    c = my_batched_dot(a, b)

    # Test with values
    dim1, dim2, dim3, dim4 = 10, 12, 15, 20
    A_shape = (dim1, dim2, dim3)
    B_shape = (dim1, dim3, dim4)
    C_shape = (dim1, dim2, dim4)

    A = np.arange(np.prod(A_shape)).reshape(A_shape).astype(floatX)
    B = np.arange(np.prod(B_shape)).reshape(B_shape).astype(floatX)
    C = c.eval({a: A, b: B})

    # check shape
    assert C.shape == C_shape

    # check content
    C_ = np.zeros((dim1, dim2, dim4))
    for i in range(dim1):
        C_[i] = np.dot(A[i], B[i])
    assert np.allclose(C, C_)
def test_infer_shape(self):
    # only matrix / matrix is supported
    admat = tensor.ftensor3()
    bdmat = tensor.ftensor3()
    admat_val = my_rand(7, 4, 5)
    bdmat_val = my_rand(7, 5, 3)
    self._compile_and_check([admat, bdmat],
                            [GpuBatchedDot()(admat, bdmat)],
                            [admat_val, bdmat_val],
                            GpuBatchedDot)
def testSNLIExample(): """ Test an example actually taken from SNLI dataset on LSTM pipeline. """ start = time.time() table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz") dataStats= "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \ "test_dataStats.json" dataJSONFile= "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \ "snli_1.0_test.jsonl" premiseTensor, hypothesisTensor = table.convertDataToEmbeddingTensors( dataJSONFile, dataStats) symPremise = T.ftensor3("inputPremise") symHypothesis = T.ftensor3("inputHypothesis") premiseSent = premiseTensor[:, 0:3, :] hypothesisSent = hypothesisTensor[:, 0:3, :] network = LSTMP2H(numTimestepsPremise=57, numTimestepsHypothesis=30, dimInput=10, embedData="/Users/mihaileric/Documents/Research/" "LSTM-NLI/data/glove.6B.50d.txt.gz") network.printLSTMP2HParams() predictFunc = network.predictFunc(symPremise, symHypothesis) labels = network.predict(premiseSent, hypothesisSent, predictFunc) for l in labels: print "Label: %s" %(l) print "Time for evaluation: %f" %(time.time() - start)
def theano_vars(self):
    if self.cond:
        return [T.ftensor3('x'), T.fmatrix('mask'),
                T.ftensor3('y'), T.fmatrix('label_mask')]
    else:
        return [T.ftensor3('x'), T.fmatrix('mask')]
def cmp(a_shp, b_shp):
    a = numpy.random.randn(*a_shp).astype(numpy.float32)
    b = numpy.random.randn(*b_shp).astype(numpy.float32)
    x = tensor.ftensor3()
    y = tensor.ftensor3()
    f = theano.function([x, y], batched_dot(x, y), mode=mode_with_gpu)
    z0 = numpy.asarray(f(a, b))

    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    z1 = numpy.asarray(f(ga, gb))

    z_test = numpy.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)
    unittest_tools.assert_allclose(z0, z_test)
    unittest_tools.assert_allclose(z1, z_test)
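# Side note (not in the original test): for 3-D inputs the broadcast-and-sum
# reference above is exactly the batched matrix product, so it can also be
# written as an einsum; a quick NumPy check under that assumption:
import numpy as np
a = np.random.randn(4, 3, 5).astype(np.float32)
b = np.random.randn(4, 5, 2).astype(np.float32)
ref = np.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)
assert np.allclose(ref, np.einsum('bij,bjk->bik', a, b), atol=1e-5)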
def test_multiple_inputs():
    X = T.ftensor3('X')
    X2 = T.ftensor3('X2')
    W = T.fmatrix('W')
    V_h = T.fmatrix('V_h')
    b = T.fvector('b')
    c = T.fmatrix('c')  #initial state
    i = T.matrix('i', dtype='int8')
    X_val_mat0 = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype='float32')
    X_val_mat1 = 0.1 * numpy.array([[5, 1, 8], [7, 0, 1]], dtype='float32')
    X_val_mat2 = 0.1 * numpy.array([[2, 1, 1], [-7, 0, -1]], dtype='float32')
    X_val = numpy.zeros((3, 2, 3), dtype='float32')
    X_val[0, :, :] = X_val_mat0
    X_val[1, :, :] = X_val_mat1
    X_val[2, :, :] = X_val_mat2
    X_val2 = numpy.zeros_like(X_val)
    # should be divisible by 4 for lstm, attention: note the .T
    W_val = 0.1 * numpy.array([[3, 1, 2], [4, 8, 0], [7, 7, 1], [4, 2, -5],
                               [6, -1, -2], [-4, 8, 0], [-7, 2, 1], [4, -2, -5],
                               [6, 5, -2], [-4, 8, -6], [-7, 3, -1], [4, 2, -5]],
                              dtype='float32').T
    # (for lstm) size 1/4th
    V_h_val = 0.1 * numpy.array([[1, 3, 5], [2, -1, -1], [4, 8, -5], [0, -2, 3],
                                 [7, 7, 7], [1, 2, 3], [5, 2, 1], [-4, 8, -4],
                                 [-3, 7, -7], [2, -2, -3], [-5, 2, 1], [-4, -5, -4]],
                                dtype='float32').T
    b_val = 0.1 * numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='float32')
    c_val = numpy.zeros((2, 3), dtype='float32')
    i_val = numpy.ones((3, 2), dtype='int8')

    Z1, H1, d1 = LSTMOp2Instance(V_h, c, b, i, X, W)
    Z2, H2, d2 = LSTMOp2Instance(V_h, c, b, i, X, X2, W, W)
    Z3, H3, d3 = LSTMOp2Instance(V_h, c, b, i)  # no inputs!
    DX1 = T.grad(Z1.sum(), X)
    DW1 = T.grad(Z1.sum(), W)
    DV_h1 = T.grad(Z1.sum(), V_h)
    Db1 = T.grad(Z1.sum(), b)
    Dc1 = T.grad(Z1.sum(), c)
    DX2 = T.grad(Z2.sum(), X)
    DW2 = T.grad(Z2.sum(), W)
    DV_h2 = T.grad(Z2.sum(), V_h)
    Db2 = T.grad(Z2.sum(), b)
    Dc2 = T.grad(Z2.sum(), c)
    DV_h3 = T.grad(Z3.sum(), V_h)

    f = theano.function(inputs=[X, W, V_h, c, b, i], outputs=[Z1, DX1, DW1])
    g = theano.function(inputs=[X, X2, W, V_h, c, b, i], outputs=[Z2, DX2, DW2])
    h = theano.function(inputs=[V_h, c, b, i], outputs=[Z3, DV_h3])

    h_res = [numpy.asarray(A, dtype='float32') for A in h(V_h_val, c_val, b_val, i_val)]
    #print h_res[0], h_res[1]
    f_res = [numpy.asarray(A, dtype='float32')
             for A in f(X_val, W_val, V_h_val, c_val, b_val, i_val)]
    g_res = [numpy.asarray(A, dtype='float32')
             for A in g(X_val, X_val2, W_val, V_h_val, c_val, b_val, i_val)]
    for A1, A2 in zip(f_res, g_res):
        assert numpy.allclose(A1, A2)
    #print f_res[0], g_res[0]
    print "success"
def test_outer_infershape(self):
    o = tensor.ftensor4()
    x = tensor.ftensor3()
    y = tensor.ftensor3()
    xIdx = tensor.imatrix()
    yIdx = tensor.imatrix()
    self._compile_and_check([o, x, y, xIdx, yIdx],
                            [self.outer_op(o, x, y, xIdx, yIdx)],
                            self.outer_data(),
                            self.outer_class)
def test_attention_time_gauss(): n_T = 4 n_batch = 2 n_inp_dim = 3 n_cells = 5 n_B = 5 custom_op = get_attention(RecurrentTransform.AttentionTimeGauss, n_out=n_cells, n_batches=n_batch, n_input_t=n_B, n_input_dim=n_inp_dim) att = custom_op.recurrent_transform Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32') W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32') W_att_quadr_val = numpy.eye(n_B).astype('float32') W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32') B_val = numpy.random.ranf((n_B,n_batch,n_cells)).astype('float32') c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32') y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32') i_val = numpy.ones((n_T, n_batch), dtype='int8') Z = T.ftensor3('Z') B = T.ftensor3('B') #base W_re = T.fmatrix('W_re') W_att_quadr = T.fmatrix("W_att_quadr") W_att_in = T.fmatrix('W_att_in') c = T.fmatrix('c') #initial state y0 = T.fmatrix('y0') #initial activation i = T.matrix('i',dtype='int8') t0 = T.fvector('t0') custom_vars = att.get_sorted_custom_vars() initial_state_vars = att.get_sorted_state_vars_initial() custom_op_inputs = [Z, c, y0, i, W_re] + custom_vars + initial_state_vars print("input args num:", len(custom_op_inputs)) print("input args:", custom_op_inputs) custom_op_outputs = custom_op(*custom_op_inputs) print("output args num:", len(custom_op_outputs)) custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs] f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs) res = f(Z_val, c_val, y0_val, i_val, W_re_val) #print res # res: (output) Y, (gates and cell state) H, (final cell state) d, state vars sequences (Y, H, d), state_var_seqs = res[:3], res[3:] # print "running custom dumped data" # custom_op_inputs = [theano.shared(numpy.load("../op.i.%i" % i)) for i in range(12)] # custom_op_outputs = custom_op(*custom_op_inputs) # custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs] # f = theano.function(inputs=[], outputs=custom_op_outputs) # res = f() print(res) assert False
def fail(a_shp, b_shp):
    a = numpy.random.randn(*a_shp).astype(numpy.float32)
    b = numpy.random.randn(*b_shp).astype(numpy.float32)
    x = tensor.ftensor3()
    y = tensor.ftensor3()
    f = theano.function([x, y], batched_dot(x, y), mode=mode_with_gpu)
    z = f(a, b)
def test_tensor3_roc_auc_scores():
    true = np.random.binomial(n=1, p=.5, size=(20, 30, 40)).astype('float32')
    predicted = np.random.random((20, 30, 40)).astype('float32')
    yt, yp = T.ftensor3('yt'), T.ftensor3('yp')
    refscore = tmetrics.classification.last_axis_roc_auc_scores(true, predicted)
    roc_auc_scores = tmetrics.classification.roc_auc_scores(yt, yp)
    f = theano.function([yt, yp], roc_auc_scores)
    score = f(true, predicted)
    print 'refscore'
    print refscore
    print 'score'
    print score
    assert np.allclose(refscore, score, equal_nan=True)
def experiment(train_data, train_labels, test_data, test_labels): x = T.ftensor3('input_data') no_of_patches = 64 cs_args = { "train_args":{ "learning_rate": 0.08, "nepochs": 200, "cost_type": "crossentropy", "save_exp_data": False, "batch_size": 100, "randomize_mb": True, "enable_dropout": False }, "test_args":{ "save_exp_data":False, "batch_size": 2000 } } post_mlp = StructuredMLP(x, in_layer_shape=(no_of_patches, 81, 200, 128), layer2_in=1024, activation=NeuralActivations.Rectifier, n_out=1, quiet=True, #momentum=0.9, save_file="./pkls/structured_mlp_1000_11outs_1hot.pkl", use_adagrad=False) post_mlp.set_test_data(test_data, test_labels, patch_mode=False) print "=============((((()))))===============" print "Training on the dataset." post_mlp.train(train_data, train_labels, **cs_args["train_args"])
def create_iterator_functions(self): # Define input and target variables input_var = T.ftensor3('inputs') target_var = T.ivector('targets') hop_length = (par.STEP_SIZE / 1000.0) * par.SR self.net = build_model_small((None, par.N_COMPONENTS, int(par.MAX_LENGTH/hop_length)), input_var) #with open('models/499.pkl', 'rb') as f: #param_values = pickle.load(f) #lasagne.layers.set_all_param_values(self.net['prob'], param_values) # Define prediction and loss calculation prediction = lasagne.layers.get_output(self.net['prob'], inputs=input_var) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() # Define updates params = lasagne.layers.get_all_params(self.net['prob'], trainable=True) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=self.update_learning_rate, momentum=0.9) # Define test time prediction test_prediction = lasagne.layers.get_output(self.net['prob'], inputs=input_var, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() # Compile functions self.train_fn = theano.function([input_var, target_var], loss, updates=updates) self.val_fn = theano.function([input_var, target_var], [test_loss, test_prediction])
def get_input_data(self, name, mcp, input_dims):
    """ """
    input_names = mcp.safe_get_list(name, 'inputs')
    l = []
    for e, dim in zip(input_names, input_dims):
        if e in self.symbolic_var_dic:
            print(' use symbolic variable {}'.format(e))
            l += [self.symbolic_var_dic[e]]
        else:
            try:
                if isinstance(dim, int):
                    sym, descr = tensor.fmatrix(e), 'fmatrix'
                elif isinstance(dim, tuple):
                    t = len(dim)
                    if t != 2 and t != 3:
                        raise Exception('Unsupported dimension {}'.format(t))
                    if t == 2:
                        sym, descr = tensor.ftensor3(e), 'tensor3'
                    else:
                        sym, descr = tensor.ftensor4(e), 'tensor4'
                print(' create symbolic variable {} as {}'.format(e, descr))
                self.symbolic_var_dic[e] = sym
                l += [sym]
            except Exception as err:
                print(err)
                sys.exit(1)
    return input_names, l
def experiment(train_data, train_labels, test_data, test_labels): x = T.ftensor3('input_data') no_of_patches = 64 train_patches = get_dataset_patches(train_data) cs_args = { "train_args":{ "learning_rate": 0.0015, "nepochs": 60, "cost_type": "crossentropy", "save_exp_data": False, "batch_size": 100, "enable_dropout": False }, "test_args":{ "save_exp_data":False, "batch_size": 2000 } } post_mlp = StructuredMLP(x, in_layer_shape=(no_of_patches, no_of_patches, 200, 256), layer2_in=2048, activation=NeuralActivations.Rectifier, layer1_nout=100, n_out=1, quiet=True, save_file="structured_mlp_100k_100lbls_g.pkl", use_adagrad=False) post_mlp.set_test_data(test_data, test_labels) print "=============((((()))))===============" print "Training on the dataset." post_mlp.train(train_patches, train_labels, **cs_args["train_args"])
def test_does_not_crash():
    Z = T.ftensor3('Z')
    W_re = T.fmatrix('W_re')
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')  #initial state
    y0 = T.fmatrix('y0')  #initial activation
    i = T.matrix('i', dtype='int8')
    Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)
    f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)

    n_T = 5
    n_batch = 4
    n_inp_dim = 3
    n_cells = 8
    numpy.random.seed(1234)
    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    #i_val = numpy.ones((n_T, n_batch), dtype='int8')
    i_val = numpy.array([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1], [0, 0, 1, 1, 1], [0, 0, 1, 0, 0]],
                        dtype='int8').T
    Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
    #print Y_val
    print "success"
def build_loss_graph(self, saved_graph=None): print("Building loss graph...") for l in self.layers: l.set_training(False) Sentence = T.fmatrix('Sentence') Characters = T.ftensor3('Characters') WordLengths = T.ivector('WordLengths') GoldPredictions = T.fmatrix('GoldPredictions') weight_list = self.get_theano_weight_list() if self.feature_mode == 'character': result = self.theano_sentence_loss(Characters, WordLengths, GoldPredictions) input_list = [Characters, WordLengths, GoldPredictions] + list(weight_list) elif self.feature_mode == 'sentence': result = self.theano_sentence_loss(Sentence, GoldPredictions) input_list = [Sentence, GoldPredictions] + list(weight_list) elif self.feature_mode == 'both': result = self.theano_sentence_loss(Sentence, Characters, WordLengths, GoldPredictions) input_list = [Sentence, Characters, WordLengths, GoldPredictions] + list(weight_list) cgraph = theano.function(inputs=input_list, outputs=result, mode='FAST_RUN', allow_input_downcast=True) print("Done building graph.") return cgraph
def test_sparseblockgemvF(self):
    """
    Test the Fortran order for W (which can happen in the grad for some
    graphs).
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    o = self.gemv_op(b.take(oIdx, axis=0),
                     tensor.DimShuffle((False, False, False, False),
                                       (0, 1, 3, 2))(tensor.as_tensor_variable(W)),
                     h, iIdx, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

    W_val, h_val, iIdx_val, b_val, oIdx_val = \
        BlockSparse_Gemv_and_Outer.gemv_data()

    th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(ref_out, th_out)
def make_node(self, acts, input_lengths, flat_labels, label_lengths): acts_ = T.as_tensor_variable(acts) input_lengths_ = T.as_tensor_variable(input_lengths) flat_labels_ = T.as_tensor_variable(flat_labels) label_lengths_ = T.as_tensor_variable(label_lengths) if acts_.dtype != "float32": raise Exception("acts must be float32 instead of %s" % acts.dtype) if input_lengths.dtype != "int32": raise Exception("input_lengths must be int32 instead of %s" % input_lengths.dtype) if flat_labels.dtype != "int32": raise Exception("flat_labels must be int32 instead of %s" % flat_labels.dtype) if label_lengths.dtype != "int32": raise Exception("label_lengths must be int32 instead of %s" % label_lengths.dtype) # Normally a singleton Op instance is created, and different Apply nodes are # created for different inputs. # Here, we create an Op instance specifically for this application, # and store the gradient variable in it so that it can be used by grad(). op = CpuCtc() op.costs = T.fvector(name="ctc_cost") op.gradients = T.ftensor3(name="ctc_grad") # Don't compute gradient unless needed op.computeGradient = theano.shared(np.asarray([1], dtype=np.int32)) applyNode = theano.Apply(op, inputs=[acts_, input_lengths_, flat_labels_, label_lengths_, op.computeGradient], outputs=[op.costs, op.gradients]) # Return only the cost. Gradient will be returned by grad() self.default_output = 0 return applyNode
def test_transfer(self): tensor1 = self.rng.rand(20, 10, 5, 8).astype("float32") tensor2 = self.rng.rand(5, 8, 20).astype("float32") tensor3 = self.rng.rand(8, 20, 5).astype("float32") x = tensor.ftensor4("x") y = tensor.ftensor3("y") tdot1 = tensor.tensordot(x, y, 2) f1 = theano.function([x, y], tdot1, mode=mode_with_gpu) topo1 = f1.maker.fgraph.toposort() assert topo1[-1].op == cuda.host_from_gpu # Let DebugMode debug f1(tensor1, tensor2) tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)]) f2 = theano.function([x, y], tdot2, mode=mode_with_gpu) topo2 = f2.maker.fgraph.toposort() assert topo2[-1].op == cuda.host_from_gpu f2(tensor1, tensor3) tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)]) f3 = theano.function([x, y], tdot3, mode=mode_with_gpu) topo3 = f3.maker.fgraph.toposort() assert topo3[-1].op == cuda.host_from_gpu f3(tensor1, tensor3)
def build_theano_gru(self, innerdim, indim, batsize, gru): u = theano.shared(gru.u.d.get_value()) w = theano.shared(gru.w.d.get_value()) um = theano.shared(gru.um.d.get_value()) wm = theano.shared(gru.wm.d.get_value()) uhf = theano.shared(gru.uhf.d.get_value()) whf = theano.shared(gru.whf.d.get_value()) b = theano.shared(gru.b.d.get_value()) bm = theano.shared(gru.bm.d.get_value()) bhf = theano.shared(gru.bhf.d.get_value()) def rec(x_t, h_tm1): mgate = T.nnet.sigmoid(T.dot(h_tm1, um) + T.dot(x_t, wm) + bm) hfgate = T.nnet.sigmoid(T.dot(h_tm1, uhf) + T.dot(x_t, whf) + bhf) canh = T.tanh(T.dot(h_tm1 * hfgate, u) + T.dot(x_t, w) + b) h = mgate * h_tm1 + (1-mgate) * canh return [h, h] def apply(x): inputs = x.dimshuffle(1, 0, 2) # inputs is (seq_len, batsize, dim) init_h = T.zeros((batsize, innerdim)) outputs, _ = theano.scan(fn=rec, sequences=inputs, outputs_info=[None, init_h]) output = outputs[0] return output[-1, :, :] #.dimshuffle(1, 0, 2) # return is (batsize, seqlen, dim) inp = T.ftensor3() return inp, apply(inp)
def test_read(self):
    batch_size = 100
    height, width = self.height, self.width
    N = self.N
    zaw = self.zaw

    # Create theano function
    images = T.ftensor3('images')
    center_y, center_x = T.fvectors('center_y', 'center_x')
    delta, sigma = T.fvectors('delta', 'sigma')

    readout = zaw.read(images, center_y, center_x, delta, sigma)

    do_read = theano.function(
        [images, center_y, center_x, delta, sigma],
        readout,
        name="do_read",
        allow_input_downcast=True)

    # Test theano function
    images = np.random.uniform(size=(batch_size, height, width))
    center_y = np.linspace(-height, 2 * height, batch_size)
    center_x = np.linspace(-width, 2 * width, batch_size)
    delta = np.linspace(0.1, height, batch_size)
    sigma = np.linspace(0.1, height, batch_size)

    readout = do_read(images, center_y, center_x, delta, sigma)

    assert readout.shape == (batch_size, N**2)
    assert np.isfinite(readout).all()
    assert (readout >= 0.).all()
    assert (readout <= 1.).all()
def test_fwd_pass_compatible_with_OpLSTM(): Z = T.ftensor3('Z') W_re = T.fmatrix('W_re') W_att_in = T.fmatrix('W_att_in') c = T.fmatrix('c') #initial state y0 = T.fmatrix('y0') #initial activation i = T.matrix('i',dtype='int8') Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in) W_re_modified = W_re + W_att_in Z_modified = T.inc_subtensor(Z[0], T.dot(y0,W_re_modified)) Y2, H2, d2 = LSTMOpInstance(Z_modified, W_re_modified, c, i) f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y) g = theano.function(inputs=[Z, W_re, c, y0, i, W_att_in], outputs=Y2) n_T = 5 n_batch = 4 n_inp_dim = 3 n_cells = 8 numpy.random.seed(1234) Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32') W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32') W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32') c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32') y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32') #i_val = numpy.ones((n_T, n_batch), dtype='int8') i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val)) Y2_val = numpy.asarray(g(Z_val, W_re_val, c_val, y0_val, i_val, W_att_in_val)) assert numpy.allclose(Y_val, Y2_val) print("success")
def make_node(self, activations, labels, input_lengths):
    t_activations = T.as_tensor_variable(activations)
    # Ensure activations array is C-contiguous
    t_activations = cpu_contiguous(t_activations)
    t_labels = T.as_tensor_variable(labels)
    t_input_lengths = T.as_tensor_variable(input_lengths)

    if t_activations.type.dtype != 'float32':
        raise TypeError('activations must use the float32 type!')
    if t_activations.ndim != 3:
        raise ValueError('activations must have 3 dimensions.')

    if t_labels.type.dtype != 'int32':
        raise TypeError('labels must use the int32 type!')
    if t_labels.ndim != 2:
        raise ValueError('labels must have 2 dimensions.')

    if t_input_lengths.type.dtype != 'int32':
        raise TypeError('input_lengths must use the int32 type!')
    if t_input_lengths.ndim != 1:
        raise ValueError('input_lengths must have 1 dimension.')

    costs = T.fvector(name="ctc_cost")
    outputs = [costs]
    if self.compute_grad:
        gradients = T.ftensor3(name="ctc_grad")
        outputs += [gradients]

    return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                     outputs=outputs)
def test_blocksparse_grad_merge(): b = tensor.fmatrix() h = tensor.ftensor3() iIdx = tensor.lmatrix() oIdx = tensor.lmatrix() W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data() W = float32_shared_constructor(W_val) o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx) gW = theano.grad(o.sum(), W) lr = numpy.asarray(0.05, dtype='float32') upd = W - lr * gW f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu) # not running with mode=gpu ensures that the elemwise is not merged in mode = None if theano.config.mode == 'FAST_COMPILE': mode = theano.compile.mode.get_mode('FAST_RUN') f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode) f2(h_val, iIdx_val, b_val, oIdx_val) W_ref = W.get_value() # reset the var W.set_value(W_val) f1(h_val, iIdx_val, b_val, oIdx_val) W_opt = W.get_value() utt.assert_allclose(W_ref, W_opt)
def __init__(self, Nbranches = 1, # number of branches (parallel models to be fused) Nlayers = 1, # number of layers Ndirs = 1, # unidirectional or bidirectional Nx = 100, # input size Nh = 100, # hidden layer size Ny = 100, # output size Ah = "relu", # hidden unit activation (e.g. relu, tanh, lstm) Ay = "linear", # output unit activation (e.g. linear, sigmoid, softmax) predictPer = "frame", # frame or sequence loss = None, # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge) L1reg = 0.0, # L1 regularization L2reg = 0.0, # L2 regularization multiReg = 0.0, # regularization of agreement of predictions on data of different conditions momentum = 0.0, # SGD momentum seed = 15213, # random seed for initializing the weights frontEnd = None, # a lambda function for transforming the input filename = None, # initialize from file initParams = None, # initialize from given dict ): if filename is not None: # load parameters from file with smart_open(filename, "rb") as f: initParams = dill.load(f) if initParams is not None: # load parameters from given dict self.paramNames = [] self.params = [] for k, v in initParams.iteritems(): if type(v) is numpy.ndarray: self.addParam(k, v) else: setattr(self, k, v) self.paramNames.append(k) # F*ck, locals()[k] = v doesn't work; I have to do this statically Nbranches, Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \ = self.Nbranches, self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd else: # Initialize parameters randomly # Names of parameters to save to file self.paramNames = ["Nbranches", "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"] for name in self.paramNames: value = locals()[name] setattr(self, name, value) # Values of parameters for building the computational graph self.params = [] # Initialize random number generators global rng rng = numpy.random.RandomState(seed) # Construct parameter matrices Nlstm = 4 if Ah == 'lstm' else 1 self.addParam("Win", rand_init((Nbranches, Nx, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wrec", rand_init((Nbranches, Nlayers, Ndirs, Nh, Nh * Nlstm), Ah)) self.addParam("Wup", rand_init((Nbranches, Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wout", rand_init((Nbranches, Nh * Ndirs, Ny), Ay)) if Ah != "lstm": self.addParam("Bhid", zeros((Nbranches, Nlayers, Nh * Ndirs))) else: self.addParam("Bhid", numpy.tile(numpy.concatenate([full((Nbranches, Nlayers, Nh), 1.0), zeros((Nbranches, Nlayers, Nh * 3))], 2), (1, 1, Ndirs))) self.addParam("Bout", zeros((Nbranches, Ny))) self.addParam("h0", zeros((Nbranches, Nlayers, Ndirs, Nh))) if Ah == "lstm": self.addParam("c0", zeros((Nbranches, Nlayers, Ndirs, Nh))) # Compute total number of parameters self.nParams = sum(x.get_value().size for x in self.params) # Initialize gradient tensors when using momentum if momentum > 0: self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params] # Build computation graph input = T.ftensor3() mask = T.imatrix() mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()] mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()] # mask_float = [T.cast((mask & 1).dimshuffle((1, 
0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), # T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] def step_rnn(x_t, mask, h_tm1, W, h0): h_tm1 = T.switch(mask, h0, h_tm1) return [ACTIVATION[Ah](x_t + h_tm1.dot(W))] def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0): c_tm1 = T.switch(mask, c0, c_tm1) h_tm1 = T.switch(mask, h0, h_tm1) a = x_t + h_tm1.dot(W) f_t = T.nnet.sigmoid(a[:, :Nh]) i_t = T.nnet.sigmoid(a[:, Nh : Nh * 2]) o_t = T.nnet.sigmoid(a[:, Nh * 2 : Nh * 3]) c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t h_t = T.tanh(c_t) * o_t return [c_t, h_t] x = input if frontEnd is None else frontEnd(input) outputs = [] for k in range(Nbranches): for i in range(Nlayers): h = (x.dimshuffle((1, 0, 2)).dot(self.Win[k]) if i == 0 else h.dot(self.Wup[k, i-1])) + self.Bhid[k, i] rep = lambda x: T.extra_ops.repeat(x.reshape((1, -1)), h.shape[1], axis = 0) if Ah != "lstm": h = T.concatenate([theano.scan( fn = step_rnn, sequences = [h[:, :, Nh * d : Nh * (d+1)], mask_float[d]], outputs_info = [rep(self.h0[k, i, d])], non_sequences = [self.Wrec[k, i, d], rep(self.h0[k, i, d])], go_backwards = (d == 1), )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2) else: h = T.concatenate([theano.scan( fn = step_lstm, sequences = [h[:, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]], outputs_info = [rep(self.c0[k, i, d]), rep(self.h0[k, i, d])], non_sequences = [self.Wrec[k, i, d], rep(self.c0[k, i, d]), rep(self.h0[k, i, d])], go_backwards = (d == 1), )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2) h = h.dimshuffle((1, 0, 2)) if predictPer == "sequence": h = T.concatenate([h[mask_int[1 - d]][:, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 1) outputs.append(ACTIVATION[Ay](h.dot(self.Wout[k]) + self.Bout[k])) output = T.stack(*outputs) # Deprecated in Theano 0.8 but accepted in Theano 0.7 output_mean = output.mean(axis = 0) output_var = output.var(axis = 0) # Compute loss function if loss is None: loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay] if loss == "ctc": label = T.imatrix() label_time = T.imatrix() tol = T.iscalar() cost = ctc_cost(output_mean, mask, label, label_time, tol) else: if predictPer == "sequence": label = T.fmatrix() y = output_mean t = label elif predictPer == "frame": label = T.ftensor3() indices = (mask >= 0).nonzero() y = output_mean[indices] t = label[indices] cost = T.mean({ "ce": -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1), "ce_group": -T.log((y * t).sum(axis = 1)), "mse": T.mean((y - t) ** 2, axis = 1), "hinge": T.mean(relu(1 - y * (t * 2 - 1)), axis = 1), "squared_hinge": T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1), }[loss]) # Add regularization cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg if predictPer == "sequence": cost += output_var.mean() * multiReg else: indices = (mask >= 0).nonzero() cost += output_var[indices].mean() * multiReg # Compute updates for network parameters updates = [] lrate = T.fscalar() clip = T.fscalar() grad = T.grad(cost, self.params) grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad] if momentum > 0: for w, d, g in zip(self.params, self.dparams, grad_clipped): updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate * g)) updates.append((d, momentum * d - lrate * g)) else: for w, g in zip(self.params, grad_clipped): updates.append((w, w - lrate * g)) # Create functions 
to be called from outside if loss == "ctc": inputs = [input, mask, label, label_time, tol, lrate, clip] else: inputs = [input, mask, label, lrate, clip] self.train = theano.function( inputs = inputs, outputs = cost, updates = updates, ) self.predict = theano.function(inputs = [input, mask], outputs = output)
def UnitTest_OnestepAttend(): N = 2 #number of sample D = 5 #dimension of input H = 4 #dimension of hidden T_new = 1 #length of per each sample context_dim = 3 K = 5 x = np.linspace(-0.4, 0.6, num=N*T_new*D, dtype = theano.config.floatX).reshape(T_new, N, D) h0= np.linspace(-0.4, 0.8, num=N*H, dtype = theano.config.floatX).reshape(N, H) Wx= np.linspace(-0.2, 0.9, num=4*D*H, dtype = theano.config.floatX).reshape(D, 4*H) Wh= np.linspace(-0.3,0.6, num =4*H*H, dtype = theano.config.floatX).reshape(H,4*H) b = np.linspace(0.0, 0.0, num = 4*H, dtype = theano.config.floatX) Wz= np.linspace(-0.3, 0.6, num=4*H*context_dim, dtype = theano.config.floatX).reshape(context_dim, 4*H) Hcontext = np.linspace(-0.2, 0.6, num=H*K, dtype = theano.config.floatX).reshape(H, K) Zcontext = np.linspace(-0.2, 0.5, num=context_dim*K, dtype= theano.config.floatX).reshape(context_dim, K) Va= np.linspace(0.1, 0.4, num=K, dtype = theano.config.floatX) Va_reshape = Va.reshape(K,1) image_feature_3D = np.linspace(-0.2, 0.5, num=10*N*context_dim, dtype = theano.config.floatX).reshape(N,10, context_dim) h0_theano = h0.reshape(1, N, H) # h0_symb = theano.tensor.ftensor3("h_symb") # lstm_theano_layer.h_m1.set_value(h0_theano) c0_theano = np.zeros((1, N, H), dtype = theano.config.floatX) # c0_symb = theano.tensor.ftensor3("c_symb") # lstm_theano_layer.c_m1.set_value(c0_theano) z0_theano = np.zeros((1, N, context_dim), dtype = theano.config.floatX) x_theano = x.reshape(T_new, N, D, 1) image_feature_input = image_feature_3D weight_y_in_value = np.zeros(( 10, context_dim) , dtype= theano.config.floatX) b_theano= b.reshape(1, 1, 4*H) pdb.set_trace() #symbolic variables initial_h0_layer_out = theano.tensor.tensor3(name = 'h0_initial', dtype = theano.config.floatX) initial_c0_layer_out = theano.tensor.tensor3(name = 'c0_initial', dtype = theano.config.floatX) initial_z0 = T.tensor3(name= 'z0_initial', dtype = theano.config.floatX) weight_y_in = theano.tensor.fmatrix("weight_y") input_data = theano.tensor.tensor3(name ='x', dtype=theano.config.floatX) image_feature_region = theano.tensor.tensor3(name = 'feature_region', dtype = theano.config.floatX) Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym = T.fmatrices(12) Zcontext_sym, Hcontext_sym = T.fmatrices(2) bi = T.ftensor3("bi") bf = T.ftensor3("bf") bc = T.ftensor3("bc") bo = T.ftensor3("bo") Va_sym = T.fcol("Va") out_sym = onestep_attend_tell(input_data, initial_h0_layer_out, initial_c0_layer_out, initial_z0, Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym, Zcontext_sym, Hcontext_sym, Va_sym, bi, bf, bc, bo, image_feature_region, weight_y_in) onestep_func = theano.function([input_data, initial_h0_layer_out, initial_c0_layer_out, initial_z0, Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym, Zcontext_sym, Hcontext_sym, Va_sym, bi, bf, bc, bo, image_feature_region, weight_y_in], out_sym) list_output = onestep_func(x, h0_theano, c0_theano, z0_theano, Wx[:, :H], Wx[:, H:2*H], Wx[:, 2*H:3*H], Wx[:, 3*H:], Wh[:, :H], Wh[:, H:2*H], Wh[:, 2*H:3*H], Wh[:, 3*H:], Wz[:, :H], Wz[:, H:2*H], Wz[:, 2*H:3*H], Wz[:, 3*H:], Zcontext,Hcontext, Va_reshape, b_theano[:,: , :H], b_theano[:, :, H:2*H], b_theano[:, :, 2*H:3*H], b_theano[:, :, 3*H:], image_feature_input, weight_y_in_value) pdb.set_trace() print(list_output[0].shape) print(list_output[1].shape) print(list_output[2].shape) pdb.set_trace()
def __init__(self, input_size=2, inner_size=3, output_size=None, batch_size=10, lr=0.01, gamma=0.9): self.bsz = batch_size # Forget gate matrix self.W_f = init_sh_param(shape=(inner_size, inner_size), name='W_f') self.U_f = init_sh_param(shape=(inner_size, input_size), name='U_f') self.b_f = init_sh_param(shape=inner_size, name='b_f') # Insert gate matrix self.W_i = init_sh_param(shape=(inner_size, inner_size), name='W_i') self.U_i = init_sh_param(shape=(inner_size, input_size), name='U_i') self.b_i = init_sh_param(shape=inner_size, name='b_i') # Cell gate matrix self.W_c = init_sh_param(shape=(inner_size, inner_size), name='W_c') self.U_c = init_sh_param(shape=(inner_size, input_size), name='U_c') self.b_c = init_sh_param(shape=inner_size, name='b_c') # Output gate matrix self.W_o = init_sh_param(shape=(inner_size, inner_size), name='W_o') self.U_o = init_sh_param(shape=(inner_size, input_size), name='U_o') self.b_o = init_sh_param(shape=inner_size, name='b_o') # bundle self.params = [ self.W_f, self.U_f, self.b_f, self.W_i, self.U_i, self.b_i, self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o ] self.names = [ 'W_f', 'U_f', 'b_f', 'W_i', 'U_i', 'b_i', 'W_c', 'U_c', 'b_c', 'W_o', 'U_o', 'b_o' ] # Softmax layer if output_size != None: self.S = init_sh_param((output_size, inner_size), name='S_softmax') self.b_s = init_sh_param(output_size, name='b_s') self.params.append(self.S) self.params.append(self.b_s) self.names.append('S_softmax_data') self.names.append('b_s_data') # RMSProp data self.params_data = [] for elem, name in zip(self.params, self.names): self.params_data.append( init_sh_zero(elem.get_value().shape, name=name + '_data')) def step(x_t, h_t_1, C_t_1): f_t = T.dot(self.W_f, h_t_1) + T.dot(self.U_f, x_t) f_t = sigm(f_t.T + self.b_f).T i_t = T.dot(self.W_i, h_t_1) + T.dot(self.U_i, x_t) i_t = sigm(i_t.T + self.b_i).T o_t = T.dot(self.W_o, h_t_1) + T.dot(self.U_o, x_t) o_t = sigm(o_t.T + self.b_o).T C_t_c = T.dot(self.W_c, h_t_1) + T.dot(self.U_c, x_t) C_t_c = tanh(C_t_c.T + self.b_c).T C_t = f_t * C_t_1 + i_t * C_t_c h_t = o_t * T.tanh(C_t) return h_t, C_t x = T.ftensor3(name='x_input') y = T.fmatrix(name='y_input') (h_t, _), _ = theano.scan(fn=step, sequences=x, outputs_info=[ T.zeros(shape=(inner_size, batch_size), dtype='float32'), T.zeros(shape=(inner_size, batch_size), dtype='float32') ]) h_last = h_t[-1] if output_size == None: E = T.sum((h_last - y)**2) else: j = T.nnet.softmax(T.dot(self.S, h_last).T + self.b_s).T E = T.sum((j - y)**2) gradients = T.grad(E, self.params) updates = [] for param, grad, param_data in zip(self.params, gradients, self.params_data): r_t = (1 - gamma) * (grad**2) + gamma * param_data v_t_1 = lr * grad / T.sqrt(r_t) updates.append((param, param - v_t_1)) updates.append((param_data, r_t)) self.train = theano.function(inputs=[x, y], outputs=E, updates=OrderedDict(updates)) t = T.zeros(shape=(inner_size, 1), dtype='float32') t = T.unbroadcast(t, 1) (h_t_2, _), _ = theano.scan(fn=step, sequences=x, outputs_info=[t, t]) j_test = h_t_2[-1] if output_size != None: j_test = T.nnet.softmax(T.dot(self.S, j_test).T + self.b_s).T self.test = theano.function(inputs=[x], outputs=j_test)
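# A minimal NumPy sketch of the RMSProp-style update used by the LSTM class
# above (an illustration, not part of the original code): `cache` plays the
# role of params_data and, as in the original, no epsilon is added before the
# square root.
import numpy as np

def rmsprop_step(param, grad, cache, lr=0.01, gamma=0.9):
    cache = (1.0 - gamma) * grad ** 2 + gamma * cache   # running average of squared gradients
    param = param - lr * grad / np.sqrt(cache)          # scale the step by the RMS of past gradients
    return param, cache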
def build(word_embeddings, len_voc, word_emb_dim, args, freeze=False): # input theano vars posts = T.imatrix() post_masks = T.fmatrix() ques_list = T.itensor3() ques_masks_list = T.ftensor3() ans_list = T.itensor3() ans_masks_list = T.ftensor3() labels = T.imatrix() N = args.no_of_candidates post_out, post_lstm_params = build_lstm(posts, post_masks, args.post_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) ques_out, ques_emb_out, ques_lstm_params = build_list_lstm(ques_list, ques_masks_list, N, args.ques_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) ans_out, ans_emb_out, ans_lstm_params = build_list_lstm(ans_list, ans_masks_list, N, args.ans_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) pqa_preds = [None] * (N * N) post_ques_ans = T.concatenate([post_out, ques_out[0], ans_out[0]], axis=1) l_post_ques_ans_in = lasagne.layers.InputLayer(shape=(args.batch_size, 3 * args.hidden_dim), input_var=post_ques_ans) l_post_ques_ans_denses = [None] * DEPTH for k in range(DEPTH): if k == 0: l_post_ques_ans_denses[k] = lasagne.layers.DenseLayer(l_post_ques_ans_in, num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify) else: l_post_ques_ans_denses[k] = lasagne.layers.DenseLayer(l_post_ques_ans_denses[k-1], num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify) l_post_ques_ans_dense = lasagne.layers.DenseLayer(l_post_ques_ans_denses[-1], num_units=1,\ nonlinearity=lasagne.nonlinearities.sigmoid) pqa_preds[0] = lasagne.layers.get_output(l_post_ques_ans_dense) loss = 0.0 for i in range(N): for j in range(N): if i == 0 and j == 0: continue post_ques_ans = T.concatenate([post_out, ques_out[i], ans_out[j]], axis=1) l_post_ques_ans_in_ = lasagne.layers.InputLayer( shape=(args.batch_size, 3 * args.hidden_dim), input_var=post_ques_ans) for k in range(DEPTH): if k == 0: l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_in_, num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify,\ W=l_post_ques_ans_denses[k].W,\ b=l_post_ques_ans_denses[k].b) else: l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify,\ W=l_post_ques_ans_denses[k].W,\ b=l_post_ques_ans_denses[k].b) l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=1,\ nonlinearity=lasagne.nonlinearities.sigmoid) pqa_preds[i * N + j] = lasagne.layers.get_output(l_post_ques_ans_dense_) loss += T.mean( lasagne.objectives.binary_crossentropy(pqa_preds[i * N + i], labels[:, i])) squared_errors = [None] * (N * N) for i in range(N): for j in range(N): squared_errors[i * N + j] = lasagne.objectives.squared_error( ans_out[i], ans_out[j]) post_ques_ans_dense_params = lasagne.layers.get_all_params( l_post_ques_ans_dense, trainable=True) all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_ans_dense_params print 'Params in concat ', lasagne.layers.count_params( l_post_ques_ans_dense) loss += args.rho * sum(T.sum(l**2) for l in all_params) updates = lasagne.updates.adam(loss, all_params, learning_rate=args.learning_rate) train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ [loss] + pqa_preds + squared_errors, updates=updates) test_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ [loss] + pqa_preds + squared_errors,) return train_fn, 
test_fn
def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005, \ learning_rate_rl=0.005, batch_size=32, ment=0.1): # 2-layer MLP self.in_size = in_size # x and y coordinate self.out_size = out_size # up, down, right, left self.batch_size = batch_size self.learning_rate = learning_rate_rl self.n_hid = n_hid input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \ T.itensor3('am'), T.fvector('r') in_var = T.reshape( input_var, (input_var.shape[0] * input_var.shape[1], self.in_size)) l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask) pol_in = T.fmatrix('pol-h') l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var) l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in, mask_input=l_mask_in) # B x H x D pol_out = L.get_output(l_pol_rnn)[:, -1, :] l_den_in = L.ReshapeLayer( l_pol_rnn, (turn_mask.shape[0] * turn_mask.shape[1], n_hid)) # BH x D l_out = L.DenseLayer(l_den_in, self.out_size, nonlinearity=lasagne.nonlinearities.softmax) self.network = l_out self.params = L.get_all_params(self.network) # rl probs = L.get_output(self.network) # BH x A out_probs = T.reshape(probs, (input_var.shape[0], input_var.shape[1], self.out_size)) # B x H x A log_probs = T.log(out_probs) act_probs = (log_probs * act_mask).sum(axis=2) # B x H ep_probs = (act_probs * turn_mask).sum(axis=1) # B H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1) # B self.loss = 0. - T.mean(ep_probs * reward_var + ment * H_probs) updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=learning_rate_rl, \ epsilon=1e-4) self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in] self.train_fn = theano.function(self.inps, self.loss, updates=updates) self.obj_fn = theano.function(self.inps, self.loss) self.act_fn = theano.function([input_var, turn_mask, pol_in], [out_probs, pol_out]) # sl sl_loss = 0. - T.mean(ep_probs) sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, learning_rate=learning_rate_sl, \ epsilon=1e-4) self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss, \ updates=sl_updates) self.sl_obj_fn = theano.function( [input_var, turn_mask, act_mask, pol_in], sl_loss)
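# A small NumPy sketch of the REINFORCE-style objective with an entropy bonus
# built in _init_model above (an illustration with assumed shapes: probs is
# (batch, turns, actions), act_mask is one-hot over actions, turn_mask marks
# valid turns, and ment weights the entropy term).
import numpy as np

def pg_entropy_loss(probs, act_mask, turn_mask, rewards, ment=0.1):
    log_probs = np.log(probs)
    act_logp = (log_probs * act_mask).sum(axis=2)            # log-prob of the taken action per turn
    ep_logp = (act_logp * turn_mask).sum(axis=1)             # episode log-likelihood
    entropy = -(probs * log_probs).sum(axis=2).sum(axis=1)   # policy entropy summed over turns
    return -np.mean(ep_logp * rewards + ment * entropy)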
import theano
from theano import tensor as T
from theano import function
import numpy as np

a = np.array([[[1, 2, 3], [3, 4, 5]], [[7, 8, 9], [45, 345, 12]]])
x0 = T.ftensor3()


def create_atom_context(atom_vector):
    # type_vector = T.fvector()
    types_array = atom_vector[0]
    dists = atom_vector[1]
    w = [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5]]
    # outputs_info = T.as_tensor_variable(np.asarray(0, dtype=np.float32))
    # types, updates = theano.scan(fn=lambda atm_type: atm_type,
    #                              outputs_info=None,
    #                              sequences=type_vector)
    # mult = type_vector*2
    # f = function(inputs=[type_vector], outputs=mult)
    # print(f([1, 2, 3]))
    # f = function(inputs=[type_vector], outputs=types)
    # return T.concatenate([types_array], [dists])
    print(len(types_array))
    ls = []
    for tp in types_array:
        ls.append(tp)
    return ls
def build_fn(args, embeddings): """ Build training and testing functions. """ if args.para_shared_model is not None: dic = utils.load_params(args.para_shared_model) params_shared = dic['params'] params_name = [ 'W', 'o_layer1.W_in_to_updategate', 'o_layer1.W_hid_to_updategate', 'o_layer1.b_updategate', 'o_layer1.W_in_to_resetgate', 'o_layer1.W_hid_to_resetgate', 'o_layer1.b_resetgate', 'o_layer1.W_in_to_hidden_update', 'o_layer1.W_hid_to_hidden_update', 'o_layer1.b_hidden_update', 'o_layer1.hid_init', 'o_back_layer1.W_in_to_updategate', 'o_back_layer1.W_hid_to_updategate', 'o_back_layer1.b_updategate', 'o_back_layer1.W_in_to_resetgate', 'o_back_layer1.W_hid_to_resetgate', 'o_back_layer1.b_resetgate', 'o_back_layer1.W_in_to_hidden_update', 'o_back_layer1.W_hid_to_hidden_update', 'o_back_layer1.b_hidden_update', 'o_back_layer1.hid_init', 'd_layer1.W_in_to_updategate', 'd_layer1.W_hid_to_updategate', 'd_layer1.b_updategate', 'd_layer1.W_in_to_resetgate', 'd_layer1.W_hid_to_resetgate', 'd_layer1.b_resetgate', 'd_layer1.W_in_to_hidden_update', 'd_layer1.W_hid_to_hidden_update', 'd_layer1.b_hidden_update', 'd_layer1.hid_init', 'd_back_layer1.W_in_to_updategate', 'd_back_layer1.W_hid_to_updategate', 'd_back_layer1.b_updategate', 'd_back_layer1.W_in_to_resetgate', 'd_back_layer1.W_hid_to_resetgate', 'd_back_layer1.b_resetgate', 'd_back_layer1.W_in_to_hidden_update', 'd_back_layer1.W_hid_to_hidden_update', 'd_back_layer1.b_hidden_update', 'd_back_layer1.hid_init', 'q_layer1.W_in_to_updategate', 'q_layer1.W_hid_to_updategate', 'q_layer1.b_updategate', 'q_layer1.W_in_to_resetgate', 'q_layer1.W_hid_to_resetgate', 'q_layer1.b_resetgate', 'q_layer1.W_in_to_hidden_update', 'q_layer1.W_hid_to_hidden_update', 'q_layer1.b_hidden_update', 'q_layer1.hid_init', 'q_back_layer1.W_in_to_updategate', 'q_back_layer1.W_hid_to_updategate', 'q_back_layer1.b_updategate', 'q_back_layer1.W_in_to_resetgate', 'q_back_layer1.W_hid_to_resetgate', 'q_back_layer1.b_resetgate', 'q_back_layer1.W_in_to_hidden_update', 'q_back_layer1.W_hid_to_hidden_update', 'q_back_layer1.b_hidden_update', 'q_back_layer1.hid_init', 'W_bilinear', 'W_bilinear' ] in_x1 = T.imatrix('x1') in_x3 = T.imatrix('x3') in_mask1 = T.matrix('mask1') in_mask3 = T.matrix('mask3') in_y = T.ivector('y') #batch x word_num x mea_num in_x4 = T.ftensor3('x4') l_in1 = lasagne.layers.InputLayer((None, None), in_x1) l_mask1 = lasagne.layers.InputLayer((None, None), in_mask1) Embed_W = params_shared[params_name.index('W')] l_emb1 = lasagne.layers.EmbeddingLayer(l_in1, args.vocab_size, args.embedding_size, W=Embed_W) l_in3 = lasagne.layers.InputLayer((None, None), in_x3) l_mask3 = lasagne.layers.InputLayer((None, None), in_mask3) l_emb3 = lasagne.layers.EmbeddingLayer(l_in3, args.vocab_size, args.embedding_size, W=l_emb1.W) # x4 is the human attention l_in4 = lasagne.layers.InputLayer((None, None, args.mea_num), in_x4) if not args.tune_embedding: l_emb1.params[l_emb1.W].remove('trainable') l_emb3.params[l_emb3.W].remove('trainable') args.rnn_output_size = args.hidden_size * 2 if args.bidir else args.hidden_size assert args.model is None network1 = nn_layers.stack_rnn(l_emb1, l_mask1, args.num_layers, args.hidden_size, grad_clipping=args.grad_clipping, dropout_rate=args.dropout_rate, only_return_final=(args.att_func == 'last'), bidir=args.bidir, name='d', rnn_layer=args.rnn_layer) #weighted mean: passage embedding # weight_mlp_np = np.array([[1.]]) # b_mlp = np.array([0.]) # l_weight = lasagne.layers.DenseLayer(l_in4, 1, num_leading_axes=-1, # name='w_dense', 
W=weight_mlp_np, b=b_mlp) # pass a Linear layer and get human ATT l_weight: batch x word_num x 1 activation -- sigmoid l_weight = lasagne.layers.DenseLayer(l_in4, 1, num_leading_axes=-1, nonlinearity=nonlinearities.sigmoid, name='w_dense') att = nn_layers.WeightedAverageLayer([network1, l_weight, l_mask1], name='w_aver') if RAW: att = nn_layers.WeightedAverageLayer([network1, l_in4, l_mask1], name='w_aver') if SAG: # network1 1x1 conv # l_in4 1x1 conv pass #options network3 = nn_layers.stack_rnn(l_emb3, l_mask3, args.num_layers, args.hidden_size, grad_clipping=args.grad_clipping, dropout_rate=args.dropout_rate, only_return_final=True, bidir=args.bidir, name='o', rnn_layer=args.rnn_layer) network3 = lasagne.layers.ReshapeLayer( network3, (in_x1.shape[0], 4, args.rnn_output_size)) #answer network = nn_layers.BilinearDotLayer([network3, att], args.rnn_output_size) # if not args.tune_embedding: # network.params[network.W].remove('trainable') #parameter sharing params_initial = lasagne.layers.get_all_params(network) params_set = [] for params_initial_tmp in params_initial: if str(params_initial_tmp) in ['w_dense.W', 'w_dense.b']: params_set = params_set + [params_initial_tmp.get_value()] elif str(params_initial_tmp) == 'W_bilinear': params_set = params_set + [params_shared[-1]] else: params_set = params_set + [ params_shared[params_name.index(str(params_initial_tmp))] ] lasagne.layers.set_all_param_values(network, params_set) if args.pre_trained is not None: dic = utils.load_params(args.pre_trained) lasagne.layers.set_all_param_values(network, dic['params']) del dic['params'] logging.info('Loaded pre-trained model: %s' % args.pre_trained) for dic_param in dic.iteritems(): logging.info(dic_param) logging.info('#params: %d' % lasagne.layers.count_params(network, trainable=True)) logging.info('#fixed params: %d' % lasagne.layers.count_params(network, trainable=False)) for layer in lasagne.layers.get_all_layers(network): logging.info(layer) # Test functions test_prob = lasagne.layers.get_output(network, deterministic=True) test_prediction = T.argmax(test_prob, axis=-1) acc = T.sum(T.eq(test_prediction, in_y)) test_fn = theano.function([in_x1, in_mask1, in_x3, in_mask3, in_y, in_x4], [acc, test_prediction], on_unused_input='warn') # Train functions train_prediction = lasagne.layers.get_output(network) train_prediction = T.clip(train_prediction, 1e-7, 1.0 - 1e-7) loss = lasagne.objectives.categorical_crossentropy(train_prediction, in_y).mean() # TODO: lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2) # params = lasagne.layers.get_all_params(network)#, trainable=True) params_init = lasagne.layers.get_all_params(network, trainable=True) params = lasagne.layers.get_all_params(network, trainable=True) if not (args.tune_sar): for params_tmp in params_init: if not (str(params_tmp) in ['w_dense.W', 'w_dense.b']): print(params_tmp) params.remove(params_tmp) print(len(params)) print(params) else: print(params_tmp) # params.remove(params_tmp) all_params = lasagne.layers.get_all_params(network) if args.optimizer == 'sgd': updates = lasagne.updates.sgd(loss, params, args.learning_rate) elif args.optimizer == 'adam': updates = lasagne.updates.adam(loss, params, learning_rate=args.learning_rate) elif args.optimizer == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, learning_rate=args.learning_rate) else: raise NotImplementedError('optimizer = %s' % args.optimizer) train_fn = theano.function([in_x1, in_mask1, in_x3, in_mask3, in_y, in_x4], loss, updates=updates, 
on_unused_input='warn') return train_fn, test_fn, params, all_params
def train( batch_size=64, n_epochs=25, ): #1 denotes positive rule,0 is negative rules = [["sweatshirts", "activewear pants", 1], ["cashmere", "leather", 1], ["tank tops", "shorts", 1]] extract_rule.extract(rules) rule_num = len(rules) # parameters of text non_static = False filter_hs = [2, 3, 4, 5] hidden_units = [100, 2] conv_non_linear = "relu" img_w = 300 print "loading w2v data...", x = cPickle.load(open("./cloth.binary.p", "rb")) revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4] print "data loaded!" if non_static == True: print "model architecture: CNN-non-static" print "using: random vectors" U = W2 elif non_static == False: print "model architecture: CNN-static" print "using: word2vec vectors, dim=%d" % W.shape[1] U = W # make text data datasets = make_idx_data(revs, word_idx_map, max_l=55, k=300, filter_h=filter_hs[-1]) train_text_i, train_text_j, train_text_k = datasets[0], datasets[1], datasets[2] valid_text_i, valid_text_j, valid_text_k = datasets[3], datasets[4], datasets[5] test_text_i, test_text_j, test_text_k = datasets[6], datasets[7], datasets[8] # load visual data print 'loading visual data' print('now():' + str(datetime.now())) with open("./data_mm/AUC_new_dataset_train_811_norm.pkl", "rb") as f: train_set = np.asarray(cPickle.load(f), dtype='float32') with open("./data_mm/AUC_new_dataset_valid_811_norm.pkl", "rb") as f: valid_set = np.asarray(cPickle.load(f), dtype='float32') with open("./data_mm/AUC_new_dataset_test_811_norm.pkl", "rb") as f: test_set = np.asarray(cPickle.load(f), dtype='float32') print 'visual data loaded' print('now():' + str(datetime.now())) print 'loading rule ind' print 'loading train ind' with open("./rule_ind/train_rules_ind.pkl", "rb") as f: train_rules_ind = np.asarray(cPickle.load(f), dtype='float32') print 'loading valid ind' with open("./rule_ind/valid_rules_ind.pkl", "rb") as f: valid_rules_ind = np.asarray(cPickle.load(f), dtype='float32') print 'loading test ind' with open("./rule_ind/test_rules_ind.pkl", "rb") as f: test_rules_ind = np.asarray(cPickle.load(f), dtype='float32') print 'rules ind loaded' train_set_size = train_set[0].shape[0] valid_set_size = valid_set[0].shape[0] test_set_size = test_set[0].shape[0] train_set_i, train_set_j, train_set_k = train_set[0], train_set[1], train_set[2] valid_set_i, valid_set_j, valid_set_k = valid_set[0], valid_set[1], valid_set[2] test_set_i, test_set_j, test_set_k = test_set[0], test_set[1], test_set[2] train_rules_ind = train_rules_ind[0] valid_rules_ind = valid_rules_ind[0] test_rules_ind = test_rules_ind[0] np.random.seed(3435) # training data if train_set_size % batch_size > 0: extra_data_num = batch_size - train_set_size % batch_size ''' permutation_order = np.random.permutation(train_set_size) train_set_i = train_set_i[permutation_order] train_set_j = train_set_j[permutation_order] train_set_k = train_set_k[permutation_order] train_text_i = train_text_i[permutation_order] train_text_j = train_text_j[permutation_order] train_text_k = train_text_k[permutation_order] ''' extra_data_i = train_set_i[:extra_data_num] extra_data_j = train_set_j[:extra_data_num] extra_data_k = train_set_k[:extra_data_num] extra_text_i = train_text_i[:extra_data_num] extra_text_j = train_text_j[:extra_data_num] extra_text_k = train_text_k[:extra_data_num] train_set_i = np.append(train_set_i, extra_data_i, axis=0) train_set_j = np.append(train_set_j, extra_data_j, axis=0) train_set_k = np.append(train_set_k, extra_data_k, axis=0) train_text_i = np.append(train_text_i, extra_text_i, axis=0) 
train_text_j = np.append(train_text_j, extra_text_j, axis=0) train_text_k = np.append(train_text_k, extra_text_k, axis=0) new_train_rules_ind = np.zeros( (len(train_rules_ind), len(train_rules_ind[0]) + extra_data_num, len(train_rules_ind[0][0]))) for i in range(len(train_rules_ind)): #train_rules_ind[i] = train_rules_ind[i][permutation_order] extra_rules_ind_i = train_rules_ind[i][:extra_data_num] train_rules_ind_i = np.append(train_rules_ind[i], extra_rules_ind_i, axis=0) new_train_rules_ind[i] = train_rules_ind_i #print(len(new_train_rules_ind[0])) train_rules_ind = new_train_rules_ind train_set_size = train_set_i.shape[0] train_set_i = shared_dataset_x(train_set_i) train_set_j = shared_dataset_x(train_set_j) train_set_k = shared_dataset_x(train_set_k) train_text_i = shared_dataset_x(train_text_i) train_text_j = shared_dataset_x(train_text_j) train_text_k = shared_dataset_x(train_text_k) train_rules_ind = theano.shared(np.asarray(train_rules_ind,dtype=theano.config.floatX),borrow=True) # valid data if valid_set_size % batch_size > 0: extra_data_num = batch_size - valid_set_size % batch_size ''' permutation_order = np.random.permutation(valid_set_size) valid_set_i = valid_set_i[permutation_order] valid_set_j = valid_set_j[permutation_order] valid_set_k = valid_set_k[permutation_order] valid_text_i = valid_text_i[permutation_order] valid_text_j = valid_text_j[permutation_order] valid_text_k = valid_text_k[permutation_order] ''' extra_data_i = valid_set_i[:extra_data_num] extra_data_j = valid_set_j[:extra_data_num] extra_data_k = valid_set_k[:extra_data_num] extra_text_i = valid_text_i[:extra_data_num] extra_text_j = valid_text_j[:extra_data_num] extra_text_k = valid_text_k[:extra_data_num] valid_set_i = np.append(valid_set_i, extra_data_i, axis=0) valid_set_j = np.append(valid_set_j, extra_data_j, axis=0) valid_set_k = np.append(valid_set_k, extra_data_k, axis=0) valid_text_i = np.append(valid_text_i, extra_text_i, axis=0) valid_text_j = np.append(valid_text_j, extra_text_j, axis=0) valid_text_k = np.append(valid_text_k, extra_text_k, axis=0) new_valid_rules_ind = np.zeros( (len(valid_rules_ind), len(valid_rules_ind[0]) + extra_data_num, len(valid_rules_ind[0][0]))) for i in range(len(valid_rules_ind)): # valid_rules_ind[i] = valid_rules_ind[i][permutation_order] extra_rules_ind_i = valid_rules_ind[i][:extra_data_num] valid_rules_ind_i = np.append(valid_rules_ind[i], extra_rules_ind_i, axis=0) new_valid_rules_ind[i] = valid_rules_ind_i # print(len(new_valid_rules_ind[0])) valid_rules_ind = new_valid_rules_ind valid_set_size = valid_set_i.shape[0] valid_set_i = shared_dataset_x(valid_set_i) valid_set_j = shared_dataset_x(valid_set_j) valid_set_k = shared_dataset_x(valid_set_k) valid_text_i = shared_dataset_x(valid_text_i) valid_text_j = shared_dataset_x(valid_text_j) valid_text_k = shared_dataset_x(valid_text_k) valid_rules_ind = theano.shared(np.asarray(valid_rules_ind,dtype=theano.config.floatX),borrow=True) # test data if test_set_size % batch_size > 0: extra_data_num = batch_size - test_set_size % batch_size ''' permutation_order = np.random.permutation(test_set_size) test_set_i = test_set_i[permutation_order] test_set_j = test_set_j[permutation_order] test_set_k = test_set_k[permutation_order] test_text_i = test_text_i[permutation_order] test_text_j = test_text_j[permutation_order] test_text_k = test_text_k[permutation_order] ''' extra_data_i = test_set_i[:extra_data_num] extra_data_j = test_set_j[:extra_data_num] extra_data_k = test_set_k[:extra_data_num] extra_text_i = 
test_text_i[:extra_data_num] extra_text_j = test_text_j[:extra_data_num] extra_text_k = test_text_k[:extra_data_num] test_set_i = np.append(test_set_i, extra_data_i, axis=0) test_set_j = np.append(test_set_j, extra_data_j, axis=0) test_set_k = np.append(test_set_k, extra_data_k, axis=0) test_text_i = np.append(test_text_i, extra_text_i, axis=0) test_text_j = np.append(test_text_j, extra_text_j, axis=0) test_text_k = np.append(test_text_k, extra_text_k, axis=0) new_test_rules_ind = np.zeros( (len(test_rules_ind), len(test_rules_ind[0]) + extra_data_num, len(test_rules_ind[0][0]))) for i in range(len(test_rules_ind)): # test_rules_ind[i] = test_rules_ind[i][permutation_order] extra_rules_ind_i = test_rules_ind[i][:extra_data_num] test_rules_ind_i = np.append(test_rules_ind[i], extra_rules_ind_i, axis=0) new_test_rules_ind[i] = test_rules_ind_i # print(len(new_test_rules_ind[0])) test_rules_ind = new_test_rules_ind test_set_size = test_set_i.shape[0] test_set_i = shared_dataset_x(test_set_i) test_set_j = shared_dataset_x(test_set_j) test_set_k = shared_dataset_x(test_set_k) test_text_i = shared_dataset_x(test_text_i) test_text_j = shared_dataset_x(test_text_j) test_text_k = shared_dataset_x(test_text_k) test_rules_ind = theano.shared(np.asarray(test_rules_ind,dtype=theano.config.floatX),borrow=True) print 'train size:%f , valid size:%f , test size:%f'%(train_set_size,valid_set_size,test_set_size) n_train_batches = train_set_size / batch_size n_valid_batches = valid_set_size / batch_size n_test_batches = test_set_size / batch_size iteration = 0 best_val_q_perf = 0.0 best_test_q_perf = 0.0 ret_test_q_perf = 0.0 ret_test_p_perf = 0.0 ret_iteration = 0 ret_dropout_rate = 0.0 ret_mu_param = 0.0 #_attention_hidden = 512 #_learning_rate = 0.05 for _learning_rate in [0.05]: for _mu_param in [[0.01, 0.1],[0.001,0.05]]: for _attention_hidden in [256,512]: # parameters of classifier n_hidden = 1024 n_in = 4096 n_out = n_hidden n2_in = 400 n2_out = n_hidden dropout_rate_v = 0.0 dropout_rate_t = 0.4 # parameters of logicnn pi_params = [0.95, 0] #pi = [1.0, 0] C=0 is train p only learning_rate = _learning_rate momentum = 0.9 C = 3.0 mu_param = _mu_param #weight of Sqr attention_hidden = _attention_hidden #hidden num of attention index = T.lscalar() input1 = T.matrix('input1') input2 = T.matrix('input2') input3 = T.matrix('input3') input1_t = T.matrix('input1_t') input2_t = T.matrix('input2_t') input3_t = T.matrix('input3_t') rules_ind = T.ftensor3('rules_ind') # convolution setup rng = np.random.RandomState(3435) img_h = len(datasets[0][0]) filter_w = img_w feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("conv_non_linear", conv_non_linear)] print parameters Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[(Words, T.set_subtensor(Words[0, :], zero_vec_tensor))], allow_input_downcast=True) layer0_input_i = Words[T.cast(input1_t.flatten(), dtype="int32")].reshape( (input1_t.shape[0], 1, input1_t.shape[1], Words.shape[1])) layer0_input_j = Words[T.cast(input2_t.flatten(), dtype="int32")].reshape( (input2_t.shape[0], 1, input2_t.shape[1], Words.shape[1])) layer0_input_k = Words[T.cast(input3_t.flatten(), dtype="int32")].reshape( 
(input3_t.shape[0], 1, input3_t.shape[1], Words.shape[1])) layer0_input = [layer0_input_i, layer0_input_j, layer0_input_k] # convolution conv_layers = [] layer1_inputs_i = [] layer1_inputs_j = [] layer1_inputs_k = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = matching_attention_classes.LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) layer1_input_i = conv_layer.output_i.flatten(2) layer1_input_j = conv_layer.output_j.flatten(2) layer1_input_k = conv_layer.output_k.flatten(2) conv_layers.append(conv_layer) layer1_inputs_i.append(layer1_input_i) layer1_inputs_j.append(layer1_input_j) layer1_inputs_k.append(layer1_input_k) layer1_input_i = T.concatenate(layer1_inputs_i, 1) layer1_input_j = T.concatenate(layer1_inputs_j, 1) layer1_input_k = T.concatenate(layer1_inputs_k, 1) network = matching_attention_classes.MLP(rng, input1=input1, input2=input2, input3=input3, input1_t=layer1_input_i, input2_t=layer1_input_j, input3_t=layer1_input_k, dropout_rate_v=dropout_rate_v, dropout_rate_t=dropout_rate_t, n_in=n_in, n_out=n_out, n2_in=n2_in, n2_out=n2_out ) rules = [] for i in range(rule_num): rules.append(matching_attention_classes.Rule(rules_ind[i])) new_pi = get_pi(cur_iter=0, params=pi_params) logic_nn = matching_attention_classes.LogicNN(input1=input1, input2=input2, input3=input3, network=network, rules=rules, rule_num=rule_num, n_hidden=n_hidden, attention_hidden=attention_hidden, C=C, pi=new_pi, mu_param=mu_param) # parameters to update params = logic_nn.params for conv_layer in conv_layers: params += conv_layer.params if non_static: params += [Words] cost = logic_nn.cost() dropout_cost = logic_nn.dropout_cost() # momentum gparams = T.grad(dropout_cost, params) updates = [] for p, g in zip(params, gparams): mparam_i = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX)) v = momentum * mparam_i - learning_rate * g updates.append((mparam_i, v)) updates.append((p, p + v)) train_model = theano.function([index], cost, updates=updates, givens={ input1: train_set_i[index * batch_size:(index + 1) * batch_size], input2: train_set_j[index * batch_size:(index + 1) * batch_size], input3: train_set_k[index * batch_size:(index + 1) * batch_size], input1_t: train_text_i[index * batch_size:(index + 1) * batch_size], input2_t: train_text_j[index * batch_size:(index + 1) * batch_size], input3_t: train_text_k[index * batch_size:(index + 1) * batch_size], rules_ind: train_rules_ind[:,index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True, on_unused_input='warn') train_test_model = theano.function([index], logic_nn.sup(), givens={ input1: train_set_i[index * batch_size:(index + 1) * batch_size], input2: train_set_j[index * batch_size:(index + 1) * batch_size], input3: train_set_k[index * batch_size:(index + 1) * batch_size], input1_t: train_text_i[index * batch_size:(index + 1) * batch_size], input2_t: train_text_j[index * batch_size:(index + 1) * batch_size], input3_t: train_text_k[index * batch_size:(index + 1) * batch_size], rules_ind: train_rules_ind[:,index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True, on_unused_input='warn') val_model = theano.function([index], logic_nn.sup(), givens={ input1: valid_set_i[index * batch_size:(index + 1) * batch_size], input2: valid_set_j[index * batch_size:(index + 1) * batch_size], input3: valid_set_k[index * batch_size:(index + 1) * 
batch_size], input1_t: valid_text_i[index * batch_size:(index + 1) * batch_size], input2_t: valid_text_j[index * batch_size:(index + 1) * batch_size], input3_t: valid_text_k[index * batch_size:(index + 1) * batch_size], rules_ind: valid_rules_ind[:,index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True, on_unused_input='warn') test_model = theano.function([index], logic_nn.sup(), givens={ input1: test_set_i[index * batch_size:(index + 1) * batch_size], input2: test_set_j[index * batch_size:(index + 1) * batch_size], input3: test_set_k[index * batch_size:(index + 1) * batch_size], input1_t: test_text_i[index * batch_size:(index + 1) * batch_size], input2_t: test_text_j[index * batch_size:(index + 1) * batch_size], input3_t: test_text_k[index * batch_size:(index + 1) * batch_size], rules_ind: test_rules_ind[:,index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True, on_unused_input='warn') test_mijk = theano.function([index], logic_nn.mijk(), givens={ input1: test_set_i[index * batch_size:(index + 1) * batch_size], input2: test_set_j[index * batch_size:(index + 1) * batch_size], input3: test_set_k[index * batch_size:(index + 1) * batch_size], input1_t: test_text_i[index * batch_size:(index + 1) * batch_size], input2_t: test_text_j[index * batch_size:(index + 1) * batch_size], input3_t: test_text_k[index * batch_size:(index + 1) * batch_size], rules_ind: test_rules_ind[:,index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True, on_unused_input='warn') print 'training...' fi = open('mm_attention_color_coatdress.txt', 'a+') epoch = 0 batch = 0 iteration += 1 best_val_p_iter = 0.0 best_test_q_iter = 0.0 print 'iteration: %i' % iteration fi.write('################iteration: %f\n' % iteration) fi.write('parameters: hidden%.5f, attention_hidden%f, lr: %.4f, mu: %.5f %.5f\n' % (n_hidden, attention_hidden, learning_rate,mu_param[0],mu_param[1])) fi.flush() while (epoch < n_epochs): start_time = time.time() epoch = epoch + 1 if epoch > 5: learning_rate = 0.02 cost = 0.0 L_sup = 0.0 L_p_q = 0.0 L_sqr = 0.0 # train for minibatch_index in xrange(n_train_batches): batch = batch + 1 new_pi = get_pi(cur_iter=batch * 1. 
/ n_train_batches, params=pi_params) logic_nn.set_pi(new_pi) set_zero(zero_vec) cost_batch = train_model(minibatch_index) cost += cost_batch[0] L_sup += cost_batch[1] L_p_q += cost_batch[2] L_sqr += cost_batch[3] print 'epoch: %i, cost: %.4f, L_sup: %.4f, L_p_q: %.4f, L_sqr: %.4f' % ( epoch, cost, L_sup, L_p_q, L_sqr) # training result train_sup = [train_test_model(i) for i in xrange(n_train_batches)] train_sup = np.array(train_sup) train_q_sup = train_sup[:, 0] train_p_sup = train_sup[:, 1] count_q = 0.0 count_p = 0.0 for i in range(train_q_sup.shape[0]): for j in range(train_q_sup.shape[1]): if train_q_sup[i, j, 0] > 0.5: count_q += 1 if train_p_sup[i, j, 0] > 0.5: count_p += 1 train_q_perf = count_q / (train_q_sup.shape[0] * train_q_sup.shape[1]) train_p_perf = count_p / (train_p_sup.shape[0] * train_p_sup.shape[1]) print('training time: %.2f secs; q_train perf: %.4f %% ,p_train perf: %.4f %% ' % \ (time.time() - start_time, train_q_perf * 100., train_p_perf * 100.)) # valid result valid_sup = [val_model(i) for i in xrange(n_valid_batches)] valid_sup = np.array(valid_sup) valid_q_sup = valid_sup[:, 0] valid_p_sup = valid_sup[:, 1] count_q = 0.0 count_p = 0.0 for i in range(valid_q_sup.shape[0]): for j in range(valid_q_sup.shape[1]): if valid_q_sup[i, j, 0] > 0.5: count_q += 1 if valid_p_sup[i, j, 0] > 0.5: count_p += 1 val_q_perf = count_q / (valid_q_sup.shape[0] * valid_q_sup.shape[1]) val_p_perf = count_p / (valid_p_sup.shape[0] * valid_p_sup.shape[1]) # testing result test_sup = [test_model(i) for i in xrange(n_test_batches)] test_sup = np.array(test_sup) test_q_sup = test_sup[:, 0] test_p_sup = test_sup[:, 1] count_q = 0.0 count_p = 0.0 for i in range(test_q_sup.shape[0]): for j in range(test_q_sup.shape[1]): if test_q_sup[i, j, 0] > 0.5: count_q += 1 if test_p_sup[i, j, 0] > 0.5: count_p += 1 test_q_perf = count_q / (test_q_sup.shape[0] * test_q_sup.shape[1]) test_p_perf = count_p / (test_p_sup.shape[0] * test_p_sup.shape[1]) print 'valid perf: q %.4f %%, p %.4f %%' % (val_q_perf * 100., val_p_perf * 100.) print 'test perf: q %.4f %%, p %.4f %%' % (test_q_perf * 100., test_p_perf * 100.) 
fi.write('%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n' % ( cost, L_sup, L_p_q, L_sqr, train_q_perf * 100., train_p_perf * 100., val_q_perf * 100., val_p_perf * 100., test_q_perf * 100., test_p_perf * 100.)) fi.flush() # select if test_q_perf > best_test_q_iter: best_test_q_iter = test_q_perf iter_test_q_perf = test_q_perf iter_test_p_perf = test_p_perf if test_q_perf > best_test_q_perf: best_test_q_perf = test_q_perf ret_test_q_perf = test_q_perf ret_test_p_perf = test_p_perf ret_iteration = iteration ret_dropout_rate = dropout_rate_v best_w1 = network.W1.get_value() best_w2 = network.W2.get_value() best_w1t = network.W1t.get_value() best_w2t = network.W2t.get_value() best_b1 = network.b1.get_value() best_b2 = network.b2.get_value() count = 0 for conv_layer in conv_layers: if count == 0: wc0 = conv_layer.W.get_value() bc0 = conv_layer.b.get_value() if count == 1: wc1 = conv_layer.W.get_value() bc1 = conv_layer.b.get_value() if count == 2: wc2 = conv_layer.W.get_value() bc2 = conv_layer.b.get_value() if count == 3: wc3 = conv_layer.W.get_value() bc3 = conv_layer.b.get_value() count += 1 mij = [] mik = [] qmij = [] qmik = [] rule_lambda = [] masks = [] raw_rule_lambda = [] for batch_index in xrange(n_test_batches): mijk_batch = test_mijk(batch_index) mij.append(np.array(mijk_batch[0])) mik.append(np.array(mijk_batch[1])) qmij.append(np.array(mijk_batch[2])) qmik.append(np.array(mijk_batch[3])) rule_lambda.append(np.array(mijk_batch[4])) masks.append(np.array(mijk_batch[5])) raw_rule_lambda.append(np.array(mijk_batch[6])) mij = np.array(mij) mik = np.array(mik) qmij = np.array(qmij) qmik = np.array(qmik) rule_lambda = np.array(rule_lambda,dtype="float32") masks = np.array(masks,dtype="float32") raw_rule_lambda = np.array(raw_rule_lambda,dtype="float32") print '###iteration: %i: test q perf: %.4f%%, test p perf: %.4f%%' % ( iteration, iter_test_q_perf * 100., iter_test_p_perf * 100.) fi.write('iteration###: %i, test q perf: %.4f %%\n, test p perf: %.4f %%\n' % ( iteration, iter_test_q_perf * 100., iter_test_p_perf * 100.)) fi.flush() fi.close() print '##best q perf: %.4f%%, p perf: %.4f%%' % (ret_test_q_perf * 100., ret_test_p_perf * 100.) print 'in iteration: %i, dropout_rate: %.4f' % ( ret_iteration, ret_dropout_rate) np.savetxt('./parameters/mij_Ar.csv', mij) np.savetxt('./parameters/mik_Ar.csv', mik) np.savetxt('./parameters/qmij_Ar.csv', qmij) np.savetxt('./parameters/qmik_Ar.csv', qmik) cPickle.dump(rule_lambda, open('./parameters/rule_lambda.pkl', "wb")) cPickle.dump(masks, open('./parameters/masks.pkl', "wb")) cPickle.dump(raw_rule_lambda, open('./parameters/raw_rule_lambda.pkl', "wb")) np.savetxt('./parameters/W1.csv', best_w1) np.savetxt('./parameters/W2.csv', best_w2) np.savetxt('./parameters/W1t.csv', best_w1t) np.savetxt('./parameters/W2t.csv', best_w2t) np.savetxt('./parameters/b1.csv', best_b1) np.savetxt('./parameters/b2.csv', best_b2) cPickle.dump(wc0, open("./parameters/Wc0.pkl", "wb")) cPickle.dump(wc1, open("./parameters/Wc1.pkl", "wb")) cPickle.dump(wc2, open("./parameters/Wc2.pkl", "wb")) cPickle.dump(wc3, open("./parameters/Wc3.pkl", "wb")) cPickle.dump(bc0, open("./parameters/bc0.pkl", "wb")) cPickle.dump(bc1, open("./parameters/bc1.pkl", "wb")) cPickle.dump(bc2, open("./parameters/bc2.pkl", "wb")) cPickle.dump(bc3, open("./parameters/bc3.pkl", "wb"))
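The three nearly identical blocks above pad each split so its size becomes a multiple of batch_size by re-appending the first few rows. A minimal NumPy sketch of that idiom; the helper name is hypothetical and not part of the original code.

import numpy as np

def pad_to_batch_multiple(arrays, batch_size):
    # Re-append the first `extra` rows of every array so the row count
    # becomes a multiple of batch_size, mirroring the train/valid/test
    # padding above. All arrays are assumed to have the same length.
    n = arrays[0].shape[0]
    extra = (-n) % batch_size  # 0 when n is already a multiple
    if extra == 0:
        return arrays
    return [np.append(a, a[:extra], axis=0) for a in arrays]

# tiny usage check
feats = np.arange(10).reshape(10, 1)
padded, = pad_to_batch_multiple([feats], batch_size=4)
assert padded.shape[0] == 12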
def build_model(options, tparams): """Build up the whole computation graph Input is the features extracted from googleNet. """ last_n = options['last_n'] actionNum = options['actions'] decay_c = options['decay_c'] use_dropout = options['use_dropout'] use_wta = options['use_wta'] location_dim = options['locations'] feature_dim = options['featureMaps'] trng = RandomStreams(1234) use_noise = theano.shared(np.float32(0.)) """combine model""" x = T.ftensor3('x') n_steps = x.shape[0] n_samples = x.shape[1] mask = T.fmatrix('mask') y = T.ftensor3('y') # one hot vector,n_steps*n_samples*actionNum _x = x.reshape([n_steps * n_samples, location_dim, feature_dim]) feature = _x.mean(1) # feature=feature/feature.max(1,keepdims=True); feature = feature.reshape([n_steps, n_samples, feature_dim]) feature = feature + use_noise * trng.normal( feature.shape, avg=1, std=0.05, dtype=feature.dtype) #noisy if use_dropout: feature = dropout_layer(feature, use_noise, trng) if use_wta: feature = WTA_Layer(feature, 4, 2, ndim=options['featureMaps']) f1 = ff_build(tparams, feature, prefix="recog", name='fullconn', active="tanh") if use_dropout: f1 = dropout_layer(f1, use_noise, trng) lin = ff_build(tparams, f1, prefix="recog", name='output', active="linear") # n_steps*n_samples*actionNum probs = T.nnet.softmax(lin.reshape([-1, actionNum])) probs = probs.reshape([n_steps, n_samples, actionNum]) """compute cost""" cost = 0 # cross entropy entropy_cost = -y * T.log(probs + 1e-8) entropy_cost = (entropy_cost.sum(2) * mask).mean(0).sum() * 100 cost += entropy_cost # weight decay weight_decay = 0. if decay_c > 0.: decay_c = theano.shared(np.float32(decay_c), name='decay_c') for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay """Predictions""" preds = T.sum(probs[-last_n:, :, :], axis=0) preds = T.argmax(preds, axis=1) # n_samples # preds=T.argmax(probs[-last_n:,:,:],axis=2); return cost, preds, [], [x, mask, y], use_noise
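For reference, the masked cross-entropy and the last_n-step voting used in build_model can be restated in plain NumPy. This is only a sketch under the same shape conventions (probs and y are n_steps x n_samples x actionNum, mask is n_steps x n_samples); the function name is hypothetical.

import numpy as np

def masked_xent_and_preds(probs, y, mask, last_n):
    # cross entropy per step and sample, masked and scaled as above
    xent = -y * np.log(probs + 1e-8)                   # n_steps x n_samples x actionNum
    cost = (xent.sum(2) * mask).mean(0).sum() * 100.0  # scalar, mirrors entropy_cost
    # vote over the last last_n steps, then pick the best action per sample
    preds = probs[-last_n:].sum(0).argmax(1)           # n_samples
    return cost, preds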
def __init__(self, nam, maxlen=0, load=False, training=False): # Create two LSTM units (parameters: W, U, b), store them in a dictionary and initialize the parameters # Generate 2 LSTM units with Gaussian initialization # Type: Dictionary self.maxlen = maxlen newp = creatrnnx() self.model_name = nam # Initialize the weights (W, U, b) of both LSTM units to the same values for i in newp.keys(): if i[0] == '1': newp['2' + i[1:]] = newp[i] # Create 5 symbolic tensor variables (y, mask11, mask21, emb11, emb21) # Here, config.floatX = 'float32' y = T.vector('y', dtype = config.floatX) mask11 = T.matrix('mask11', dtype = config.floatX) mask21 = T.matrix('mask21', dtype = config.floatX) emb11 = T.ftensor3('emb11') emb21 = T.ftensor3('emb21') # 3-D float-type tensor # Load an existing model (pre-trained weights) if needed if load == True: newp = pickle.load(open(nam,'rb')) # Convert 'newp' to the shared-tensor dictionary 'tnewp' # Shared tensor variables self.tnewp = init_tparams(newp) # Set tensor-type noise use_noise = theano.shared(numpy_floatX(0.)) # Set tensor-type random number generator # rng -> random number generator trng = RandomStreams(1234) # rrng: a 3-D random binary tensor used for dropout rate = 0.5 rrng = trng.binomial(emb11.shape, p = 1 - rate, n = 1, dtype = emb11.dtype) # print "rrng:" # print "type of rrng:", type(rrng) # print rrng # Instantiate the LSTM structure and parameters (the core part); proj holds the outputs after feeding one mini-batch # Implement the LSTM module; # Here 'False' -> do NOT apply DROPOUT layers; # Since the input is in the format: (Max No. of words in batch, No. of Samples, 300) # Note that the 1st term and 2nd term are exchanged! # Only the last timestep of the scan loop is kept; the earlier LSTM outputs are discarded # proj11[-1] -> (No. of samples[N], Hidden unit dimension) -> (N, 50) # proj11 takes the input embedding matrix emb11 and gives the output of LSTM_A proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50, self.tnewp)[-1] proj21 = getpl2(emb21, '2lstm1', mask21, False, rrng, 50, self.tnewp)[-1] # Define the cost function dif = (proj21 - proj11).norm(L = 1, axis = 1) s2 = T.exp(-dif) sim = T.clip(s2, 1e-7, 1.0-1e-7) # Similarity lr = tensor.scalar(name = 'lr') # learning rate ys = T.clip((y-1.0) / 4.0, 1e-7, 1.0-1e-7) cost = T.mean((sim - ys) ** 2) ns=emb11.shape[1] self.f2sim = theano.function([emb11, mask11, emb21, mask21], sim, allow_input_downcast = True) self.f_proj11 = theano.function([emb11, mask11], proj11, allow_input_downcast = True) # NOT used self.f_cost = theano.function([emb11, mask11, emb21, mask21, y], cost, allow_input_downcast = True) # NOT used # Prepare for backpropagation and gradient descent if training == True: # Compute the derivatives of the cost w.r.t. each parameter and average the gradients of the two LSTMs # gradi is the list of gradients of the cost used to update the weights # We average out the gradients by appending to another list grads[] # So, we average out the gradients wrt LSTM_A and wrt LSTM_B # i.e. gradient = (grad(wrt LSTM_A) + grad(wrt LSTM_B)) / 2.0 to maintain the symmetry between the two LSTMs # wrt: (variable or list of variables) - term[s] for which we want gradients gradi = tensor.grad(cost, wrt = self.tnewp.values()) # T.grad -> differentiation grads = [] l = len(gradi) for i in range(0, l/2): gravg = (gradi[i] + gradi[i + l / 2]) / (4.0) #print i,i+9 grads.append(gravg) for i in range(0, len(self.tnewp.keys()) / 2): grads.append(grads[i]) # Here, f_grad_shared and f_update are theano functions self.f_grad_shared, self.f_update = adadelta(lr, self.tnewp, grads, emb11, mask11, emb21, mask21, y, cost)
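The similarity above is the Manhattan-distance form sim = exp(-||proj21 - proj11||_1), regressed with a squared error onto the rescaled gold score (the (y-1)/4 rescaling suggests scores on a 1..5 scale). A small NumPy sketch; the helper name is hypothetical.

import numpy as np

def manhattan_similarity(proj_a, proj_b, y):
    # proj_a, proj_b: final LSTM states, shape (n_samples, 50)
    # y: gold relatedness scores, assumed here to lie in [1, 5]
    dif = np.abs(proj_a - proj_b).sum(axis=1)        # L1 distance per pair
    sim = np.clip(np.exp(-dif), 1e-7, 1.0 - 1e-7)    # similarity in (0, 1)
    ys = np.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)  # rescale gold score to (0, 1)
    return sim, np.mean((sim - ys) ** 2)             # similarity and MSE cost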
def __init__(self, glimpse_shape, glimpse_times, dim_hidden, dim_fc, dim_out, reward_base, rng_std=1.0, activation=T.tanh, bptt_truncate=-1, lmbd=0.1 # gdupdate + lmbd*rlupdate ): if reward_base is None: reward_base = np.zeros((glimpse_times)).astype('float32') reward_base[-1] = 1.0 x = T.ftensor3('x') # N * W * H y = T.ivector('y') # label lr = T.fscalar('lr') reward_base = theano.shared(name='reward_base', value=np.array(reward_base).astype(theano.config.floatX), borrow=True) # Time (vector) reward_bias = T.fvector('reward_bias') rng = MRG_RandomStreams(np.random.randint(9999999)) # rng = theano.tensor.shared_randomstreams.RandomStreams(np.random.randint(9999999)) i = InputLayer(x) au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng, rng_std, activation, bptt_truncate) # All hidden states are put into decoder # layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))] # dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out] # Only the last hidden states layers = [i, au, InputLayer(au.output[:,-1,:])] dim_fc = [dim_hidden] + dim_fc + [dim_out] for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]): fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC') layers.append(fc) sm = SoftmaxLayer(layers[-1].output) layers.append(sm) output = sm.output # N * classes hidoutput = au.output # N * dim_output location = au.location # N * T * dim_hidden prediction = output.argmax(1) # N # calc equalvec = T.eq(prediction, y) # [0, 1, 0, 0, 1 ...] correct = T.cast(T.sum(equalvec), 'float32') # noequalvec = T.neq(prediction, y) # nocorrect = T.cast(T.sum(noequalvec), 'float32') logLoss = T.log(output)[T.arange(y.shape[0]), y] reward_biased = T.outer(equalvec, reward_base) - reward_bias.dimshuffle('x', 0) # N * Time # (R_t - b_t), where b = E[R]; needed by rlobjective below # gradient descent gdobjective = logLoss.sum()/x.shape[0] # correct * dim_output (only has value on the correctly predicted sample) gdparams = reduce(lambda x, y: x+y.params, layers, []) gdupdates = map(lambda x: (x, x+lr*T.grad(gdobjective, x)), gdparams) # reinforce learning rlobjective = (reward_biased.dimshuffle(0, 1, 'x') * T.log(au.location_p)).sum() / x.shape[0] # location_p: N * Time * 2 # location_logp: N * Time # reward_biased: N * Time rlparams = au.reinforceParams rlupdates = map(lambda x: (x, x+lr*lmbd*T.grad(rlobjective, x)), rlparams) # Hidden state stays unchanged over time deltas = T.stack(*[((au.output[:,i,:].mean(0)-au.output[:,i+1,:].mean(0))**2).sum() for i in xrange(glimpse_times-1)]) # N * Time * dim_hidden print 'compile step()' self.step = theano.function([x, y, lr, reward_bias], [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)], updates=gdupdates+rlupdates) # print 'compile gdstep()' # self.gdstep = theano.function([x, y, lr], [gdobjective, correct, location], updates=gdupdates) # print 'compile rlstep()' # self.rlstep = theano.function([x, y, lr], [rlobjective], updates=rlupdates) print 'compile predict()' self.predict = theano.function([x], prediction) # print 'compile forward()' # self.forward = theano.function([x], map(lambda x: x.output, layers)) #[layers[-3].output, fc.output]) # print 'compile error()' # self.error = theano.function([x, y], gdobjective) print 'compile locate()' self.locate = theano.function([x], [au.location_mean, location]) #[layers[-3].output, fc.output]) print 'compile debug()' self.debug = theano.function([x, y, lr, reward_bias], [deltas, au.location_p], on_unused_input='warn') # self.xxx self.glimpse_times = glimpse_times
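The REINFORCE term above weights the log location probabilities by a baselined reward (R_t - b_t). A NumPy sketch of just that reward computation; the helper name is hypothetical.

import numpy as np

def biased_reward(predictions, labels, reward_base, reward_bias):
    # reward_base: per-glimpse reward template (non-zero only at the last
    # glimpse in the constructor above); reward_bias: running estimate of
    # E[R] per glimpse. Returns an N x Time matrix of (R_t - b_t).
    equalvec = (predictions == labels).astype('float32')  # N
    return np.outer(equalvec, reward_base) - reward_bias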
batch_size=args.batch_size, use_ivectors=True) valid_datastream = get_datastream(path=args.data_path, which_set=args.valid_dataset, batch_size=args.batch_size, use_ivectors=True) test_datastream = get_datastream(path=args.data_path, which_set=args.test_dataset, batch_size=args.batch_size, use_ivectors=True) ################# # build network # ################# print('Building and compiling network') input_data = T.ftensor3('input_data') input_cond = T.ftensor3('input_cond') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_output = deep_projection_ivector_ln_model_fix( input_var=input_data, cond_var=input_cond, mask_var=input_mask, num_inputs=input_dim, num_outputs=output_dim, num_layers=args.num_layers, num_conds=args.num_conds, num_factors=args.num_factors, num_units=args.num_units, grad_clipping=args.grad_clipping,
def setup(self): """ Set up the model to train. """ # input_words: shape (n_batch, n_sentence, sentence_len) input_words = T.itensor3() n_batch, n_sentences, sentence_len = input_words.shape # query_words: shape (n_batch, query_len) query_words = T.imatrix() # correct_output: shape (n_batch, ?, num_output_words) correct_output = T.ftensor3() # graph_num_new_nodes: shape(n_batch, n_sentence) graph_num_new_nodes = T.imatrix() # graph_new_node_strengths: shape(n_batch, n_sentence, new_nodes_per_iter) graph_new_node_strengths = T.ftensor3() # graph_new_node_ids: shape(n_batch, n_sentence, new_nodes_per_iter, num_node_ids) graph_new_node_ids = T.ftensor4() # graph_new_edges: shape(n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types) graph_new_edges = T.TensorType('floatX', (False, ) * 5)() def _build(with_correct_graph, snap_to_best, using_dropout, evaluate_accuracy): info = {} # Process each sentence, flattened to (?, sentence_len) flat_input_words = input_words.reshape([-1, sentence_len]) flat_input_reprs, flat_ref_matrices = self.input_transformer.process( flat_input_words) # flat_input_reprs of shape (?, input_repr_size) # flat_ref_matrices of shape (?, num_node_ids, input_repr_size) input_reprs = flat_input_reprs.reshape( [n_batch, n_sentences, self.input_repr_size]) ref_matrices = flat_ref_matrices.reshape([ n_batch, n_sentences, self.num_node_ids, self.input_repr_size ]) query_repr, query_ref_matrix = self.input_transformer.process( query_words) if using_dropout: iter_dropouts = [] states_mask = util.make_dropout_mask( (self.node_state_size, ), self.dropout_keep, self.srng) if self.nodes_mutable: iter_dropouts.extend( self.node_state_updater.dropout_masks( self.srng, states_mask)) if len(self.word_node_mapping) > 0: iter_dropouts.extend( self.direct_reference_updater.dropout_masks( self.srng, states_mask)) if self.intermediate_propagate != 0: iter_dropouts.extend( self.intermediate_propagator.dropout_masks( self.srng, states_mask)) if self.dynamic_nodes: iter_dropouts.extend( self.new_node_adder.dropout_masks(self.srng)) iter_dropouts.extend( self.edge_state_updater.dropout_masks(self.srng)) else: iter_dropouts = [] states_mask = None def _iter_fn(input_repr, ref_matrix, gstate, correct_num_new_nodes=None, correct_new_strengths=None, correct_new_node_ids=None, correct_edges=None, dropout_masks=None): # If necessary, update node state if self.nodes_mutable: gstate, dropout_masks = self.node_state_updater.process( gstate, input_repr, dropout_masks) if len(self.word_node_mapping) > 0: gstate, dropout_masks = self.direct_reference_updater.process( gstate, ref_matrix, dropout_masks) # If necessary, propagate node state if self.intermediate_propagate != 0: gstate, dropout_masks = self.intermediate_propagator.process_multiple( gstate, self.intermediate_propagate, dropout_masks) node_loss = None node_accuracy = None # Propose and vote on new nodes if self.dynamic_nodes: new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates( gstate, input_repr, self.new_nodes_per_iter, dropout_masks) # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter) # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids) if with_correct_graph: perm_idxs = np.array( list( itertools.permutations( range(self.new_nodes_per_iter)))) permuted_correct_str = correct_new_strengths[:, perm_idxs] permuted_correct_ids = correct_new_node_ids[:, perm_idxs] # due to advanced indexing, we should have shape (n_batch, permutation, 
new_nodes_per_iter, num_node_ids) ext_new_str = T.shape_padaxis(new_strengths, 1) ext_new_ids = T.shape_padaxis(new_ids, 1) strength_ll = permuted_correct_str * T.log( ext_new_str + util.EPSILON) + (1 - permuted_correct_str) * T.log( 1 - ext_new_str + util.EPSILON) ids_ll = permuted_correct_ids * T.log(ext_new_ids + util.EPSILON) reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum( ids_ll, axis=[2, 3]) if self.best_node_match_only: node_loss = -T.max(reduced_perm_lls, 1) else: full_ll = util.reduce_log_sum(reduced_perm_lls, 1) # Note that some of these permutations are identical, since we likely did not add the maximum # amount of nodes. Thus we will have added repeated elements here. # We have log(x+x+...+x) = log(kx), where k is the repetition factor and x is the probability we want # log(kx) = log(k) + log(x) # Our repetition factor k is given by (new_nodes_per_iter - correct_num_new_nodes)! # Recall that n! = gamma(n+1) # so log(x) = log(kx) - log(gamma(k+1)) log_rep_factor = T.gammaln( T.cast( self.new_nodes_per_iter - correct_num_new_nodes + 1, 'floatX')) scaled_ll = full_ll - log_rep_factor node_loss = -scaled_ll if evaluate_accuracy: best_match_idx = T.argmax(reduced_perm_lls, 1) # should be of shape (n_batch), indexing the best permutation best_correct_str = permuted_correct_str[ T.arange(n_batch), best_match_idx] best_correct_ids = permuted_correct_ids[ T.arange(n_batch), best_match_idx] snapped_strengths = util.independent_best( new_strengths) snapped_ids = util.categorical_best( new_ids) * T.shape_padright(snapped_strengths) close_strengths = T.all( T.isclose(best_correct_str, snapped_strengths), (1)) close_ids = T.all( T.isclose(best_correct_ids, snapped_ids), (1, 2)) node_accuracy = T.and_(close_strengths, close_ids) # now substitute in the correct nodes gstate = gstate.with_additional_nodes( correct_new_strengths, correct_new_node_ids) elif snap_to_best: snapped_strengths = util.independent_best( new_strengths) snapped_ids = util.categorical_best(new_ids) gstate = gstate.with_additional_nodes( snapped_strengths, snapped_ids) else: gstate = gstate.with_additional_nodes( new_strengths, new_ids) # Update edge state gstate, dropout_masks = self.edge_state_updater.process( gstate, input_repr, dropout_masks) if with_correct_graph: cropped_correct_edges = correct_edges[:, :gstate.n_nodes, : gstate.n_nodes, :] edge_lls = cropped_correct_edges * T.log( gstate.edge_strengths + util.EPSILON) + (1 - cropped_correct_edges) * T.log( 1 - gstate.edge_strengths + util.EPSILON) # edge_lls currently penalizes for edges connected to nodes that do not exist # we do not want it to do this, so we mask it with node strengths mask_src = util.shape_padaxes(gstate.node_strengths, [2, 3]) mask_dest = util.shape_padaxes(gstate.node_strengths, [1, 3]) masked_edge_lls = edge_lls * mask_src * mask_dest edge_loss = -T.sum(masked_edge_lls, axis=[1, 2, 3]) if evaluate_accuracy: snapped_edges = util.independent_best( gstate.edge_strengths) close_edges = T.isclose(cropped_correct_edges, snapped_edges) ok_mask = 1 - T.cast( mask_src * mask_dest, 'int8' ) # its OK for things not to match if node strengths are NOT both 1 edge_accuracy = T.all(T.or_(close_edges, ok_mask), (1, 2, 3)) overall_accuracy = edge_accuracy if node_accuracy is None else T.and_( node_accuracy, edge_accuracy) else: overall_accuracy = None gstate = gstate.with_updates( edge_strengths=cropped_correct_edges) return gstate, node_loss, edge_loss, overall_accuracy elif snap_to_best: snapped_edges = util.independent_best( gstate.edge_strengths) 
gstate = gstate.with_updates(edge_strengths=snapped_edges) return gstate else: return gstate # Scan over each sentence def _scan_fn( input_repr, *stuff ): # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size) stuff = list(stuff) if len(self.word_node_mapping) > 0: ref_matrix = stuff[0] stuff = stuff[1:] else: ref_matrix = None if with_correct_graph: c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[: 4] stuff = stuff[4:] if using_dropout: dropout_masks = stuff[:len(iter_dropouts)] stuff = stuff[len(iter_dropouts):] else: dropout_masks = None flat_graph_state = stuff[:-1] pad_graph_size = stuff[-1] gstate = GraphState.unflatten_from_const_size(flat_graph_state) if with_correct_graph: gstate, node_loss, edge_loss, overall_accuracy = _iter_fn( input_repr, ref_matrix, gstate, c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges, dropout_masks=dropout_masks) else: gstate = _iter_fn(input_repr, ref_matrix, gstate, dropout_masks=dropout_masks) retvals = gstate.flatten_to_const_size(pad_graph_size) if with_correct_graph: if self.dynamic_nodes: retvals.append(node_loss) retvals.append(edge_loss) if evaluate_accuracy: retvals.append(overall_accuracy) return retvals if self.dynamic_nodes: initial_gstate = GraphState.create_empty( n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) else: initial_gstate = GraphState.create_full_unique( n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness if self.dynamic_nodes: pad_graph_size = n_sentences * self.new_nodes_per_iter + 1 else: pad_graph_size = self.num_node_ids outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size) prepped_input = input_reprs.dimshuffle([1, 0, 2]) sequences = [prepped_input] if len(self.word_node_mapping) > 0: sequences.append(ref_matrices.dimshuffle([1, 0, 2, 3])) if with_correct_graph: sequences.append(graph_num_new_nodes.swapaxes(0, 1)) sequences.append(graph_new_node_strengths.swapaxes(0, 1)) sequences.append(graph_new_node_ids.swapaxes(0, 1)) sequences.append(graph_new_edges.swapaxes(0, 1)) if self.dynamic_nodes: outputs_info.extend([None]) if evaluate_accuracy: outputs_info.extend([None]) outputs_info.extend([None]) if using_dropout: sequences.extend(iter_dropouts) all_scan_out, _ = theano.scan(_scan_fn, sequences=sequences, outputs_info=outputs_info, non_sequences=[pad_graph_size]) graph_accurate_list = None if with_correct_graph: if evaluate_accuracy: full_graph_accuracy = all_scan_out[-1] all_scan_out = all_scan_out[:-1] graph_accurate_list = T.all(full_graph_accuracy, 0) info["graph_accuracy"] = T.sum(graph_accurate_list, dtype='floatX') / T.cast( n_batch, 'floatX') if self.dynamic_nodes: all_flat_gstates = all_scan_out[:-2] node_loss, edge_loss = all_scan_out[-2:] reduced_node_loss = T.sum(node_loss) / T.cast( n_batch, 'floatX') reduced_edge_loss = T.sum(edge_loss) / T.cast( n_batch, 'floatX') avg_graph_loss = (reduced_node_loss + reduced_edge_loss) / T.cast( input_words.shape[1], 'floatX') info["node_loss"] = reduced_node_loss info["edge_loss"] = reduced_edge_loss else: all_flat_gstates = all_scan_out[:-1] edge_loss = all_scan_out[-1] reduced_edge_loss = T.sum(edge_loss) / T.cast( n_batch, 'floatX') avg_graph_loss = reduced_edge_loss / T.cast( input_words.shape[1], 'floatX') info["edge_loss"] = reduced_edge_loss else: all_flat_gstates = all_scan_out if self.sequence_representation: # Each part of all_flat_gstates is of 
shape (n_sentences, n_batch, ...) # except for the last one, which we handle separately # Swap to (n_batch, n_sentences, ...) # Then flatten to (n_batch*n_sentences, ...) for further processing final_flat_gstate = [ x.swapaxes(0, 1).reshape(T.concatenate([[-1], x.shape[2:]]), ndim=(x.ndim - 1)) for x in all_flat_gstates[:-1] ] # As for the last one, we need to get a single scalar value. The last one will be the biggest # so we will take that. Note that this will introduce a bunch of zero-nodes, but thats # OK and we can process that later. (We REQUIRE that padding in graph_state makes zero strength # nodes here!) final_flat_gstate.append(all_flat_gstates[-1][-1]) # We also need to repeat query_repr and query_ref_matrix so that they broadcast together query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0) query_ref_matrix = T.extra_ops.repeat(query_ref_matrix, n_sentences, 0) else: # Extract last timestep final_flat_gstate = [x[-1] for x in all_flat_gstates] final_gstate = GraphState.unflatten_from_const_size( final_flat_gstate) if self.train_with_query: if self.wipe_node_state: final_gstate = final_gstate.with_updates( node_states=T.zeros_like(final_gstate.node_states)) qnsu_dropout_masks = self.query_node_state_updater.dropout_masks( self.srng, states_mask) query_gstate, _ = self.query_node_state_updater.process( final_gstate, query_repr, qnsu_dropout_masks) if len(self.word_node_mapping) > 0: qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks( self.srng, states_mask) query_gstate, _ = self.query_direct_reference_updater.process( query_gstate, query_ref_matrix, qdru_dropout_masks) fp_dropout_masks = self.final_propagator.dropout_masks( self.srng, states_mask) propagated_gstate, _ = self.final_propagator.process_multiple( query_gstate, self.final_propagate, fp_dropout_masks) agg_dropout_masks = self.aggregator.dropout_masks(self.srng) aggregated_repr, _ = self.aggregator.process( propagated_gstate, agg_dropout_masks) # shape (n_batch, output_repr_size) if self.sequence_representation: # aggregated_repr is of shape (n_batch*n_sentences, repr_width) # We want to split back to timesteps: (n_batch, n_sentences, repr_width) agg_repr_seq = aggregated_repr.reshape( [n_batch, n_sentences, -1]) # Now collapse it to a summary representation aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks( self.srng) aggregated_repr, _ = self.aggregate_summarizer.process( agg_repr_seq, aggsum_dropout_masks) # At this point aggregated_repr is (n_batch, repr_width) as desired max_seq_len = correct_output.shape[1] if self.output_format == ModelOutputFormat.sequence: final_output = self.output_processor.process( aggregated_repr, max_seq_len) # shape (n_batch, ?, num_output_words) else: final_output = self.output_processor.process( aggregated_repr) if snap_to_best: final_output = self.output_processor.snap_to_best( final_output) if self.output_format == ModelOutputFormat.subset: elemwise_loss = T.nnet.binary_crossentropy( final_output, correct_output) query_loss = T.sum(elemwise_loss) else: flat_final_output = final_output.reshape( [-1, self.num_output_words]) flat_correct_output = correct_output.reshape( [-1, self.num_output_words]) timewise_loss = T.nnet.categorical_crossentropy( flat_final_output, flat_correct_output) query_loss = T.sum(timewise_loss) query_loss = query_loss / T.cast(n_batch, 'floatX') info["query_loss"] = query_loss else: final_output = T.zeros([]) full_loss = np.array(0.0, np.float32) if with_correct_graph: full_loss = full_loss + avg_graph_loss if 
self.train_with_query: full_loss = full_loss + query_loss if self.train_with_query: adjusted_query_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim + 1)) if self.sequence_representation else T.shape_padaxis(x, 1) for x in query_gstate.flatten() ] adjusted_prop_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim + 1)) if self.sequence_representation else T.shape_padaxis(x, 1) for x in propagated_gstate.flatten() ] full_flat_gstates = [ T.concatenate([a.swapaxes(0, 1), b, c], 1) for a, b, c in zip(all_flat_gstates[:-1], adjusted_query_gstates, adjusted_prop_gstates) ] else: full_flat_gstates = [ a.swapaxes(0, 1) for a in all_flat_gstates[:-1] ] max_seq_len = T.iscalar() return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info train_loss, _, _, _, _, train_info = _build(self.train_with_graph, False, True, False) adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var) self.info_keys = list(train_info.keys()) print("Compiling...") optimizer = theano.compile.predefined_optimizers[ 'fast_run' if self.check_mode == 'debug' else theano.config.optimizer] optimizer = optimizer.excluding( "scanOp_pushout_output", "remove_constants_and_unused_inputs_scan") if self.check_mode == 'nan': mode = NanGuardMode(optimizer=optimizer, nan_is_error=True, inf_is_error=True, big_is_error=True) elif self.check_mode == 'debug': mode = DebugMode(optimizer=optimizer, check_isfinite=False, check_py_code=False, stability_patience=1) theano.tensor.TensorType.filter_checks_isfinite = False else: mode = theano.Mode(optimizer=optimizer) self.train_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], [train_loss] + list(train_info.values()), updates=adam_updates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build( self.train_with_graph, False, False, True) self.eval_info_keys = list(eval_info.keys()) self.eval_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], [eval_loss, graph_accurate_list] + list(eval_info.values()), allow_input_downcast=True, on_unused_input='ignore', mode=mode) self.debug_test_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build( False, False, False, False) self.fuzzy_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build( False, True, False, False) self.snap_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode)
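One subtle step in the node-matching loss above is the correction for repeated permutations: when fewer than new_nodes_per_iter nodes were actually added, summing the likelihood over all permutations counts each distinct assignment k = (new_nodes_per_iter - correct_num_new_nodes)! times, so log(k) is subtracted back out. A scalar sketch of that correction; the helper name is hypothetical.

import math

def corrected_log_likelihood(full_ll, new_nodes_per_iter, correct_num_new_nodes):
    # full_ll is the log of the likelihood summed over all permutations;
    # each distinct assignment was counted (unused)! times, and
    # log((unused)!) == lgamma(unused + 1), so subtract it.
    unused = new_nodes_per_iter - correct_num_new_nodes
    return full_ll - math.lgamma(unused + 1)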
def __init__(self, nh, nc, ne, de, cs): """ nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary de :: dimension of the word embeddings cs :: word window context size """ # # parameters of the model # self.nh = nh self.nc = nc self.ne = ne self.de = de self.cs = cs # add one for PADDING at the end #self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de)).astype(theano.config.floatX)) #self.emb = gensim.models.Word2Vec.load_word2vec_format('vectors.bin', binary=True) # parameters for the input layer self.Wx = theano.shared( 0.2 * numpy.random.uniform(-1.0, 1.0, (de * cs, nh)).astype(theano.config.floatX)) # parameters for stored histories in the hidden layer self.Wh = theano.shared( 0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh)).astype(theano.config.floatX)) self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX)) # parameters for the output layer self.W = theano.shared( 0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nc)).astype(theano.config.floatX)) self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX)) # initial value of the stored histories in the hidden layer self.h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX)) # bundle #self.params = [self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0] self.params = [self.Wx, self.Wh, self.W, self.bh, self.b, self.h0] self.names = ['Wx', 'Wh', 'W', 'bh', 'b', 'h0'] idxs = T.ftensor3( ) # as many columns as context window size/lines as words in the sentence #self.x = self.emb[idxs].reshape((idxs.shape[0], de * cs)) self.x = idxs.reshape((idxs.shape[0], de * cs)) y = T.iscalar('y') # label # x_t: the input at time t # s_t: the output of the output layer (real output) at time t # h_tm1: the output of the hidden layer at time (t - 1) def recurrence(x_t, h_tm1): h_t = T.nnet.sigmoid( T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh) s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b) return [h_t, s_t] [h, s], _ = theano.scan(fn=recurrence, sequences=self.x, outputs_info=[self.h0, None], n_steps=self.x.shape[0]) p_y_given_x_lastword = s[-1, 0, :] p_y_given_x_sentence = s[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) # cost and gradients and learning rate lr = T.scalar('lr') nll = -T.mean(T.log(p_y_given_x_lastword)[y]) gradients = T.grad(nll, self.params) updates = OrderedDict( (p, p - lr * g) for p, g in zip(self.params, gradients)) # theano functions self.classify = theano.function(inputs=[idxs], outputs=y_pred) self.test = theano.function(inputs=[idxs], outputs=p_y_given_x_sentence) self.train = theano.function(inputs=[idxs, y, lr], outputs=nll, updates=updates)
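The scan above implements a plain Elman step, h_t = sigmoid(x_t Wx + h_{t-1} Wh + bh), followed by a softmax output layer. A NumPy restatement of that forward pass, sketch only; the function name is hypothetical.

import numpy as np

def elman_forward(x, Wx, Wh, bh, W, b, h0):
    # x: (n_words, de * cs) context-window features; returns
    # p(y | x) for every word position, shape (n_words, nc).
    def softmax(v):
        e = np.exp(v - v.max())
        return e / e.sum()
    h, out = h0, []
    for x_t in x:
        h = 1.0 / (1.0 + np.exp(-(x_t.dot(Wx) + h.dot(Wh) + bh)))  # sigmoid recurrence
        out.append(softmax(h.dot(W) + b))
    return np.array(out)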
def create_model(n_in, n_out, n_enc, n_hid, n_cyc): x = T.ftensor3('x') # x <batch_size, sequence_len, n_in> y = T.imatrix('y') # y <batch_size, sequence_len> # Layers c0 = Conv2D(16, 1, 3, 3) c1 = Conv2D(32, 16, 3, 3) #c1 = Conv2D(32, 32, 3, 3) #c1 = Conv2D(32, 1, 3, 3) #g0 = GRU(n_in, n_enc) #g1 = GRU(32*12, n_enc) #g2 = GRU(32*12, n_enc) g1 = GRU(32*7, n_enc) g2 = GRU(32*7, n_enc) d2 = TimeDistributedDense(n_enc*2, n_enc) #att = AttentionARSGy(n_enc, n_hid, n_cyc) att = AttentionARSGy(n_enc, n_out, n_hid, n_cyc) #do = Dense(n_cyc, n_out) #do = TimeDistributedDense(n_cyc, n_out) params = [ c0.params, c1.params, #g0.params, g1.params, g2.params, att.params, #do.params, #d_0.params, #d_1.params, d2.params, ] # Logic x0 = x.reshape((x.shape[0], 1, x.shape[1], x.shape[2])) xc = relu(c0.apply(x0)) xc = max_pool_2d(xc, (2,2)) xc = relu(c1.apply(xc)) #xc = max_pool_2d(xc, (2,2)) #xc = relu(c1.apply(xc)) x1 = xc.dimshuffle(0,2,1,3) x1 = x1.reshape((x1.shape[0], x1.shape[1], -1)) #x0 = g0.apply(x0) #x1 = x0[:, ::skip_rate[0]] #x1 = d_0.apply(x1) x2_f = g1.apply(x1) #, truncate_gradient=30) x2_b = g2.apply(x1[:,::-1]) #, truncate_gradient=30) x2 = T.concatenate([x2_f, x2_b[:,::-1]], axis=2) #x2 = x2[:, ::skip_rate[0]] #x2 = d_1.apply(x2) x3 = d2.apply(x2) xe = x3 Y = [] A = [] # extract glimplse H, alphas, out = att.apply(xe, y.shape[1]) # H: batch_size, y_len, n_hid # alphas: batch_size, x_len o_shp = out.shape o = T.reshape(out, (-1, o_shp[2])) loss = T.nnet.categorical_crossentropy(o, y.flatten()).mean() params = [p for pp in params for p in pp] return [x, y], out, loss, params, alphas
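The loss at the end of create_model flattens the time-distributed softmax output before applying the categorical cross-entropy against the flattened labels. A NumPy sketch of that step; the helper name is hypothetical, and the 1e-8 floor is added here for numerical safety only (it is not in the original).

import numpy as np

def flattened_xent(out, y):
    # out: (batch, y_len, n_out), rows summing to 1; y: (batch, y_len) int labels
    o = out.reshape(-1, out.shape[2])       # (batch * y_len, n_out)
    t = y.flatten()                         # (batch * y_len,)
    return np.mean(-np.log(o[np.arange(t.size), t] + 1e-8))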
def test_gpu_memory_usage(self): # This test validates that the memory usage of the defined theano # function is reasonnable when executed on the GPU. It checks for # a bug in which one of scan's optimization was not applied which # made the scan node compute large and unnecessary outputs which # brought memory usage on the GPU to ~12G. # Dimensionality of input and output data (not one-hot coded) n_in = 100 n_out = 100 # Number of neurons in hidden layer n_hid = 4000 # Number of minibatches mb_size = 2 # Time steps in minibatch mb_length = 200 # Define input variables xin = tensor.ftensor3(name="xin") yout = tensor.ftensor3(name="yout") # Initialize the network parameters U = theano.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1") V = theano.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1") W = theano.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2") nparams = [U, V, W] # Build the forward pass l1_base = tensor.dot(xin, U) def scan_l(baseline, last_step): return baseline + tensor.dot(last_step, V) zero_output = tensor.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid) l1_out, _ = scan( scan_l, sequences=[l1_base], outputs_info=[zero_output], mode=self.mode_with_gpu_nodebug, ) l2_out = tensor.dot(l1_out, W) # Compute the cost and take the gradient wrt params cost = tensor.sum((l2_out - yout) ** 2) grads = tensor.grad(cost, nparams) updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads)))) # Compile the theano function feval_backprop = theano.function( [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug ) # Validate that the PushOutScanOutput optimization has been applied # by checking the number of outputs of the grad Scan node in the # compiled function. nodes = feval_backprop.maker.fgraph.toposort() scan_nodes = [n for n in nodes if isinstance(n.op, Scan)] # The grad scan is always the 2nd one according to toposort. If the # optimization has been applied, it has 2 outputs, otherwise 3. grad_scan_node = scan_nodes[1] assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs) # Call the theano function to ensure the absence of a memory error feval_backprop( np.zeros((mb_length, mb_size, n_in), dtype="float32"), np.zeros((mb_length, mb_size, n_out), dtype="float32"), )
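The scan in this test computes l1_out[t] = l1_base[t] + dot(l1_out[t-1], V), starting from zeros. Restated as a NumPy loop for clarity, sketch only; the function name is hypothetical.

import numpy as np

def forward_pass(xin, U, V, W):
    # xin: (mb_length, mb_size, n_in); returns (mb_length, mb_size, n_out)
    l1_base = xin.dot(U)                     # per-step input projection
    h = np.zeros((xin.shape[1], V.shape[0]), dtype=xin.dtype)
    steps = []
    for t in range(xin.shape[0]):
        h = l1_base[t] + h.dot(V)            # mirrors scan_l(baseline, last_step)
        steps.append(h)
    return np.stack(steps).dot(W)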
def __init__(self, config): self.config = config batch_size = config['batch_size'] num_seq = config['num_seq'] self.n_timesteps = config['num_timesteps'] num_joints = config['num_joints'] classes_num = config['classes_num'] # ##################### BUILD NETWORK ########################## mask = T.fvector('mask') y = T.lvector('y') target = T.ftensor3('target') rand = T.fvector('rand') trng = RandomStreams(1234) use_noise = T.fscalar('use_noise') print '... building the model' self.layers = [] params = [] weight_types = [] conv_fea = T.ftensor4('conv_fea') #(49, 16, 8, 1024) lstm_att_layer15 = JointAttentionLstmLayer(config, num_joints, conv_fea=conv_fea, mask=mask, batch_size=batch_size, num_seq=num_seq, trng=trng, use_noise=use_noise, n_in=1024 * 5, n_out=1024, dim_part=32) self.layers.append(lstm_att_layer15) params += lstm_att_layer15.params weight_types += lstm_att_layer15.weight_type self.conv_fea = conv_fea softmax_input = lstm_att_layer15.output softmax_layer15 = SoftmaxLayer(input=softmax_input, n_in=1024, n_out=21) self.layers.append(softmax_layer15) params += softmax_layer15.params weight_types += softmax_layer15.weight_type # #################### NETWORK BUILT ####################### self.cost_nll = softmax_layer15.negative_log_likelihood(y, mask) self.cost_jhmdb_attention = T.mean(T.sum(T.sum( 0.5 * (lstm_att_layer15.attention - target)**2, axis=1), axis=1), axis=0, dtype=theano.config.floatX) self.cost = self.cost_nll + self.cost_jhmdb_attention self.errors_video = softmax_layer15.errors_video( y, mask, batch_size, num_seq) self.params = params self.prob = softmax_layer15.p_y_given_x self.mask = mask self.y = y self.target = target self.rand = rand self.weight_types = weight_types self.batch_size = batch_size self.num_seq = num_seq self.use_noise = use_noise
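The total cost above adds a supervised attention term to the negative log likelihood; the attention term is a half-squared error summed over the last two axes and averaged over the first. A NumPy sketch of that term; the helper name is hypothetical.

import numpy as np

def attention_mse(attention, target):
    # attention, target: 3-D arrays (matching the ftensor3 'target' above)
    per_sample = np.sum(np.sum(0.5 * (attention - target) ** 2, axis=1), axis=1)
    return per_sample.mean(axis=0)  # mirrors cost_jhmdb_attention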
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims = np.array([self.input_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1., std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate( np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05 mus = y_hat[:, :, self.output_dim / 2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5 * ((y - mus)**2) / sig**2 expo = T.exp(inside_expo) coeff = 1. / (T.sqrt(2. * np.pi) * sig) inside_log = T.log(coeff * expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log( T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [sig, mus]) return gradf, f
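The negative log-likelihood above treats each frequency bin as a diagonal Gaussian and combines the per-bin log densities with the usual max-subtraction (log-sum-exp) trick for numerical stability. A NumPy sketch under the same shapes; the function name is hypothetical.

import numpy as np

def neg_log_likelihood(y, mus, sig):
    # y, mus, sig: (batch, time, k); log N(y; mu, sig) per component,
    # then log-sum-exp over the component axis, summed over batch and time.
    inside_log = -0.5 * ((y - mus) ** 2) / sig ** 2 - np.log(np.sqrt(2.0 * np.pi) * sig)
    m = inside_log.max(axis=2, keepdims=True)
    return -(m + np.log(np.exp(inside_log - m).sum(axis=2, keepdims=True))).sum()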
def RelationStackMaker(chips, params, graph=False, weighted=False, batched=False): if batched: emb_input = T.itensor3('emb_input') entities_tv = [ T.fmatrix('enidx_' + str(i)).astype(theano.config.floatX) for i in range(params['num_entity']) ] sample_weights = T.fvector('sample_weight') if graph: if weighted: masks = T.ftensor4('child_mask') else: masks = T.ftensor3('child_mask') else: masks = T.fmatrix('batch_mask') else: emb_input = T.imatrix('emb_input') entities_tv = [ T.fvector('enidx_' + str(i)).astype(theano.config.floatX) for i in range(params['num_entity']) ] sample_weights = T.fvector('sample_weight') if graph: if weighted: masks = T.ftensor3('child_mask') else: masks = T.fmatrix('child_mask') else: masks = None #print masks, type(masks), masks.ndim current_chip = Start(params['voc_size'], emb_input) print '\n', 'Building Stack now', '\n', 'Start: ', params[ 'voc_size'], 'out_tv dim:', current_chip.output_tv.ndim instantiated_chips = stackLayers(chips, current_chip, params, entity_size=params['num_entity']) regularizable_params = computeLayers(instantiated_chips, current_chip, params, entities_input=entities_tv, mask=masks, sample_weights=sample_weights) ### Debug use: Get the attention co-efficiency and visualize. ### for c in instantiated_chips: if c[1].endswith('Entity_Att'): assert hasattr(c[0], 'att_wt_arry') assert hasattr(c[0], 'entity_tvs') attention_weights = c[0].att_wt_arry entity_tvs = c[0].entity_tvs current_chip = instantiated_chips[-1][0] if current_chip.output_tv.ndim == 2: pred_y = current_chip.output_tv #T.argmax(current_chip.output_tv, axis=1) else: pred_y = current_chip.output_tv #T.argmax(current_chip.output_tv) #, axis=1) gold_y = (current_chip.gold_y if hasattr(current_chip, 'gold_y') else None) # Show all parameters that would be needed in this system params_needed = calculate_params_needed(instantiated_chips) print "Parameters Needed", params_needed for k in params_needed: assert k in params, k print k, params[k] assert hasattr(current_chip, 'score') cost = current_chip.score #/ params['nsentences'] cost_arr = [cost] for layer in instantiated_chips[:-1]: if hasattr(layer[0], 'score'): print layer[1] cost += params['cost_coef'] * layer[0].score cost_arr.append(params['cost_coef'] * layer[0].score) grads = T.grad(cost, wrt=regularizable_params) #[params[k] for k in params if (hasattr(params[k], 'is_regularizable') and params[k].is_regularizable)]) print 'Regularizable parameters:' for k, v in params.items(): if hasattr(v, 'is_regularizable'): print k, v, v.is_regularizable if graph or batched: #return (emb_input, masks, entities_tv, attention_weights, entity_tvs, gold_y, pred_y, cost, grads, regularizable_params) return (emb_input, masks, entities_tv, sample_weights, gold_y, pred_y, cost, grads, regularizable_params) else: return (emb_input, entities_tv, sample_weights, gold_y, pred_y, cost, grads, regularizable_params, sample_weights)
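# A minimal sketch of the cost-aggregation pattern used above: the top chip's score is the main
# cost and any intermediate chip that also exposes a score contributes with a small coefficient
# before the gradient is taken. The "scores" below are made-up stand-ins, not real chips.
import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')
W1 = theano.shared(np.ones(3, dtype='float32'), name='W1')
W2 = theano.shared(np.ones(3, dtype='float32'), name='W2')
cost_coef = 0.1

main_score = T.sum((W2 * x) ** 2)   # stands in for the top chip's score
aux_score = T.sum(abs(W1 * x))      # stands in for an intermediate chip's score
cost = main_score + cost_coef * aux_score

grads = T.grad(cost, wrt=[W1, W2])
f = theano.function([x], [cost] + grads)
print f(np.arange(3, dtype='float32'))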
logging.getLogger().addHandler(logging.StreamHandler())
logging.info('Experiment starts')
logging.info('Saving experiment data to %s', conf['out_path'])

logging.info("Setting random state...")
start_time = time.clock()
rng = conf['rng']
srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))
np.random.seed(conf['seed'])
logging.info("...done %f" % (time.clock() - start_time))

logging.info("Creating the model...")
start_time = time.clock()
x = T.ftensor3('x')
label = T.fvector('label')
#ldm_index = T.ivector('ldm_index')
disk = Tsp.csr_matrix('disk')
layer_begin = T.imatrix('layer_begin')
layer_end = T.imatrix('layer_end')
model = LSCNN(rng, conf['layers'], conf['drop'])
model.inputs = [x, label, disk, layer_begin, layer_end]
model.fwd_inputs = [x, label, disk, layer_begin, layer_end]
model.w_constraints = eval(conf['w_constraints'])
logging.info("...done %f" % (time.clock() - start_time))

logging.info("Checking if there is already a best model to load...")
start_time = time.clock()
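# Minimal sketch of the seeding pattern above: a numpy RandomState drives the seed of Theano's
# symbolic RandomStreams so the whole experiment is reproducible from one integer in the config.
# conf and LSCNN are not needed to see this part; the shapes below are arbitrary.
import numpy as np
import theano
from theano.tensor.shared_randomstreams import RandomStreams

rng = np.random.RandomState(1234)
srng = RandomStreams(rng.randint(999999))

noise = srng.uniform(size=(2, 3), dtype=theano.config.floatX)
sample = theano.function([], noise)
print sample()   # same values on every run with the same seed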
import theano
from theano import tensor
import numpy as np
import mkl_gru_op_v

x = tensor.ftensor3('x')
x_m = tensor.ftensor3('x_m')
h_init = tensor.fmatrix('h_init')
W_h = tensor.fmatrix('W_h')
W_hzr = tensor.fmatrix('W_hzr')
W_hh = tensor.fmatrix('W_hh')
W_x = tensor.fmatrix('W_x')
b = tensor.ftensor3('b')
o = mkl_gru_op_v.GRU(units=1000, timesteps=10, batch_size=80, input_dim=620)(x, x_m, h_init, W_h, W_x, b)
f = theano.function([x, x_m, h_init, W_h, W_x, b], o)

units = 1000
timesteps = 10
batch_size = 80
input_dim = 620
x = np.random.rand(timesteps, input_dim, batch_size).astype(np.float32)
x_m = np.random.rand(timesteps, units, batch_size).astype(np.float32) - np.random.rand(
    timesteps, units, batch_size).astype(np.float32)
h_init = np.random.rand(units, batch_size).astype(np.float32) - np.random.rand(
    units, batch_size).astype(np.float32)
W_x = np.random.rand(units * 3, input_dim).astype(np.float32) - np.random.rand(
    units * 3, input_dim).astype(np.float32)
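# Fused GRU ops like the one above are usually checked against a plain per-timestep reference.
# A minimal numpy reference for one GRU step, assuming the gate weights are stacked as
# [update; reset; candidate] the way W_x is allocated as (units*3, input_dim) above; the actual
# layout and gating convention expected by mkl_gru_op_v are assumptions here.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_step(x_t, h_prev, W_x, W_hzr, W_hh, b):
    """x_t: (input_dim, batch), h_prev: (units, batch)."""
    units = h_prev.shape[0]
    pre = np.dot(W_x, x_t) + b                                      # (units*3, batch)
    z = sigmoid(pre[:units] + np.dot(W_hzr[:units], h_prev))        # update gate
    r = sigmoid(pre[units:2 * units] + np.dot(W_hzr[units:], h_prev))  # reset gate
    h_tilde = np.tanh(pre[2 * units:] + np.dot(W_hh, r * h_prev))   # candidate state
    return (1.0 - z) * h_prev + z * h_tilde

units, input_dim, batch = 4, 6, 2
h = gru_step(np.random.randn(input_dim, batch),
             np.zeros((units, batch)),
             np.random.randn(units * 3, input_dim),
             np.random.randn(units * 2, units),
             np.random.randn(units, units),
             np.random.randn(units * 3, 1))
print h.shape   # (4, 2)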
sys.exit(1) if not os.path.exists(args.model): print('File not found: {}'.format(args.model)) sys.exit(1) if not os.path.exists(args.hmrnn_model): print('File not found: {}'.format(args.hmrnn_model)) sys.exit(1) print('Loading an hmrnn model') hmrnn = HMRNNModel(args) hmrnn.load(args.hmrnn_model) input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') ivector_data = None if args.use_ivector_input: ivector_data = T.ftensor3('ivector_data') network = build_deep_lstm(input_var=input_data, mask_var=input_mask, input_dim=args.input_dim, num_layers=args.num_layers, num_units=args.num_units, num_proj_units=args.num_proj_units, output_dim=args.output_dim, grad_clipping=args.grad_clipping, is_bidir=not args.uni,
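# Minimal sketch of how an optional i-vector stream can be appended to the main acoustic input
# along the feature axis before it reaches a network builder. Whether build_deep_lstm does
# exactly this internally is an assumption, as is the (time, batch, feature) layout used here.
import numpy as np
import theano
import theano.tensor as T

input_data = T.ftensor3('input_data')      # (time, batch, input_dim)
ivector_data = T.ftensor3('ivector_data')  # (time, batch, ivec_dim)

combined = T.concatenate([input_data, ivector_data], axis=2)
f = theano.function([input_data, ivector_data], combined)

a = np.zeros((5, 2, 40), dtype='float32')
b = np.zeros((5, 2, 10), dtype='float32')
print f(a, b).shape   # (5, 2, 50)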
def build_evpi_model(word_embeddings, len_voc, word_emb_dim, N, args, freeze=False): # input theano vars posts = T.imatrix() post_masks = T.fmatrix() ques_list = T.itensor3() ques_masks_list = T.ftensor3() ans_list = T.itensor3() ans_masks_list = T.ftensor3() labels = T.imatrix() utility_posts = T.imatrix() utility_post_masks = T.fmatrix() utility_labels = T.ivector() post_out, post_lstm_params = build_lstm_posts(posts, post_masks, args.post_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) ques_out, ques_lstm_params = build_lstm(ques_list, ques_masks_list, N, args.ques_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) ans_out, ans_lstm_params = build_lstm(ans_list, ans_masks_list, N, args.ans_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size) # pqa_preds = [None]*N # post_ques_ans = T.concatenate([post_out, ques_out[0], ans_out[0]], axis=1) # l_post_ques_ans_in = lasagne.layers.InputLayer(shape=(args.batch_size, 3*args.hidden_dim), input_var=post_ques_ans) # l_post_ques_ans_dense = lasagne.layers.DenseLayer(l_post_ques_ans_in, num_units=args.hidden_dim,\ # nonlinearity=lasagne.nonlinearities.rectify) # l_post_ques_ans_dense2 = lasagne.layers.DenseLayer(l_post_ques_ans_dense, num_units=1,\ # nonlinearity=lasagne.nonlinearities.sigmoid) # pqa_preds[0] = lasagne.layers.get_output(l_post_ques_ans_dense2) # loss = T.sum(lasagne.objectives.binary_crossentropy(pqa_preds[0], labels[:,0])) # for i in range(1, N): # post_ques_ans = T.concatenate([post_out, ques_out[i], ans_out[i]], axis=1) # l_post_ques_ans_in_ = lasagne.layers.InputLayer(shape=(args.batch_size, 3*args.hidden_dim), input_var=post_ques_ans) # l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_in_, num_units=args.hidden_dim,\ # nonlinearity=lasagne.nonlinearities.rectify,\ # W=l_post_ques_ans_dense.W,\ # b=l_post_ques_ans_dense.b) # l_post_ques_ans_dense2_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=1,\ # nonlinearity=lasagne.nonlinearities.sigmoid,\ # W=l_post_ques_ans_dense2.W,\ # b=l_post_ques_ans_dense2.b) # pqa_preds[i] = lasagne.layers.get_output(l_post_ques_ans_dense2_) # loss += T.sum(lasagne.objectives.binary_crossentropy(pqa_preds[i], labels[:,i])) # # post_ques_ans_dense2_params = lasagne.layers.get_all_params(l_post_ques_ans_dense2, trainable=True) pq_out = [None] * N post_ques = T.concatenate([post_out, ques_out[0]], axis=1) l_post_ques_in = lasagne.layers.InputLayer(shape=(args.batch_size, 2 * args.hidden_dim), input_var=post_ques) l_post_ques_dense = lasagne.layers.DenseLayer(l_post_ques_in, num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify) l_post_ques_dense2 = lasagne.layers.DenseLayer(l_post_ques_dense, num_units=1,\ nonlinearity=lasagne.nonlinearities.sigmoid) pq_out[0] = lasagne.layers.get_output(l_post_ques_dense2) for i in range(1, N): post_ques = T.concatenate([post_out, ques_out[i]], axis=1) l_post_ques_in_ = lasagne.layers.InputLayer(shape=(args.batch_size, 2 * args.hidden_dim), input_var=post_ques) l_post_ques_dense_ = lasagne.layers.DenseLayer(l_post_ques_in_, num_units=args.hidden_dim,\ nonlinearity=lasagne.nonlinearities.rectify,\ W=l_post_ques_dense.W,\ b=l_post_ques_dense.b) l_post_ques_dense2_ = lasagne.layers.DenseLayer(l_post_ques_dense_, num_units=1,\ nonlinearity=lasagne.nonlinearities.sigmoid,\ W=l_post_ques_dense2.W,\ b=l_post_ques_dense2.b) pq_out[i] = lasagne.layers.get_output(l_post_ques_dense2_) post_ques_dense2_params = 
lasagne.layers.get_all_params(l_post_ques_dense2, trainable=True) all_sq_errors = [None] * (N * N) loss = 0.0 for i in range(N): for j in range(N): all_sq_errors[i * N + j] = T.sum(lasagne.objectives.squared_error( pq_out[i], ans_out[j]), axis=1) loss += T.sum( lasagne.objectives.squared_error(pq_out[i], ans_out[j]) * labels[:, i, None]) utility_preds, utility_post_ans_preds, utility_params = build_utility_lstm(utility_posts, utility_post_masks, \ posts, post_masks, ans_list, ans_masks_list, \ N, args.post_max_len, args.ans_max_len, \ word_embeddings, word_emb_dim, args.hidden_dim, len_voc) utility_loss = T.sum( lasagne.objectives.binary_crossentropy(utility_preds, utility_labels)) # for i in range(N): # loss += T.sum(lasagne.objectives.binary_crossentropy(utility_post_ans_preds[i], labels[:,i])) #all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_ans_dense2_params + utility_params all_params = post_lstm_params + ques_lstm_params + post_ques_dense2_params loss += args.rho * sum(T.sum(l**2) for l in all_params) utility_loss += args.rho * sum(T.sum(l**2) for l in utility_params) updates = lasagne.updates.adam(loss, all_params, learning_rate=args.learning_rate) utility_updates = lasagne.updates.adam(utility_loss, utility_params, learning_rate=args.learning_rate) train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ [loss] + utility_post_ans_preds + all_sq_errors, updates=updates) dev_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ [loss] + utility_post_ans_preds + all_sq_errors,) # train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ # [loss] + pqa_preds + utility_post_ans_preds, updates=updates) # dev_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \ # [loss] + pqa_preds + utility_post_ans_preds,) utility_train_fn = theano.function([utility_posts, utility_post_masks, utility_labels], \ [utility_preds, utility_loss], updates=utility_updates) utility_dev_fn = theano.function([utility_posts, utility_post_masks, utility_labels], \ [utility_preds, utility_loss],) return train_fn, dev_fn, utility_train_fn, utility_dev_fn
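# The loop over candidates above re-instantiates the same dense stack for every question while
# tying the parameters through W=l_post_ques_dense.W and b=l_post_ques_dense.b. A self-contained
# sketch of that tying pattern with made-up sizes (the EVPI-specific inputs are not needed):
import lasagne

l_in1 = lasagne.layers.InputLayer(shape=(None, 6))
l_dense1 = lasagne.layers.DenseLayer(l_in1, num_units=4,
                                     nonlinearity=lasagne.nonlinearities.rectify)

# second branch reuses the first branch's shared variables, so there is one set of weights
l_in2 = lasagne.layers.InputLayer(shape=(None, 6))
l_dense2 = lasagne.layers.DenseLayer(l_in2, num_units=4,
                                     nonlinearity=lasagne.nonlinearities.rectify,
                                     W=l_dense1.W, b=l_dense1.b)

params = lasagne.layers.get_all_params(l_dense1, trainable=True)
assert l_dense1.W is l_dense2.W and l_dense1.b is l_dense2.b
print len(params)   # 2: W and b counted once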
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train( sent_len, claim_len, cand_size) train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev( sent_len, claim_len, cand_size, word2id) test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test( sent_len, claim_len, cand_size, word2id) dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) train_sents = np.asarray(train_sents, dtype='int32') train_3th_sents = np.asarray(train_3th_sents, dtype='int32') joint_train_sents = np.concatenate((train_sents, train_3th_sents)) test_sents = np.asarray(test_sents, dtype='int32') test_3th_sents = np.asarray(test_3th_sents, dtype='int32') joint_test_sents = np.concatenate((test_sents, test_3th_sents)) dev_sents = np.asarray(dev_sents, dtype='int32') dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32') joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents)) train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX) train_3th_sent_masks = np.asarray(train_3th_sent_masks, dtype=theano.config.floatX) joint_train_sent_masks = np.concatenate( (train_sent_masks, train_3th_sent_masks)) test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX) test_3th_sent_masks = np.asarray(test_3th_sent_masks, dtype=theano.config.floatX) joint_test_sent_masks = np.concatenate( (test_sent_masks, test_3th_sent_masks)) dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX) dev_3th_sent_masks = np.asarray(dev_3th_sent_masks, dtype=theano.config.floatX) joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks)) train_sent_labels = np.asarray(train_sent_labels, dtype='int32') train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32') joint_train_sent_labels = np.concatenate( (train_sent_labels, train_3th_sent_labels)) test_sent_labels = np.asarray(test_sent_labels, dtype='int32') test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32') joint_test_sent_labels = np.concatenate( (test_sent_labels, test_3th_sent_labels)) dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32') dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32') joint_dev_sent_labels = np.concatenate( (dev_sent_labels, dev_3th_sent_labels)) train_claims = 
np.asarray(train_claims, dtype='int32') train_3th_claims = np.asarray(train_3th_claims, dtype='int32') joint_train_claims = np.concatenate((train_claims, train_3th_claims)) test_claims = np.asarray(test_claims, dtype='int32') test_3th_claims = np.asarray(test_3th_claims, dtype='int32') joint_test_claims = np.concatenate((test_claims, test_3th_claims)) dev_claims = np.asarray(dev_claims, dtype='int32') dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32') joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims)) train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX) train_3th_claim_mask = np.asarray(train_3th_claim_mask, dtype=theano.config.floatX) joint_train_claim_mask = np.concatenate( (train_claim_mask, train_3th_claim_mask)) test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX) test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=theano.config.floatX) joint_test_claim_mask = np.concatenate( (test_claim_mask, test_3th_claim_mask)) dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX) dev_3th_claim_mask = np.asarray(dev_3th_claim_mask, dtype=theano.config.floatX) joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask)) train_labels = np.asarray(train_labels, dtype='int32') train_3th_labels = np.asarray(train_3th_labels, dtype='int32') joint_train_labels = np.concatenate((train_labels, train_3th_labels)) test_labels = np.asarray(test_labels, dtype='int32') test_3th_labels = np.asarray(test_3th_labels, dtype='int32') joint_test_labels = np.concatenate((test_labels, test_3th_labels)) dev_labels = np.asarray(dev_labels, dtype='int32') dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32') joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels)) joint_train_size = len(joint_train_claims) joint_test_size = len(joint_test_claims) joint_dev_size = len(joint_dev_claims) train_size = len(train_claims) test_size = len(test_claims) dev_size = len(dev_claims) test_3th_size = len(test_3th_claims) dev_3th_size = len(dev_3th_claims) vocab_size = len(word2id) + 1 print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.itensor3() #(batch, cand_size, sent_len) sents_mask = T.ftensor3() sents_labels = T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() joint_sents_ids = T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask = T.ftensor3() joint_sents_labels = T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_att_conv_W, task1_att_conv_b = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) task1_conv_W_context, task1_conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W, att_conv_b, task1_conv_W_context, conv_W_context ] conv_model_sents = Conv_with_Mask( rng, input_tensor3=embed_input_sents, mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_sent_emb = sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask( rng, input_tensor3=embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings = conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1) ''' attentive conv for task1 ''' task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= embed_input_sents, #batch_size*cand_size, emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=task1_att_conv_W, b=task1_att_conv_b, W_context=task1_conv_W_context, b_context=task1_conv_b_context) task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r concate_claim_sent = T.concatenate([ batch_claim_emb, batch_sent_emb, T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x') ], axis=2) concate_2_matrix = concate_claim_sent.reshape( (batch_size * cand_size, hidden_size[0] * 2 + 1)) LR_input = T.concatenate([ concate_2_matrix, task1_attentive_sent_embeddings_l, 
task1_attentive_sent_embeddings_r ], axis=1) LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2 # LR_input = concate_2_matrix # LR_input_size = hidden_size[0]*2+1 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para = [U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(LR_input.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) # loss = -T.mean(T.log(prob_pos)) #f1 as loss batch_overlap = T.sum(sents_labels * inter_matrix, axis=1) batch_recall = batch_overlap / T.sum(sents_labels, axis=1) batch_precision = batch_overlap / T.sum(inter_matrix, axis=1) batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision) loss = -T.mean(T.log(batch_f1)) # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean() ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[ joint_claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_sent_embeddings = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_batch_sent_emb = joint_sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) joint_premise_emb = T.sum(joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=joint_claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 
1).reshape( (batch_size, cand_size * sent_len, emb_size)) joint_sents_dot = T.batched_dot( joint_sents_tensor3, joint_sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) joint_sents_dot_2_matrix = T.nnet.softmax( joint_sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) joint_sents_context = T.batched_dot( joint_sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0), mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle( 0, 1, 'x') masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle( 0, 1, 'x') fine_max = T.concatenate([ T.max(masked_sents_attconv, axis=1), T.max(masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) "Logistic Regression layer" joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector joint_loss = joint_layer_LR.negative_log_likelihood( joint_labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
''' testing ''' # binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size) test_premise_emb = T.sum(batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1) test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings], axis=1) #fine-maxsum sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) sents_dot_2_matrix = T.nnet.softmax( sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) sents_context = T.batched_dot( sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) add_sents_context = embed_input_sents + sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_fine_max = T.concatenate([ T.max(test_masked_sents_attconv, axis=1), T.max(test_masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1) test_LR_input_size = joint_LR_input_size test_layer_LR = LogisticRegression( rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector params = [init_embeddings] + NN_para + LR_para + joint_LR_para cost = loss + joint_loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids, 
joint_claim_mask, joint_labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False joint_n_train_batches = joint_train_size / batch_size joint_train_batch_start = list( np.arange(joint_n_train_batches) * batch_size) + [joint_train_size - batch_size] n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] n_test_3th_batches = test_3th_size / batch_size test_3th_batch_start = list(np.arange(n_test_3th_batches) * batch_size) + [test_3th_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_dev_3th_batches = dev_3th_size / batch_size dev_3th_batch_start = list(np.arange(n_dev_3th_batches) * batch_size) + [dev_3th_size - batch_size] max_acc = 0.0 max_test_f1 = 0.0 max_test_acc = 0.0 cost_i = 0.0 joint_train_indices = range(joint_train_size) train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( joint_train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed random.Random(100).shuffle(train_indices) iter_accu = 0 for joint_batch_id in joint_train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1 iter_accu += 1 joint_train_id_batch = joint_train_indices[ joint_batch_id:joint_batch_id + batch_size] for i in range(3): batch_id = random.choice(train_batch_start) train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch], #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels joint_train_sents[joint_train_id_batch], joint_train_sent_masks[joint_train_id_batch], joint_train_sent_labels[joint_train_id_batch], joint_train_claims[joint_train_id_batch], joint_train_claim_mask[joint_train_id_batch], joint_train_labels[joint_train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for test_batch_id in test_batch_start: # for each test batch batch_prob, error_i, pred_i = test_model( 
test_sents[test_batch_id:test_batch_id + batch_size], test_sent_masks[test_batch_id:test_batch_id + batch_size], test_sent_labels[test_batch_id:test_batch_id + batch_size], test_claims[test_batch_id:test_batch_id + batch_size], test_claim_mask[test_batch_id:test_batch_id + batch_size], test_labels[test_batch_id:test_batch_id + batch_size]) error_sum += error_i batch_sent_labels = test_sent_labels[ test_batch_id:test_batch_id + batch_size] batch_sent_names = test_sent_names[ test_batch_id:test_batch_id + batch_size] batch_ground_names = test_ground_names[ test_batch_id:test_batch_id + batch_size] batch_ground_labels = test_labels[ test_batch_id:test_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 # test_f1=f1_sum/(len(test_batch_start)*batch_size) for test_batch_id in test_3th_batch_start: # for each test batch _, error_i, pred_i = test_model( test_3th_sents[test_batch_id:test_batch_id + batch_size], test_3th_sent_masks[test_batch_id:test_batch_id + batch_size], test_3th_sent_labels[test_batch_id:test_batch_id + batch_size], test_3th_claims[test_batch_id:test_batch_id + batch_size], test_3th_claim_mask[test_batch_id:test_batch_id + batch_size], test_3th_labels[test_batch_id:test_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 if f1 > max_test_f1 or strict_score > max_test_acc: if f1 > max_test_f1: max_test_f1 = f1 if strict_score > max_test_acc: max_test_acc = strict_score #test print '....................\n' f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] fine_grained_sent_predictions = { 1: [], 2: [], 3: [], 4: [], 5: [] } fine_grained_page_predictions = { 1: [], 2: [], 3: [], 4: [] } for dev_batch_id in dev_batch_start: # for each test batch batch_prob, error_i, pred_i = dev_model( dev_sents[dev_batch_id:dev_batch_id + batch_size], dev_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_sent_labels[dev_batch_id:dev_batch_id + batch_size], 
dev_claims[dev_batch_id:dev_batch_id + batch_size], dev_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_labels[dev_batch_id:dev_batch_id + batch_size]) error_sum += error_i batch_sent_labels = dev_sent_labels[ dev_batch_id:dev_batch_id + batch_size] batch_sent_names = dev_sent_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_names = dev_ground_names[ dev_batch_id:dev_batch_id + batch_size] batch_ground_labels = dev_labels[ dev_batch_id:dev_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) evi_sent_size, evi_page_size = count_sent_page( gold_sent_names) fine_grained_sent_predictions.get( evi_sent_size).append(instance_i) fine_grained_page_predictions.get( evi_page_size).append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 print '......sent...\n' for i in range(1, 6): predictions_i = fine_grained_sent_predictions.get(i) if len(predictions_i) > 0: strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions_i) print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1 else: print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0 print '......page...\n' for i in range(1, 5): predictions_i = fine_grained_page_predictions.get(i) if len(predictions_i) > 0: strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions_i) print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1 else: print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0 for dev_batch_id in dev_3th_batch_start: # for each test batch _, error_i, pred_i = dev_model( dev_3th_sents[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_masks[dev_batch_id:dev_batch_id + batch_size], dev_3th_sent_labels[dev_batch_id:dev_batch_id + batch_size], dev_3th_claims[dev_batch_id:dev_batch_id + batch_size], dev_3th_claim_mask[dev_batch_id:dev_batch_id + batch_size], dev_3th_labels[dev_batch_id:dev_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, 
precision, recall, f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_test_acc
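# The evidence-selection branch above is trained with -log of a soft batch F1 rather than
# cross-entropy. A standalone version of that loss, where inter_matrix stands for the sigmoid
# scores over candidate sentences and sents_labels for the 0/1 gold evidence mask:
import numpy as np
import theano
import theano.tensor as T

inter_matrix = T.fmatrix('inter_matrix')   # (batch, cand_size), scores in (0, 1)
sents_labels = T.fmatrix('sents_labels')   # (batch, cand_size), 0/1 gold evidence

overlap = T.sum(sents_labels * inter_matrix, axis=1)
recall = overlap / T.sum(sents_labels, axis=1)
precision = overlap / T.sum(inter_matrix, axis=1)
f1 = 2.0 * recall * precision / (recall + precision)
loss = -T.mean(T.log(f1))

f = theano.function([inter_matrix, sents_labels], [f1, loss])

scores = np.array([[0.9, 0.1, 0.8], [0.2, 0.7, 0.1]], dtype='float32')
gold = np.array([[1, 0, 1], [0, 1, 0]], dtype='float32')
print f(scores, gold)   # the loss shrinks as the scores line up with the gold evidence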
lstm_b[42:63] = (5. * np.ones((21, ))).astype(theano.config.floatX) lstm_b[63:] = bh rat = np.zeros((21, )) for i in range(21): rat[i] = 1. * sum(y1.flatten() == i) / y1.size print rat # endregion # region Build Model print 'Building Model...' np.random.seed(seed=123) tX = T.ftensor3('tX') tH = T.fmatrix('tH') tC = T.fmatrix('tC') tm = T.fmatrix('tm') ty = T.imatrix('ty') classifier = LSTM_A(seqs=tX, h0s=tH, c0s=tC, masks=tm, dim_x=909, dim_h=21, dim_y=21, wt_y=wt_y, lstm_W=lstm_W, lstm_U=lstm_U,
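# Minimal sketch of the class-frequency statistic computed above (rat) and the kind of
# inverse-frequency weight vector (wt_y) it is typically turned into; the exact weighting
# actually consumed by LSTM_A is an assumption here, and y1 below is dummy data.
import numpy as np

y1 = np.random.randint(0, 21, size=(50, 30))   # dummy frame labels
rat = np.array([1.0 * np.sum(y1 == i) / y1.size for i in range(21)])
print rat.sum()                                # class frequencies sum to 1

wt_y = 1.0 / np.maximum(rat, 1e-6)             # inverse frequency
wt_y /= wt_y.sum()                             # normalized class weights
print wt_y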
def main(options): print 'Build and compile network' input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network = build_network(input_data=input_data, input_mask=input_mask, num_inputs=options['num_inputs'], num_units_list=options['num_units_list'], num_outputs=options['num_outputs'], dropout_ratio=options['dropout_ratio'], weight_noise=options['weight_noise'], use_layer_norm=options['use_layer_norm'], peepholes=options['peepholes'], learn_init=options['learn_init'], grad_clipping=options['grad_clipping'], gradient_steps=options['gradient_steps'], use_projection=options['use_projection']) network_params = get_all_params(network, trainable=True) print("number of parameters in model: %d" % count_params(network, trainable=True)) if options['reload_model']: print('Loading Parameters...') pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt = pickle.load(open(options['reload_model'], 'rb')) print('Applying Parameters...') set_model_param_value(network_params, pretrain_network_params_val) else: pretrain_update_params_val = None pretrain_total_batch_cnt = 0 print 'Build network trainer' training_fn, trainer_params = set_network_trainer(input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network, updater=options['updater'], learning_rate=options['lr'], grad_max_norm=options['grad_norm'], l2_lambda=options['l2_lambda'], load_updater_params=pretrain_update_params_val) print 'Build network predictor' predict_fn = set_network_predictor(input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network) print 'Load data stream' train_datastream = get_datastream(path=options['data_path'], which_set='train_si84', batch_size=options['batch_size']) print 'Start training' if os.path.exists(options['save_path'] + '_eval_history.npz'): evaluation_history = numpy.load(options['save_path'] + '_eval_history.npz')['eval_history'].tolist() else: evaluation_history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]] early_stop_flag = False early_stop_cnt = 0 total_batch_cnt = 0 try: # for each epoch for e_idx in range(options['num_epochs']): # for each batch for b_idx, data in enumerate(train_datastream.get_epoch_iterator()): total_batch_cnt += 1 if pretrain_total_batch_cnt>=total_batch_cnt: continue # get input, target data input_data = data[0].astype(floatX) input_mask = data[1].astype(floatX) # get target data target_data = data[2] target_mask = data[3].astype(floatX) # get output train_output = training_fn(input_data, input_mask, target_data, target_mask) train_predict_cost = train_output[0] network_grads_norm = train_output[1] # show intermediate result if total_batch_cnt%options['train_disp_freq'] == 0 and total_batch_cnt!=0: best_idx = numpy.asarray(evaluation_history)[:, 1, 2].argmin() print '============================================================================================' print 'Model Name: ', options['save_path'].split('/')[-1] print '============================================================================================' print 'Epoch: ', str(e_idx), ', Update: ', str(total_batch_cnt) print '--------------------------------------------------------------------------------------------' print 'Prediction Cost: ', str(train_predict_cost) print 'Gradient Norm: ', str(network_grads_norm) print 
'--------------------------------------------------------------------------------------------' print 'Train NLL: ', str(evaluation_history[-1][0][0]), ', BPC: ', str(evaluation_history[-1][0][1]), ', FER: ', str(evaluation_history[-1][0][2]) print 'Valid NLL: ', str(evaluation_history[-1][1][0]), ', BPC: ', str(evaluation_history[-1][1][1]), ', FER: ', str(evaluation_history[-1][1][2]) print '--------------------------------------------------------------------------------------------' print 'Best NLL: ', str(evaluation_history[best_idx][1][0]), ', BPC: ', str(evaluation_history[best_idx][1][1]), ', FER: ', str(evaluation_history[best_idx][1][2]) # evaluation if total_batch_cnt%options['train_eval_freq'] == 0 and total_batch_cnt!=0: train_eval_datastream = get_datastream(path=options['data_path'], which_set='train_si84', batch_size=options['eval_batch_size']) valid_eval_datastream = get_datastream(path=options['data_path'], which_set='test_dev93', batch_size=options['eval_batch_size']) train_nll, train_bpc, train_fer = network_evaluation(predict_fn, train_eval_datastream) valid_nll, valid_bpc, valid_fer = network_evaluation(predict_fn, valid_eval_datastream) # check over-fitting if valid_fer>numpy.asarray(evaluation_history)[:, 1, 2].min(): early_stop_cnt += 1. else: early_stop_cnt = 0. best_network_params_vals = get_model_param_values(network_params) pickle.dump(best_network_params_vals, open(options['save_path'] + '_best_model.pkl', 'wb')) if early_stop_cnt>10: early_stop_flag = True break # save results evaluation_history.append([[train_nll, train_bpc, train_fer], [valid_nll, valid_bpc, valid_fer]]) numpy.savez(options['save_path'] + '_eval_history', eval_history=evaluation_history) # save network if total_batch_cnt%options['train_save_freq'] == 0 and total_batch_cnt!=0: cur_network_params_val = get_model_param_values(network_params) cur_trainer_params_val = get_update_params_values(trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt], open(options['save_path'] + '_last_model.pkl', 'wb')) if early_stop_flag: break except KeyboardInterrupt: print 'Training Interrupted' cur_network_params_val = get_model_param_values(network_params) cur_trainer_params_val = get_update_params_values(trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt], open(options['save_path'] + '_last_model.pkl', 'wb'))
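# Minimal sketch of the checkpointing convention used above: the best model and the last model
# are pickled separately, and training can later resume from the "last" file together with the
# updater state and batch counter. Parameter extraction is shown with plain get_value()/set_value()
# instead of the get_model_param_values/set_model_param_value helpers from the script, and the
# file name is only illustrative.
import pickle
import numpy as np
import theano

W = theano.shared(np.zeros((3, 3), dtype='float32'), name='W')
b = theano.shared(np.zeros(3, dtype='float32'), name='b')
network_params = [W, b]
total_batch_cnt = 123

param_values = [p.get_value() for p in network_params]
with open('last_model.pkl', 'wb') as fh:
    pickle.dump([param_values, None, total_batch_cnt], fh)

with open('last_model.pkl', 'rb') as fh:
    loaded_values, _, loaded_cnt = pickle.load(fh)
for p, v in zip(network_params, loaded_values):
    p.set_value(v)
print loaded_cnt   # 123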
def __init__(self, We, params): lstm_layers_num = 1 emb_size = We.shape[1] self.eta = params.eta self.num_labels = params.num_labels self.en_hidden_size = params.en_hidden_size self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = params.lstm_layers_num self._train = None self._utter = None self.params = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] self.hos = [] self.Cos = [] encoderInputs = tensor.imatrix() decoderInputs, decoderTarget = tensor.imatrices(2) encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4) self.lookuptable = theano.shared(We) #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform(self.num_labels +1, self.de_hidden_size), borrow=True) self.linear = theano.shared(name="Linear", value = init_xavier_uniform(self.de_hidden_size+2*self.en_hidden_size, self.num_labels), borrow= True) self.linear_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, )*0., dtype=theano.config.floatX), borrow=True) #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True) #self.hidden_bias = theano.shared( # name="Hidden to Bias", # value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) , # borrow=True # ) #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias] #concatenate self.params += [self.linear, self.linear_bias , self.de_lookuptable] #the initial hidden state of decoder lstm is zeros #(max_sent_size, batch_size, hidden_size) state_below = self.lookuptable[encoderInputs.flatten()].reshape((encoderInputs.shape[0], encoderInputs.shape[1], emb_size)) for _ in range(self.lstm_layers_num): enclstm_f = LSTM(emb_size, self.en_hidden_size) enclstm_b = LSTM(emb_size, self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask) hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask) hs = tensor.concatenate([hs_f, hs_b], axis=2) Cs = tensor.concatenate([Cs_f, Cs_b], axis=2) hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1) Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), state_below = hs Encoder = state_below ei, di, dt = tensor.imatrices(3) #place holders em, dm, tf, di0 =tensor.fmatrices(4) self.encoder_function = theano.function(inputs=[ei, em], outputs=Encoder, givens={encoderInputs:ei, encoderMask:em}) ##################################################### ##################################################### state_below = self.de_lookuptable[decoderInputs.flatten()].reshape((decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size, self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, 
decoderMask, ho, Co) ##### Here we include the representation from the decoder decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2) linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :] softmax_outputs, _ = theano.scan( fn=lambda x: tensor.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y]) costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask]) loss = costs.sum() / decoderMask.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params) updates = lasagne.updates.adam(loss, self.params, self.eta) #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9) ################################################### #### using the ground truth when training ################################################## self._train = theano.function( inputs=[ei, em, di, dm, dt], outputs=[loss, softmax_outputs], updates=updates, givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt} ) ######################################################################### ### For schedule sampling ######################################################################### ###### always use privous predict as next input def _step2(ctx_, state_, hs_, Cs_): ### ctx_: b x h ### state_ : b x h ### hs_ : 1 x b x h the first dimension is the number of the decoder layers ### Cs_ : 1 x b x h the first dimension is the number of the decoder layers hs, Cs = [], [] token_idxs = tensor.cast(state_.argmax(axis=-1), "int32" ) msk_ = tensor.fill( (tensor.zeros_like(token_idxs, dtype="float32")), 1) msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape((1, ctx_.shape[0], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs) state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size)) state_below0 = tensor.concatenate([ctx_, state_below0], axis =1) newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :] state_below = tensor.nnet.softmax(newpred) ##### the beging symbole probablity is 0 extra_p = tensor.zeros_like(hs[:,:,0]) state_below = tensor.concatenate([state_below, extra_p.T], axis=1) return state_below, hs, Cs ctx_0, state_0 =tensor.fmatrices(2) hs_0 = tensor.ftensor3() Cs_0 = tensor.ftensor3() state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0) self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0], [state_below_tmp, hs_tmp, Cs_tmp], name='f_next') hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan( fn=_step2, sequences= [Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=encoderInputs.shape[0] ) train_predict = train_outputs[0] train_costs, _ = theano.scan(fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask]) train_loss = train_costs.sum() / decoderMask.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params) #from adam import adam #train_updates = adam(train_loss, self.params, self.eta) #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta) #train_updates = 
lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) from momentum import momentum train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9) self._train2 = theano.function( inputs=[ei, em, di0, dm, dt], outputs=[train_loss, train_predict], updates=train_updates, givens={encoderInputs:ei, encoderMask:em, decoderInputs0:di0, decoderMask:dm, decoderTarget:dt} #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf} ) listof_token_idx = train_predict.argmax(axis=-1) self._utter = theano.function( inputs=[ei, em, di0], outputs=listof_token_idx, givens={encoderInputs:ei, encoderMask:em, decoderInputs0:di0} )
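# The decoder loss above (_NLL) is a masked negative log likelihood: each position contributes
# -mask * log p(correct token) and the sum is normalized by the number of unmasked positions.
# A standalone sketch for a single-timestep batch, with pred assumed already softmax-normalized:
import numpy as np
import theano
import theano.tensor as T

pred = T.fmatrix('pred')   # (batch, n_labels), rows sum to 1
y = T.ivector('y')         # gold token ids
m = T.fvector('m')         # 1 for real positions, 0 for padding

nll = -m * T.log(pred[T.arange(y.shape[0]), y])
loss = nll.sum() / m.sum()

f = theano.function([pred, y, m], loss)

p = np.full((3, 5), 0.2, dtype='float32')
print f(p, np.array([0, 1, 2], dtype='int32'),
        np.array([1, 1, 0], dtype='float32'))   # = -log(0.2); the padded row is ignored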
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # T x batch_size
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']), dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']], axis=0)
    input_emb = w_emb_extend[input_idx]

    # get the transformed image feature
    h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))

    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    h_from_lstm, c_encode = lstm_layer(shared_params, input_emb, input_mask,
                                       h_0, c_0, options, prefix='sent_lstm')
    # pick the last one as encoder
    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act', 'tanh'))
    r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    h_encode = wbw_attention_layer(shared_params, image_feat_down, h_from_lstm,
                                   input_mask, r_0, options,
                                   prefix='wbw_attention')
    h_encode = h_encode[-1]

    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=options.get('image_att_mlp_act', 'tanh'))
    h_encode_attention_1 = fflayer(shared_params, h_encode, options,
                                   prefix='sent_att_mlp_1',
                                   act_func=options.get('sent_att_mlp_act', 'tanh'))
    combined_feat_attention_1 = image_feat_attention_1 + \
        h_encode_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=options.get('combined_att_mlp_act', 'tanh'))
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])

    image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1)

    combined_hidden_1 = image_feat_ave_1 + h_encode

    # second layer attention model
    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=options.get('image_att_mlp_act', 'tanh'))
    h_encode_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                   prefix='sent_att_mlp_2',
                                   act_func=options.get('sent_att_mlp_act', 'tanh'))
    combined_feat_attention_2 = image_feat_attention_2 + \
        h_encode_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=options.get('combined_att_mlp_act', 'tanh'))
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])

    image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + h_encode
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1

    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d' % (i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng, drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d' % (i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d' % (i),
                                      act_func=options.get('combined_mlp_act_%d' % (i), 'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu
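# The attention pooling used twice in build_model (softmax over image regions,
# then an attention-weighted average of the region features) can be checked in
# isolation. This is a minimal sketch: region_feat, att_score, f_att and the toy
# shapes are illustrative names, not part of the original model.
import numpy as np
import theano
import theano.tensor as T

region_feat = T.ftensor3('region_feat')  # (batch, n_regions, n_dim) image features
att_score = T.fmatrix('att_score')       # (batch, n_regions) unnormalized attention scores

# softmax over regions, then weighted average of region features
prob_attention = T.nnet.softmax(att_score)
feat_ave = (prob_attention[:, :, None] * region_feat).sum(axis=1)

f_att = theano.function([region_feat, att_score], feat_ave)

batch, n_regions, n_dim = 2, 7, 4
feat_val = np.random.rand(batch, n_regions, n_dim).astype('float32')
score_val = np.random.rand(batch, n_regions).astype('float32')
print(f_att(feat_val, score_val).shape)  # (2, 4)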
import lasagne
from recognizer import Recognizer

BATCH_SIZE = 16
EMB_DIM = 256
SPEAKER_DIM = 128
ENC_DIM = 128
V = 43 + 1
NB_EPOCHS = 40
N_SPEAKERS = 21 + 1
OUTPUT_DIM = 63
LR = 0.001
SAVE_FILE_NAME = 'blizzard_cnn_mapper.pkl'
WEIGHTNORM = True

X = T.ftensor3()
mask = T.fmatrix()
ctx = T.imatrix()
learn_rate = T.fscalar()

def RecurrentMapper(ctx):
    emb_ctx = lib.ops.Embedding('Mapper.Generator.Embedding_Context', V, ENC_DIM, ctx)
    batch_size = T.shape(ctx)[0]
    seq_len = T.shape(ctx)[1]

    out = lib.ops.BiGRU('Mapper.Generator.BiGRU', emb_ctx, ENC_DIM, 256)
    readout = lib.ops.Linear('Mapper.Generator.FC', out, 512, EMB_DIM)
    return readout
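# lib.ops.Embedding above is repo-specific; the lookup it performs can be
# approximated with a plain shared matrix indexed by an integer matrix. This is
# only a sketch: V_toy, ENC_DIM_toy, emb_matrix and f_emb are hypothetical names
# with toy sizes, not the constants defined above.
import numpy as np
import theano
import theano.tensor as T

V_toy, ENC_DIM_toy = 44, 8
emb_matrix = theano.shared(
    np.random.randn(V_toy, ENC_DIM_toy).astype('float32'), name='emb_matrix')

ctx_toy = T.imatrix('ctx')     # (batch, seq_len) integer context ids
emb_ctx = emb_matrix[ctx_toy]  # (batch, seq_len, ENC_DIM_toy) embedded context

f_emb = theano.function([ctx_toy], emb_ctx)
ids = np.random.randint(0, V_toy, size=(3, 5)).astype('int32')
print(f_emb(ids).shape)        # (3, 5, 8)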
# Constants
num_train = 5000
num_test = 500
arch_size = [None, 110, 2]

# Set filename
if args.exp == 'task2':
    comb_filename = '{}_task2_{}_bn_{}_{}'.format(args.filename, args.model_type, args.batch_norm, args.seed)
else:
    comb_filename = '{}_task1_{}_bn_{}_reg_samp_{}_samp_res_{}_{}'.format(args.filename, args.model_type, args.batch_norm, args.sample_regularly, args.sample_res, args.seed)
if args.run_id != '':
    comb_filename += '_{}'.format(args.run_id)

# Create symbolic vars
input_var = T.ftensor3('my_input_var')
mask_var = T.bmatrix('my_mask')
target_var = T.ivector('my_targets')
time_var = T.fmatrix('my_timevar')

# Build model
print("Building network ...")
# Get input dimensions
network = get_rnn(input_var, mask_var, time_var, arch_size, args.grad_clip,
                  bn=args.batch_norm, model_type=args.model_type)

# Instantiate log
log = defaultdict(list)
print("Built.")

# Resume if desired
if args.resume: