def random_search_gpu(
    modal_names, train_probs, val_probs,
    target_train, target_val, numpy_rng, n_iter=400):

    n_modal = train_probs.shape[0]
    n_cls = train_probs.shape[2]
    
    # sample random weights and normalize so the modalities sum to 1
    # for each class
    
    weight_samples = T.ftensor3('weight_samples')
    probs = T.ftensor3('probs')
    targets = T.ivector('targets')
    preds = T.argmax(
        T.sum(probs.dimshuffle('x',0,1,2) * weight_samples.dimshuffle(0,1,'x',2), axis=1),
        axis=2)
    accs = T.mean(T.eq(preds, targets.dimshuffle('x',0)), axis=1)
    best_index = T.argmax(accs)
    best_acc = accs[best_index]
    best_weights = weight_samples[best_index]
    print 'compiling function'
    fn = theano.function([weight_samples, probs, targets],
        [best_weights, best_index, best_acc])
    print 'done'
    weight_samples_np = numpy_rng.rand(n_iter, n_modal, n_cls).astype(np.float32)
    weight_samples_np /= weight_samples_np.sum(1)[:, None, :]
    
    return fn(weight_samples_np, val_probs, target_val)
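A minimal NumPy-only sketch (an illustration, not part of the original code) of what the compiled graph above evaluates: each of the n_iter weight samples fuses the per-modality class probabilities, and the sample with the best validation accuracy wins. Shapes assumed: val_probs is (n_modal, n_samples, n_cls), target_val is (n_samples,).

import numpy as np

def random_search_numpy(val_probs, target_val, numpy_rng, n_iter=400):
    n_modal, _, n_cls = val_probs.shape
    w = numpy_rng.rand(n_iter, n_modal, n_cls).astype(np.float32)
    w /= w.sum(1)[:, None, :]  # weights over modalities sum to 1 per class
    # fused[i] is the weighted combination of modalities for weight sample i
    fused = np.einsum('msc,imc->isc', val_probs, w)  # (n_iter, n_samples, n_cls)
    accs = (fused.argmax(-1) == target_val[None, :]).mean(axis=1)
    best = accs.argmax()
    return w[best], best, accs[best]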
Example #2
def test_pycuda_elemwise_kernel():
    x=T.fmatrix('x')
    y=T.fmatrix('y')
    f=theano.function([x,y],x+y, mode=mode_with_gpu)
    print f.maker.env.toposort()
    f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f2.maker.env.toposort()

    assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])

    val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
    val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
    #val1 = numpy.ones((5,5))
    #val2 = numpy.arange(25).reshape(5,5)
    assert (f(val1,val2) == f2(val1,val2)).all()
    print f(val1,val2)
    print f2(val1,val2)


    x3=T.ftensor3('x')
    y3=T.ftensor3('y')
    z3=T.ftensor3('z')

    f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f4.maker.env.toposort()
    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])

    val1 = numpy.random.rand(2,2,2).astype('float32')  # ftensor3 inputs must be float32
    print val1
    print f4(val1,val1,val1)
    assert numpy.allclose(f4(val1,val1,val1),val1*val1+val1)
Example #3
 def make_node(self, x, x2, x3, x4, x5):
     # Check that the Theano version has support for __props__.
     # The next line looks like it has a typo, but it is actually a way to
     # detect that the Theano version is recent enough to support the use
     # of __props__.
     assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
     x = tensor.as_tensor_variable(x)
     x2 = tensor.as_tensor_variable(x2)
     x3 = tensor.as_tensor_variable(x3)
     x4 = tensor.as_tensor_variable(x4)
     x5 = tensor.as_tensor_variable(x5)
     
     if prm.att_doc:
         if prm.compute_emb:
             td = tensor.itensor4().type()
         else:
             td = tensor.ftensor4().type()
         tm = tensor.ftensor3().type()
     else:
         if prm.compute_emb:
             td = tensor.itensor3().type()
         else:
             td = tensor.ftensor3().type()
         tm = tensor.fmatrix().type()
     return theano.Apply(self, [x,x2,x3,x4,x5], [td, tm, \
                                        tensor.fmatrix().type(), tensor.ivector().type()])
Example #4
def test_attention_dot_does_not_crash():
  Z = T.ftensor3('Z')
  B = T.ftensor3('B') #base
  W_re = T.fmatrix('W_re')
  W_att_quadr = T.fmatrix("W_att_quadr")
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')
  Y, H, d = LSTMCustomDotAttentionOpNoInplaceInstance(Z, c, y0, i, W_re, B, W_att_in, W_att_quadr)

  f = theano.function(inputs=[Z, B, c, y0, i, W_re, W_att_in, W_att_quadr], outputs=Y)

  n_B = 8
  n_T = 5
  n_batch = 4
  n_cells = 8
  numpy.random.seed(1234)
  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  B_val = numpy.random.ranf((n_B,n_batch,n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_quadr_val = numpy.eye(n_B).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  #i_val = numpy.ones((n_T, n_batch), dtype='int8')
  i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T

  Y_val = numpy.asarray(f(Z_val, B_val, c_val, y0_val, i_val, W_re_val, W_att_in_val, W_att_quadr_val))
  #print Y_val
  print("success")
Example #5
def weighting():
    from theano.tensor import TensorType

    x = T.ftensor3()
    # w = TensorType('float32', (False, False, True))()
    w = T.ftensor3()
    # z = T.dot(w, x)
    # y = T.addbroadcast(w, 2)
    # y = w.reshape([w.shape[0], w.shape[1]])
    y = T.flatten(w, 2)
    z = x * y


    f = theano.function(inputs=[x, w], outputs=z)
    input1 = np.arange(8).reshape([2, 2, 2]).astype('float32')
    input2 = np.array(
        [
            [
                [0.1], [0.2]
            ],
            [
                [0.2], [0.4]
            ]
        ]
    ).astype('float32')
    print input1, input1.shape
    print
    print input2, input2.shape
    print
    print f(input1, input2)
Example #6
    def _setup_vars(self, sparse_input):
        '''Setup Theano variables for our network.

        Parameters
        ----------
        sparse_input : bool
            Not used -- sparse inputs are not supported for recurrent networks.

        Returns
        -------
        vars : list of theano variables
            A list of the variables that this network requires as inputs.
        '''
        _warn_dimshuffle()

        assert not sparse_input, 'Theanets does not support sparse recurrent models!'

        self.src = TT.ftensor3('src')
        #self.src_mask = TT.imatrix('src_mask')
        self.src_mask = TT.matrix('src_mask')
        self.dst = TT.ftensor3('dst')
        self.labels = TT.imatrix('labels')
        self.weights = TT.matrix('weights')

        if self.weighted:
            return [self.src, self.src_mask, self.dst, self.labels, self.weights]
        return [self.src, self.dst]
Example #7
def test_batched_dot():
    a = T.ftensor3('a')
    b = T.ftensor3('b')

    c = my_batched_dot(a, b)

    # Test with values
    dim1, dim2, dim3, dim4 = 10, 12, 15, 20

    A_shape = (dim1, dim2, dim3)
    B_shape = (dim1, dim3, dim4)
    C_shape = (dim1, dim2, dim4)

    A = np.arange(np.prod(A_shape)).reshape(A_shape).astype(floatX)
    B = np.arange(np.prod(B_shape)).reshape(B_shape).astype(floatX)

    C = c.eval({a: A, b: B})

    # check shape
    assert C.shape == C_shape

    # check content
    C_ = np.zeros((dim1, dim2, dim4))
    for i in range(dim1):
        C_[i] = np.dot(A[i], B[i])
    assert np.allclose(C, C_)
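my_batched_dot comes from the surrounding module and is not shown here; a minimal sketch of a compatible definition (an assumption, not the original implementation), built on Theano's batched_dot, which multiplies corresponding matrices along the leading batch axis:

import theano.tensor as T

def my_batched_dot(a, b):
    # a: (batch, n, k), b: (batch, k, m) -> result: (batch, n, m)
    return T.batched_dot(a, b)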
Example #8
 def test_infer_shape(self):
     # only matrix / matrix is supported
     admat = tensor.ftensor3()
     bdmat = tensor.ftensor3()
     admat_val = my_rand(7, 4, 5)
     bdmat_val = my_rand(7, 5, 3)
     self._compile_and_check([admat, bdmat], [GpuBatchedDot()(admat, bdmat)], [admat_val, bdmat_val], GpuBatchedDot)
Example #9
def testSNLIExample():
    """
    Test an example actually taken from SNLI dataset on LSTM pipeline.
    """
    start = time.time()
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    dataStats= "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
               "test_dataStats.json"
    dataJSONFile= "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                  "snli_1.0_test.jsonl"
    premiseTensor, hypothesisTensor = table.convertDataToEmbeddingTensors(
                                                dataJSONFile, dataStats)

    symPremise = T.ftensor3("inputPremise")
    symHypothesis = T.ftensor3("inputHypothesis")

    premiseSent = premiseTensor[:, 0:3, :]
    hypothesisSent = hypothesisTensor[:, 0:3, :]

    network = LSTMP2H(numTimestepsPremise=57, numTimestepsHypothesis=30,
                      dimInput=10, embedData="/Users/mihaileric/Documents/Research/"
                                             "LSTM-NLI/data/glove.6B.50d.txt.gz")
    network.printLSTMP2HParams()

    predictFunc = network.predictFunc(symPremise, symHypothesis)
    labels = network.predict(premiseSent, hypothesisSent, predictFunc)

    for l in labels:
        print "Label: %s" %(l)

    print "Time for evaluation: %f" %(time.time() - start)
Example #10
    def theano_vars(self):

        if self.cond:
            return [T.ftensor3('x'), T.fmatrix('mask'),
                    T.ftensor3('y'), T.fmatrix('label_mask')]
        else:
            return [T.ftensor3('x'), T.fmatrix('mask')]
Example #11
            def cmp(a_shp, b_shp):

                a = numpy.random.randn(* a_shp).astype(numpy.float32)
                b = numpy.random.randn(* b_shp).astype(numpy.float32)

                x = tensor.ftensor3()
                y = tensor.ftensor3()

                f = theano.function([x, y],
                                    batched_dot(x, y),
                                    mode=mode_with_gpu)

                z0 = numpy.asarray(f(a, b))

                ga = cuda_ndarray.CudaNdarray(a)
                gb = cuda_ndarray.CudaNdarray(b)

                z1 = numpy.asarray(f(ga, gb))

                z_test = numpy.sum(
                    a[:, :, :, None] * b[:, None, :, :], axis=-2)

                unittest_tools.assert_allclose(z0, z_test)
                unittest_tools.assert_allclose(z1, z_test)
Example #12
def test_multiple_inputs():
  X = T.ftensor3('X')
  X2 = T.ftensor3('X2')
  W = T.fmatrix('W')
  V_h = T.fmatrix('V_h')
  b = T.fvector('b')
  c = T.fmatrix('c') #initial state
  i = T.matrix('i',dtype='int8')
  X_val_mat0 = 0.1 * numpy.array([[1,2,3], [4,5,6]], dtype='float32')
  X_val_mat1 = 0.1 * numpy.array([[5,1,8], [7,0,1]], dtype='float32')
  X_val_mat2 = 0.1 * numpy.array([[2,1,1], [-7,0,-1]], dtype='float32')
  X_val = numpy.zeros((3,2,3), dtype='float32')
  X_val[0, :, :] = X_val_mat0
  X_val[1, :, :] = X_val_mat1
  X_val[2, :, :] = X_val_mat2
  X_val2 = numpy.zeros_like(X_val)
  #should be divisible by 4 for lstm; note the .T
  W_val = 0.1 * numpy.array([[3,1,2], [4,8,0], [7,7,1], [4,2,-5],
                             [6,-1,-2], [-4,8,0], [-7,2,1], [4,-2,-5],
                             [6,5,-2], [-4,8,-6], [-7,3,-1], [4,2,-5]], dtype='float32').T
  #(for lstm) size 1/4th
  V_h_val = 0.1 * numpy.array([[1,3,5], [2,-1,-1], [4, 8,-5], [0,-2,3],
                               [7,7,7], [1,2,3], [5,2,1], [-4,8,-4],
                               [-3,7,-7], [2,-2,-3], [-5,2,1], [-4,-5,-4]],
                              dtype='float32').T
  b_val = 0.1 * numpy.array([1,2,3,4,5,6,7,8,9,10,11,12], dtype='float32')
  c_val = numpy.zeros((2,3), dtype='float32')
  i_val = numpy.ones((3,2),dtype='int8')

  Z1, H1, d1 = LSTMOp2Instance(V_h, c, b, i, X, W)
  Z2, H2, d2 = LSTMOp2Instance(V_h, c, b, i, X, X2, W, W)
  Z3, H3, d3 = LSTMOp2Instance(V_h, c, b, i) # no inputs!
  DX1 = T.grad(Z1.sum(), X)
  DW1 = T.grad(Z1.sum(), W)
  DV_h1 = T.grad(Z1.sum(), V_h)
  Db1 = T.grad(Z1.sum(), b)
  Dc1 = T.grad(Z1.sum(), c)

  DX2 = T.grad(Z2.sum(), X)
  DW2 = T.grad(Z2.sum(), W)
  DV_h2 = T.grad(Z2.sum(), V_h)
  Db2 = T.grad(Z2.sum(), b)
  Dc2 = T.grad(Z2.sum(), c)

  DV_h3 = T.grad(Z3.sum(), V_h)

  f = theano.function(inputs=[X, W, V_h, c, b, i], outputs=[Z1, DX1, DW1])
  g = theano.function(inputs=[X, X2, W, V_h, c, b, i], outputs=[Z2, DX2, DW2])
  h = theano.function(inputs=[V_h, c, b, i], outputs=[Z3, DV_h3])
  h_res = [numpy.asarray(A, dtype='float32') for A in h(V_h_val, c_val, b_val, i_val)]
  #print h_res[0], h_res[1]
  f_res = [numpy.asarray(A, dtype='float32') for A in f(X_val, W_val, V_h_val, c_val, b_val, i_val)]
  g_res = [numpy.asarray(A, dtype='float32') for A in g(X_val, X_val2, W_val, V_h_val, c_val, b_val, i_val)]
  for A1, A2 in zip(f_res, g_res):
    assert numpy.allclose(A1, A2)
  #print f_res[0], g_res[0]

  print "success"
Example #13
    def test_outer_infershape(self):
        o = tensor.ftensor4()
        x = tensor.ftensor3()
        y = tensor.ftensor3()
        xIdx = tensor.imatrix()
        yIdx = tensor.imatrix()

        self._compile_and_check(
            [o, x, y, xIdx, yIdx], [self.outer_op(o, x, y, xIdx, yIdx)], self.outer_data(), self.outer_class
        )
Example #14
def test_attention_time_gauss():
  n_T = 4
  n_batch = 2
  n_inp_dim = 3
  n_cells = 5
  n_B = 5

  custom_op = get_attention(RecurrentTransform.AttentionTimeGauss,
                            n_out=n_cells, n_batches=n_batch, n_input_t=n_B, n_input_dim=n_inp_dim)
  att = custom_op.recurrent_transform

  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_quadr_val = numpy.eye(n_B).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  B_val = numpy.random.ranf((n_B,n_batch,n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  i_val = numpy.ones((n_T, n_batch), dtype='int8')

  Z = T.ftensor3('Z')
  B = T.ftensor3('B') #base
  W_re = T.fmatrix('W_re')
  W_att_quadr = T.fmatrix("W_att_quadr")
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')
  t0 = T.fvector('t0')
  custom_vars = att.get_sorted_custom_vars()
  initial_state_vars = att.get_sorted_state_vars_initial()
  custom_op_inputs = [Z, c, y0, i, W_re] + custom_vars + initial_state_vars
  print("input args num:", len(custom_op_inputs))
  print("input args:", custom_op_inputs)
  custom_op_outputs = custom_op(*custom_op_inputs)
  print("output args num:", len(custom_op_outputs))
  custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
  f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs)

  res = f(Z_val, c_val, y0_val, i_val, W_re_val)

  #print res
  # res: (output) Y, (gates and cell state) H, (final cell state) d, state vars sequences
  (Y, H, d), state_var_seqs = res[:3], res[3:]

  # print "running custom dumped data"
  # custom_op_inputs = [theano.shared(numpy.load("../op.i.%i" % i)) for i in range(12)]
  # custom_op_outputs = custom_op(*custom_op_inputs)
  # custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
  # f = theano.function(inputs=[], outputs=custom_op_outputs)
  # res = f()

  print(res)

  assert False
Example #15
        def fail(a_shp, b_shp):

            a=numpy.random.randn(*a_shp).astype(numpy.float32)
            b=numpy.random.randn(*b_shp).astype(numpy.float32)

            x=tensor.ftensor3()
            y=tensor.ftensor3()

            f=theano.function([x,y], batched_dot(x,y), mode=mode_with_gpu)

            z = f(a,b)
Example #16
def test_tensor3_roc_auc_scores():
    true = np.random.binomial(n=1, p=.5, size=(20, 30, 40)).astype('float32')
    predicted = np.random.random((20, 30, 40)).astype('float32')
    yt, yp = T.ftensor3('yt'), T.ftensor3('yp')
    refscore = tmetrics.classification.last_axis_roc_auc_scores(true, predicted)
    roc_auc_scores = tmetrics.classification.roc_auc_scores(yt, yp)
    f = theano.function([yt, yp], roc_auc_scores)
    score = f(true, predicted)
    print 'refscore'
    print refscore
    print 'score'
    print score
    assert np.allclose(refscore, score, equal_nan=True)
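tmetrics is an external helper library; as a rough cross-check of what last_axis_roc_auc_scores is assumed to compute (one ROC AUC per slice along the last axis, NaN where a slice contains only one class), an equivalent reference could be written with scikit-learn:

import numpy as np
from sklearn.metrics import roc_auc_score

def last_axis_roc_auc_reference(true, predicted):
    scores = np.full(true.shape[:-1], np.nan, dtype='float32')
    for idx in np.ndindex(*true.shape[:-1]):
        t, p = true[idx], predicted[idx]
        if 0 < t.sum() < len(t):  # AUC is undefined when only one class is present
            scores[idx] = roc_auc_score(t, p)
    return scores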
Example #17
def experiment(train_data, train_labels, test_data, test_labels):
    x = T.ftensor3('input_data')
    no_of_patches = 64

    cs_args = {
        "train_args":{
         "learning_rate": 0.08,
         "nepochs": 200,
         "cost_type": "crossentropy",
         "save_exp_data": False,
         "batch_size": 100,
         "randomize_mb": True,
         "enable_dropout": False
        },
        "test_args":{
         "save_exp_data":False,
         "batch_size": 2000
        }
    }

    post_mlp = StructuredMLP(x,
        in_layer_shape=(no_of_patches, 81, 200, 128),
        layer2_in=1024,
        activation=NeuralActivations.Rectifier,
        n_out=1,
        quiet=True,
        #momentum=0.9,
        save_file="./pkls/structured_mlp_1000_11outs_1hot.pkl",
        use_adagrad=False)

    post_mlp.set_test_data(test_data, test_labels, patch_mode=False)
    print "=============((((()))))==============="
    print "Training on the dataset."
    post_mlp.train(train_data, train_labels, **cs_args["train_args"])
Example #18
    def create_iterator_functions(self):
        # Define input and target variables
        input_var = T.ftensor3('inputs')
        target_var = T.ivector('targets')
        hop_length = (par.STEP_SIZE / 1000.0) * par.SR
        self.net = build_model_small((None, par.N_COMPONENTS, int(par.MAX_LENGTH/hop_length)), input_var)

        #with open('models/499.pkl', 'rb') as f:
            #param_values = pickle.load(f)
            #lasagne.layers.set_all_param_values(self.net['prob'], param_values)

        # Define prediction and loss calculation
        prediction = lasagne.layers.get_output(self.net['prob'], inputs=input_var)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()

        # Define updates
        params = lasagne.layers.get_all_params(self.net['prob'], trainable=True)
        updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=self.update_learning_rate, momentum=0.9)

        # Define test time prediction
        test_prediction = lasagne.layers.get_output(self.net['prob'], inputs=input_var, deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
        test_loss = test_loss.mean()

        # Compile functions
        self.train_fn = theano.function([input_var, target_var], loss, updates=updates)
        self.val_fn = theano.function([input_var, target_var], [test_loss, test_prediction])
Example #19
 def get_input_data(self, name, mcp, input_dims):
     """
     """
     input_names = mcp.safe_get_list(name, 'inputs')
     l = []
     for e,dim in zip(input_names, input_dims):
         if e in self.symbolic_var_dic:
             print('    use symbolic variable {}'.format(e))
             l += [self.symbolic_var_dic[e]]
         else:
             try:
                 if isinstance(dim, int):
                     sym, descr = tensor.fmatrix(e), 'fmatrix'
                 elif isinstance(dim, tuple):
                     t = len(dim)
                     if t != 2 and t != 3:
                         raise Exception('Unsupported dimension {}'.format(t))
                     if t == 2:
                         sym, descr = tensor.ftensor3(e), 'tensor3'
                     else:
                         sym, descr = tensor.ftensor4(e), 'tensor4'
                 print('    create symbolic variable {} as {}'.format(e, descr))
                 self.symbolic_var_dic[e] = sym 
                 l += [sym]
             except Exception as err:
                 print err
                 sys.exit(1)
     return input_names, l
Example #20
def experiment(train_data, train_labels, test_data, test_labels):
    x = T.ftensor3('input_data')
    no_of_patches = 64
    train_patches = get_dataset_patches(train_data)

    cs_args = {
        "train_args":{
         "learning_rate": 0.0015,
         "nepochs": 60,
         "cost_type": "crossentropy",
         "save_exp_data": False,
         "batch_size": 100,
         "enable_dropout": False
        },
        "test_args":{
         "save_exp_data":False,
         "batch_size": 2000
        }
    }

    post_mlp = StructuredMLP(x,
        in_layer_shape=(no_of_patches, no_of_patches, 200, 256),
        layer2_in=2048,
        activation=NeuralActivations.Rectifier,
        layer1_nout=100,
        n_out=1,
        quiet=True,
        save_file="structured_mlp_100k_100lbls_g.pkl",
        use_adagrad=False)

    post_mlp.set_test_data(test_data, test_labels)
    print "=============((((()))))==============="
    print "Training on the dataset."
    post_mlp.train(train_patches, train_labels, **cs_args["train_args"])
Example #21
def test_does_not_crash():
  Z = T.ftensor3('Z')
  W_re = T.fmatrix('W_re')
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')
  Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)

  f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)

  n_T = 5
  n_batch = 4
  n_inp_dim = 3
  n_cells = 8
  numpy.random.seed(1234)
  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  #i_val = numpy.ones((n_T, n_batch), dtype='int8')
  i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T

  Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
  #print Y_val
  print "success"
Example #22
    def build_loss_graph(self, saved_graph=None):
        print("Building loss graph...")

        for l in self.layers:
            l.set_training(False)

        Sentence = T.fmatrix('Sentence')
        Characters = T.ftensor3('Characters')
        WordLengths = T.ivector('WordLengths')
        GoldPredictions = T.fmatrix('GoldPredictions')
        
        weight_list = self.get_theano_weight_list()

        if self.feature_mode == 'character':
            result = self.theano_sentence_loss(Characters, WordLengths, GoldPredictions)
            input_list = [Characters, WordLengths, GoldPredictions] + list(weight_list)
        elif self.feature_mode == 'sentence':
            result = self.theano_sentence_loss(Sentence, GoldPredictions)
            input_list = [Sentence, GoldPredictions] + list(weight_list)
        elif self.feature_mode == 'both':
            result = self.theano_sentence_loss(Sentence, Characters, WordLengths, GoldPredictions)
            input_list = [Sentence, Characters, WordLengths, GoldPredictions] + list(weight_list)

        cgraph = theano.function(inputs=input_list, outputs=result, mode='FAST_RUN', allow_input_downcast=True)

        print("Done building graph.")
        
        return cgraph
Example #23
    def test_sparseblockgemvF(self):
        """
            Test the Fortran order for W (which can happen in the grad for some
            graphs).
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0),
                         tensor.DimShuffle((False, False, False, False),
                                           (0, 1, 3, 2))
                         (tensor.as_tensor_variable(W)),
                         h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
                   oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)
Example #24
  def make_node(self, acts, input_lengths, flat_labels, label_lengths):
    acts_ = T.as_tensor_variable(acts)
    input_lengths_ = T.as_tensor_variable(input_lengths)
    flat_labels_ = T.as_tensor_variable(flat_labels)
    label_lengths_ = T.as_tensor_variable(label_lengths)

    if acts_.dtype != "float32":
      raise Exception("acts must be float32 instead of %s" % acts.dtype)
    if input_lengths.dtype != "int32":
      raise Exception("input_lengths must be int32 instead of %s" % input_lengths.dtype)
    if flat_labels.dtype != "int32":
      raise Exception("flat_labels must be int32 instead of %s" % flat_labels.dtype)
    if label_lengths.dtype != "int32":
      raise Exception("label_lengths must be int32 instead of %s" % label_lengths.dtype)

    # Normally a singleton Op instance is created, and different Apply nodes are
    # created for different inputs.
    # Here, we create an Op instance specifically for this application,
    # and store the gradient variable in it so that it can be used by grad().
    op = CpuCtc()
    op.costs = T.fvector(name="ctc_cost")
    op.gradients = T.ftensor3(name="ctc_grad")

    # Don't compute gradient unless needed
    op.computeGradient = theano.shared(np.asarray([1], dtype=np.int32))

    applyNode = theano.Apply(op, 
                             inputs=[acts_, input_lengths_, flat_labels_, label_lengths_, op.computeGradient], 
                             outputs=[op.costs, op.gradients])

    # Return only the cost. Gradient will be returned by grad()
    self.default_output = 0   
    return applyNode
Example #25
    def test_transfer(self):
        tensor1 = self.rng.rand(20, 10, 5, 8).astype("float32")
        tensor2 = self.rng.rand(5, 8, 20).astype("float32")
        tensor3 = self.rng.rand(8, 20, 5).astype("float32")

        x = tensor.ftensor4("x")
        y = tensor.ftensor3("y")

        tdot1 = tensor.tensordot(x, y, 2)
        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
        topo1 = f1.maker.fgraph.toposort()
        assert topo1[-1].op == cuda.host_from_gpu
        # Let DebugMode debug
        f1(tensor1, tensor2)

        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
        topo2 = f2.maker.fgraph.toposort()
        assert topo2[-1].op == cuda.host_from_gpu
        f2(tensor1, tensor3)

        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
        topo3 = f3.maker.fgraph.toposort()
        assert topo3[-1].op == cuda.host_from_gpu
        f3(tensor1, tensor3)
Example #26
    def build_theano_gru(self, innerdim, indim, batsize, gru):
        u = theano.shared(gru.u.d.get_value())
        w = theano.shared(gru.w.d.get_value())
        um = theano.shared(gru.um.d.get_value())
        wm = theano.shared(gru.wm.d.get_value())
        uhf = theano.shared(gru.uhf.d.get_value())
        whf = theano.shared(gru.whf.d.get_value())
        b = theano.shared(gru.b.d.get_value())
        bm = theano.shared(gru.bm.d.get_value())
        bhf = theano.shared(gru.bhf.d.get_value())

        def rec(x_t, h_tm1):
            mgate =  T.nnet.sigmoid(T.dot(h_tm1, um)  + T.dot(x_t, wm)  + bm)
            hfgate = T.nnet.sigmoid(T.dot(h_tm1, uhf) + T.dot(x_t, whf) + bhf)
            canh = T.tanh(T.dot(h_tm1 * hfgate, u) + T.dot(x_t, w) + b)
            h = mgate * h_tm1 + (1-mgate) * canh
            return [h, h]

        def apply(x):
            inputs = x.dimshuffle(1, 0, 2) # inputs is (seq_len, batsize, dim)
            init_h = T.zeros((batsize, innerdim))
            outputs, _ = theano.scan(fn=rec,
                                sequences=inputs,
                                outputs_info=[None, init_h])
            output = outputs[0]
            return output[-1, :, :] #.dimshuffle(1, 0, 2) # return is (batsize, seqlen, dim)

        inp = T.ftensor3()
        return inp, apply(inp)
Example #27
    def test_read(self):
        batch_size = 100
        height, width = self.height, self.width
        N = self.N
        zaw = self.zaw

        # Create theano function
        images = T.ftensor3('images')
        center_y, center_x = T.fvectors('center_y', 'center_x')
        delta, sigma = T.fvectors('delta', 'sigma')

        readout = zaw.read(images, center_y, center_x, delta, sigma)

        do_read = theano.function(
            [images, center_y, center_x, delta, sigma],
            readout,
            name="do_read",
            allow_input_downcast=True)

        # Test theano function
        images = np.random.uniform(size=(batch_size, height, width))
        center_y = np.linspace(-height, 2*height, batch_size)
        center_x = np.linspace(-width, 2*width, batch_size)
        delta = np.linspace(0.1, height, batch_size)
        sigma = np.linspace(0.1, height, batch_size)

        readout = do_read(images, center_y, center_x, delta, sigma)

        assert readout.shape == (batch_size, N**2)
        assert np.isfinite(readout).all()
        assert (readout >= 0.).all()
        assert (readout <= 1.).all()
Example #28
def test_fwd_pass_compatible_with_OpLSTM():
  Z = T.ftensor3('Z')
  W_re = T.fmatrix('W_re')
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')

  Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)
  W_re_modified = W_re + W_att_in
  Z_modified = T.inc_subtensor(Z[0], T.dot(y0,W_re_modified))
  Y2, H2, d2 = LSTMOpInstance(Z_modified, W_re_modified, c, i)

  f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)
  g = theano.function(inputs=[Z, W_re, c, y0, i, W_att_in], outputs=Y2)

  n_T = 5
  n_batch = 4
  n_inp_dim = 3
  n_cells = 8
  numpy.random.seed(1234)
  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  #i_val = numpy.ones((n_T, n_batch), dtype='int8')
  i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T

  Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
  Y2_val = numpy.asarray(g(Z_val, W_re_val, c_val, y0_val, i_val, W_att_in_val))
  assert numpy.allclose(Y_val, Y2_val)
  print("success")
Example #29
    def make_node(self, activations, labels, input_lengths):
        t_activations = T.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)

        t_labels = T.as_tensor_variable(labels)
        t_input_lengths = T.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != 'float32':
            raise TypeError('activations must use the float32 type!')

        if t_activations.ndim != 3:
            raise ValueError('activations must have 3 dimensions.')

        if t_labels.type.dtype != 'int32':
            raise TypeError('labels must use the int32 type!')

        if t_labels.ndim != 2:
            raise ValueError('labels must have 2 dimensions.')

        if t_input_lengths.type.dtype != 'int32':
            raise TypeError('input_lengths must use the int32 type!')

        if t_input_lengths.ndim != 1:
            raise ValueError('input_lengths must have 1 dimension.')

        costs = T.fvector(name="ctc_cost")
        outputs = [costs]
        if self.compute_grad:
            gradients = T.ftensor3(name="ctc_grad")
            outputs += [gradients]

        return gof.Apply(self, inputs=[t_activations, t_labels, t_input_lengths],
                         outputs=outputs)
Example #30
def test_blocksparse_grad_merge():
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    W = float32_shared_constructor(W_val)

    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = theano.grad(o.sum(), W)

    lr = numpy.asarray(0.05, dtype='float32')

    upd = W - lr * gW

    f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
                         mode=mode_with_gpu)
    # not running with mode=gpu ensures that the elemwise is not merged in
    mode = None
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.mode.get_mode('FAST_RUN')

    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

    f2(h_val, iIdx_val, b_val, oIdx_val)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    f1(h_val, iIdx_val, b_val, oIdx_val)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
Example #31
    def __init__(self,
                 Nbranches = 1,             # number of branches (parallel models to be fused)
                 Nlayers = 1,               # number of layers
                 Ndirs = 1,                 # unidirectional or bidirectional
                 Nx = 100,                  # input size
                 Nh = 100,                  # hidden layer size
                 Ny = 100,                  # output size
                 Ah = "relu",               # hidden unit activation (e.g. relu, tanh, lstm)
                 Ay = "linear",             # output unit activation (e.g. linear, sigmoid, softmax)
                 predictPer = "frame",      # frame or sequence
                 loss = None,               # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
                 L1reg = 0.0,               # L1 regularization
                 L2reg = 0.0,               # L2 regularization
                 multiReg = 0.0,            # regularization of agreement of predictions on data of different conditions
                 momentum = 0.0,            # SGD momentum
                 seed = 15213,              # random seed for initializing the weights
                 frontEnd = None,           # a lambda function for transforming the input
                 filename = None,           # initialize from file
                 initParams = None,         # initialize from given dict
                ):

        if filename is not None:            # load parameters from file
            with smart_open(filename, "rb") as f:
                initParams = dill.load(f)
        if initParams is not None:          # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.iteritems():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
            # Note: locals()[k] = v doesn't work here, so these have to be assigned statically
            Nbranches, Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \
                = self.Nbranches, self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd
        else:                           # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = ["Nbranches", "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"]
            for name in self.paramNames:
                value = locals()[name]
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam("Win", rand_init((Nbranches, Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wrec", rand_init((Nbranches, Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam("Wup", rand_init((Nbranches, Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wout", rand_init((Nbranches, Nh * Ndirs, Ny), Ay))
            if Ah != "lstm":
                self.addParam("Bhid", zeros((Nbranches, Nlayers, Nh * Ndirs)))
            else:
                self.addParam("Bhid", numpy.tile(numpy.concatenate([full((Nbranches, Nlayers, Nh), 1.0),
                                                                    zeros((Nbranches, Nlayers, Nh * 3))], 2), (1, 1, Ndirs)))
            self.addParam("Bout", zeros((Nbranches, Ny)))
            self.addParam("h0", zeros((Nbranches, Nlayers, Ndirs, Nh)))
            if Ah == "lstm":
                self.addParam("c0", zeros((Nbranches, Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params]

        # Build computation graph
        input = T.ftensor3()
        mask = T.imatrix()
        mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
        mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
                      T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]
        # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
            f_t = T.nnet.sigmoid(a[:, :Nh])
            i_t = T.nnet.sigmoid(a[:, Nh : Nh * 2])
            o_t = T.nnet.sigmoid(a[:, Nh * 2 : Nh * 3])
            c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)
        outputs = []
        for k in range(Nbranches):
            for i in range(Nlayers):
                h = (x.dimshuffle((1, 0, 2)).dot(self.Win[k]) if i == 0 else h.dot(self.Wup[k, i-1])) + self.Bhid[k, i]
                rep = lambda x: T.extra_ops.repeat(x.reshape((1, -1)), h.shape[1], axis = 0)
                if Ah != "lstm":
                    h = T.concatenate([theano.scan(
                            fn = step_rnn,
                            sequences = [h[:, :, Nh * d : Nh * (d+1)], mask_float[d]],
                            outputs_info = [rep(self.h0[k, i, d])],
                            non_sequences = [self.Wrec[k, i, d], rep(self.h0[k, i, d])],
                            go_backwards = (d == 1),
                        )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2)
                else:
                    h = T.concatenate([theano.scan(
                            fn = step_lstm,
                            sequences = [h[:, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]],
                            outputs_info = [rep(self.c0[k, i, d]), rep(self.h0[k, i, d])],
                            non_sequences = [self.Wrec[k, i, d], rep(self.c0[k, i, d]), rep(self.h0[k, i, d])],
                            go_backwards = (d == 1),
                        )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2)
            h = h.dimshuffle((1, 0, 2))
            if predictPer == "sequence":
                h = T.concatenate([h[mask_int[1 - d]][:, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 1)
            outputs.append(ACTIVATION[Ay](h.dot(self.Wout[k]) + self.Bout[k]))
        output = T.stack(*outputs)      # Deprecated in Theano 0.8 but accepted in Theano 0.7
        output_mean = output.mean(axis = 0)
        output_var = output.var(axis = 0)

        # Compute loss function
        if loss is None:
            loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay]
        if loss == "ctc":
            label = T.imatrix()
            label_time = T.imatrix()
            tol = T.iscalar()
            cost = ctc_cost(output_mean, mask, label, label_time, tol)
        else:
            if predictPer == "sequence":
                label = T.fmatrix()
                y = output_mean
                t = label
            elif predictPer == "frame":
                label = T.ftensor3()
                indices = (mask >= 0).nonzero()
                y = output_mean[indices]
                t = label[indices]
            cost = T.mean({
                "ce":               -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1),
                "ce_group":         -T.log((y * t).sum(axis = 1)),
                "mse":              T.mean((y - t) ** 2, axis = 1),
                "hinge":            T.mean(relu(1 - y * (t * 2 - 1)), axis = 1),
                "squared_hinge":    T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1),
            }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg
        if predictPer == "sequence":
            cost += output_var.mean() * multiReg
        else:
            indices = (mask >= 0).nonzero()
            cost += output_var[indices].mean() * multiReg

        # Compute updates for network parameters
        updates = []
        lrate = T.fscalar()
        clip = T.fscalar()
        grad = T.grad(cost, self.params)
        grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
        if momentum > 0:
            for w, d, g in zip(self.params, self.dparams, grad_clipped):
                updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
        else:
            for w, g in zip(self.params, grad_clipped):
                updates.append((w, w - lrate * g))

        # Create functions to be called from outside
        if loss == "ctc":
            inputs = [input, mask, label, label_time, tol, lrate, clip]
        else:
            inputs = [input, mask, label, lrate, clip]
        self.train = theano.function(
                         inputs = inputs,
                         outputs = cost,
                         updates = updates,
                     )

        self.predict = theano.function(inputs = [input, mask], outputs = output)
Example #32
def UnitTest_OnestepAttend():
	N = 2 #number of sample
	D = 5 #dimension of input
	H = 4 #dimension of hidden
	T_new = 1 #length of per each sample
	context_dim = 3
	K = 5

	x = np.linspace(-0.4, 0.6, num=N*T_new*D, dtype = theano.config.floatX).reshape(T_new, N, D)
	h0= np.linspace(-0.4, 0.8, num=N*H, dtype = theano.config.floatX).reshape(N, H)
	Wx= np.linspace(-0.2, 0.9, num=4*D*H, dtype = theano.config.floatX).reshape(D, 4*H)
	Wh= np.linspace(-0.3,0.6, num =4*H*H, dtype = theano.config.floatX).reshape(H,4*H)
	b = np.linspace(0.0, 0.0, num = 4*H, dtype = theano.config.floatX)
	Wz= np.linspace(-0.3, 0.6, num=4*H*context_dim, dtype = theano.config.floatX).reshape(context_dim, 4*H)
	Hcontext = np.linspace(-0.2, 0.6, num=H*K, dtype = theano.config.floatX).reshape(H, K)
	Zcontext = np.linspace(-0.2, 0.5, num=context_dim*K, dtype= theano.config.floatX).reshape(context_dim, K)
	Va= np.linspace(0.1, 0.4, num=K, dtype = theano.config.floatX)
	Va_reshape = Va.reshape(K,1)

	image_feature_3D = np.linspace(-0.2, 0.5, num=10*N*context_dim, dtype = theano.config.floatX).reshape(N,10, context_dim)

	h0_theano = h0.reshape(1, N, H)
	# h0_symb   = theano.tensor.ftensor3("h_symb")
	# lstm_theano_layer.h_m1.set_value(h0_theano)

	c0_theano = np.zeros((1, N, H), dtype = theano.config.floatX)
	# c0_symb   = theano.tensor.ftensor3("c_symb")
	# lstm_theano_layer.c_m1.set_value(c0_theano)

	z0_theano = np.zeros((1, N, context_dim), dtype = theano.config.floatX)

	x_theano = x.reshape(T_new, N, D, 1)
	image_feature_input = image_feature_3D

	weight_y_in_value = np.zeros(( 10, context_dim) , dtype= theano.config.floatX)
	b_theano= b.reshape(1, 1, 4*H)
	pdb.set_trace()

	#symbolic variables
	initial_h0_layer_out = theano.tensor.tensor3(name = 'h0_initial', dtype = theano.config.floatX)
	initial_c0_layer_out = theano.tensor.tensor3(name = 'c0_initial', dtype = theano.config.floatX)
	initial_z0			 = T.tensor3(name= 'z0_initial', dtype = theano.config.floatX)
	weight_y_in = theano.tensor.fmatrix("weight_y")	
	input_data = theano.tensor.tensor3(name ='x', dtype=theano.config.floatX)
	image_feature_region = theano.tensor.tensor3(name = 'feature_region', dtype = theano.config.floatX)

	Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym = T.fmatrices(12)
	Zcontext_sym, Hcontext_sym = T.fmatrices(2)
	bi  = T.ftensor3("bi")
	bf  = T.ftensor3("bf")
	bc  = T.ftensor3("bc")
	bo  = T.ftensor3("bo")
	Va_sym = T.fcol("Va")


	out_sym = onestep_attend_tell(input_data, initial_h0_layer_out, initial_c0_layer_out, initial_z0, 
		Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym,
		Zcontext_sym, Hcontext_sym, Va_sym,
		bi, bf, bc, bo, image_feature_region, weight_y_in)

	onestep_func = theano.function([input_data, initial_h0_layer_out, initial_c0_layer_out, initial_z0, 
		Wi_sym, Wf_sym, Wc_sym, Wo_sym, Ui_sym, Uf_sym, Uc_sym, Uo_sym, Zi_sym, Zf_sym, Zc_sym, Zo_sym,
		Zcontext_sym, Hcontext_sym, Va_sym,
		bi, bf, bc, bo, image_feature_region, weight_y_in], out_sym)

	list_output = onestep_func(x, h0_theano, c0_theano, z0_theano,
		Wx[:, :H], Wx[:, H:2*H], Wx[:, 2*H:3*H], Wx[:, 3*H:],
		Wh[:, :H], Wh[:, H:2*H], Wh[:, 2*H:3*H], Wh[:, 3*H:],
		Wz[:, :H], Wz[:, H:2*H], Wz[:, 2*H:3*H], Wz[:, 3*H:],
		Zcontext,Hcontext,
		Va_reshape,
		b_theano[:,: , :H], b_theano[:, :, H:2*H], b_theano[:, :, 2*H:3*H], b_theano[:, :, 3*H:], 
		image_feature_input, weight_y_in_value)


	pdb.set_trace()

	print(list_output[0].shape)
	print(list_output[1].shape)
	print(list_output[2].shape)

	pdb.set_trace()
Example #33
    def __init__(self,
                 input_size=2,
                 inner_size=3,
                 output_size=None,
                 batch_size=10,
                 lr=0.01,
                 gamma=0.9):
        self.bsz = batch_size

        # Forget gate matrix
        self.W_f = init_sh_param(shape=(inner_size, inner_size), name='W_f')
        self.U_f = init_sh_param(shape=(inner_size, input_size), name='U_f')
        self.b_f = init_sh_param(shape=inner_size, name='b_f')

        # Insert gate matrix
        self.W_i = init_sh_param(shape=(inner_size, inner_size), name='W_i')
        self.U_i = init_sh_param(shape=(inner_size, input_size), name='U_i')
        self.b_i = init_sh_param(shape=inner_size, name='b_i')

        # Cell gate matrix
        self.W_c = init_sh_param(shape=(inner_size, inner_size), name='W_c')
        self.U_c = init_sh_param(shape=(inner_size, input_size), name='U_c')
        self.b_c = init_sh_param(shape=inner_size, name='b_c')

        # Output gate matrix
        self.W_o = init_sh_param(shape=(inner_size, inner_size), name='W_o')
        self.U_o = init_sh_param(shape=(inner_size, input_size), name='U_o')
        self.b_o = init_sh_param(shape=inner_size, name='b_o')

        # bundle
        self.params = [
            self.W_f, self.U_f, self.b_f, self.W_i, self.U_i, self.b_i,
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o
        ]

        self.names = [
            'W_f', 'U_f', 'b_f', 'W_i', 'U_i', 'b_i', 'W_c', 'U_c', 'b_c',
            'W_o', 'U_o', 'b_o'
        ]

        # Softmax layer
        if output_size != None:
            self.S = init_sh_param((output_size, inner_size), name='S_softmax')
            self.b_s = init_sh_param(output_size, name='b_s')
            self.params.append(self.S)
            self.params.append(self.b_s)
            self.names.append('S_softmax_data')
            self.names.append('b_s_data')

        # RMSProp data
        self.params_data = []
        for elem, name in zip(self.params, self.names):
            self.params_data.append(
                init_sh_zero(elem.get_value().shape, name=name + '_data'))

        def step(x_t, h_t_1, C_t_1):
            f_t = T.dot(self.W_f, h_t_1) + T.dot(self.U_f, x_t)
            f_t = sigm(f_t.T + self.b_f).T
            i_t = T.dot(self.W_i, h_t_1) + T.dot(self.U_i, x_t)
            i_t = sigm(i_t.T + self.b_i).T
            o_t = T.dot(self.W_o, h_t_1) + T.dot(self.U_o, x_t)
            o_t = sigm(o_t.T + self.b_o).T
            C_t_c = T.dot(self.W_c, h_t_1) + T.dot(self.U_c, x_t)
            C_t_c = tanh(C_t_c.T + self.b_c).T
            C_t = f_t * C_t_1 + i_t * C_t_c
            h_t = o_t * T.tanh(C_t)
            return h_t, C_t

        x = T.ftensor3(name='x_input')
        y = T.fmatrix(name='y_input')
        (h_t, _), _ = theano.scan(fn=step,
                                  sequences=x,
                                  outputs_info=[
                                      T.zeros(shape=(inner_size, batch_size),
                                              dtype='float32'),
                                      T.zeros(shape=(inner_size, batch_size),
                                              dtype='float32')
                                  ])
        h_last = h_t[-1]

        if output_size == None:
            E = T.sum((h_last - y)**2)
        else:
            j = T.nnet.softmax(T.dot(self.S, h_last).T + self.b_s).T
            E = T.sum((j - y)**2)

        gradients = T.grad(E, self.params)
        updates = []
        for param, grad, param_data in zip(self.params, gradients,
                                           self.params_data):
            r_t = (1 - gamma) * (grad**2) + gamma * param_data
            v_t_1 = lr * grad / T.sqrt(r_t)
            updates.append((param, param - v_t_1))
            updates.append((param_data, r_t))

        self.train = theano.function(inputs=[x, y],
                                     outputs=E,
                                     updates=OrderedDict(updates))

        t = T.zeros(shape=(inner_size, 1), dtype='float32')
        t = T.unbroadcast(t, 1)
        (h_t_2, _), _ = theano.scan(fn=step, sequences=x, outputs_info=[t, t])
        j_test = h_t_2[-1]
        if output_size != None:
            j_test = T.nnet.softmax(T.dot(self.S, j_test).T + self.b_s).T
        self.test = theano.function(inputs=[x], outputs=j_test)
Example #34
def build(word_embeddings, len_voc, word_emb_dim, args, freeze=False):

    # input theano vars
    posts = T.imatrix()
    post_masks = T.fmatrix()
    ques_list = T.itensor3()
    ques_masks_list = T.ftensor3()
    ans_list = T.itensor3()
    ans_masks_list = T.ftensor3()
    labels = T.imatrix()
    N = args.no_of_candidates

    post_out, post_lstm_params = build_lstm(posts, post_masks, args.post_max_len, \
              word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ques_out, ques_emb_out, ques_lstm_params = build_list_lstm(ques_list, ques_masks_list, N, args.ques_max_len, \
              word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ans_out, ans_emb_out, ans_lstm_params = build_list_lstm(ans_list, ans_masks_list, N, args.ans_max_len, \
              word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)

    pqa_preds = [None] * (N * N)
    post_ques_ans = T.concatenate([post_out, ques_out[0], ans_out[0]], axis=1)
    l_post_ques_ans_in = lasagne.layers.InputLayer(shape=(args.batch_size,
                                                          3 * args.hidden_dim),
                                                   input_var=post_ques_ans)
    l_post_ques_ans_denses = [None] * DEPTH
    for k in range(DEPTH):
        if k == 0:
            l_post_ques_ans_denses[k] = lasagne.layers.DenseLayer(l_post_ques_ans_in, num_units=args.hidden_dim,\
                          nonlinearity=lasagne.nonlinearities.rectify)
        else:
            l_post_ques_ans_denses[k] = lasagne.layers.DenseLayer(l_post_ques_ans_denses[k-1], num_units=args.hidden_dim,\
                          nonlinearity=lasagne.nonlinearities.rectify)
    l_post_ques_ans_dense = lasagne.layers.DenseLayer(l_post_ques_ans_denses[-1], num_units=1,\
                   nonlinearity=lasagne.nonlinearities.sigmoid)
    pqa_preds[0] = lasagne.layers.get_output(l_post_ques_ans_dense)
    loss = 0.0
    for i in range(N):
        for j in range(N):
            if i == 0 and j == 0:
                continue
            post_ques_ans = T.concatenate([post_out, ques_out[i], ans_out[j]],
                                          axis=1)
            l_post_ques_ans_in_ = lasagne.layers.InputLayer(
                shape=(args.batch_size, 3 * args.hidden_dim),
                input_var=post_ques_ans)
            for k in range(DEPTH):
                if k == 0:
                    l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_in_, num_units=args.hidden_dim,\
                                 nonlinearity=lasagne.nonlinearities.rectify,\
                                 W=l_post_ques_ans_denses[k].W,\
                                 b=l_post_ques_ans_denses[k].b)
                else:
                    l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=args.hidden_dim,\
                                 nonlinearity=lasagne.nonlinearities.rectify,\
                                 W=l_post_ques_ans_denses[k].W,\
                                 b=l_post_ques_ans_denses[k].b)
            l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=1,\
                          nonlinearity=lasagne.nonlinearities.sigmoid)
            pqa_preds[i * N +
                      j] = lasagne.layers.get_output(l_post_ques_ans_dense_)
        loss += T.mean(
            lasagne.objectives.binary_crossentropy(pqa_preds[i * N + i],
                                                   labels[:, i]))

    squared_errors = [None] * (N * N)
    for i in range(N):
        for j in range(N):
            squared_errors[i * N + j] = lasagne.objectives.squared_error(
                ans_out[i], ans_out[j])
    post_ques_ans_dense_params = lasagne.layers.get_all_params(
        l_post_ques_ans_dense, trainable=True)

    all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_ans_dense_params
    print 'Params in concat ', lasagne.layers.count_params(
        l_post_ques_ans_dense)
    loss += args.rho * sum(T.sum(l**2) for l in all_params)

    updates = lasagne.updates.adam(loss,
                                   all_params,
                                   learning_rate=args.learning_rate)

    train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
            [loss] + pqa_preds + squared_errors, updates=updates)
    test_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
            [loss] + pqa_preds + squared_errors,)
    return train_fn, test_fn
Example #35
    def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005, \
            learning_rate_rl=0.005, batch_size=32, ment=0.1):
        # 2-layer MLP
        self.in_size = in_size  # x and y coordinate
        self.out_size = out_size  # up, down, right, left
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \
                T.itensor3('am'), T.fvector('r')

        in_var = T.reshape(
            input_var, (input_var.shape[0] * input_var.shape[1], self.in_size))

        l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)

        pol_in = T.fmatrix('pol-h')
        l_in = L.InputLayer(shape=(None, None, self.in_size),
                            input_var=input_var)
        l_pol_rnn = L.GRULayer(l_in,
                               n_hid,
                               hid_init=pol_in,
                               mask_input=l_mask_in)  # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:, -1, :]
        l_den_in = L.ReshapeLayer(
            l_pol_rnn,
            (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
        l_out = L.DenseLayer(l_den_in,
                             self.out_size,
                             nonlinearity=lasagne.nonlinearities.softmax)

        self.network = l_out
        self.params = L.get_all_params(self.network)

        # rl
        probs = L.get_output(self.network)  # BH x A
        out_probs = T.reshape(probs, (input_var.shape[0], input_var.shape[1],
                                      self.out_size))  # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
        ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
        H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
        self.loss = 0. - T.mean(ep_probs * reward_var + ment * H_probs)

        updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=learning_rate_rl, \
                epsilon=1e-4)

        self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
        self.train_fn = theano.function(self.inps, self.loss, updates=updates)
        self.obj_fn = theano.function(self.inps, self.loss)
        self.act_fn = theano.function([input_var, turn_mask, pol_in],
                                      [out_probs, pol_out])

        # sl
        sl_loss = 0. - T.mean(ep_probs)
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, learning_rate=learning_rate_sl, \
                epsilon=1e-4)

        self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss, \
                updates=sl_updates)
        self.sl_obj_fn = theano.function(
            [input_var, turn_mask, act_mask, pol_in], sl_loss)
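# A NumPy sketch (illustrative only, shapes made up) of the policy-gradient loss built above:
# the log-probability of each chosen action is summed over valid turns, weighted by the episode
# reward, and an entropy bonus scaled by `ment` discourages overly peaked policies.
import numpy as np

B, H, A = 2, 3, 4                                 # batch, turns, actions
probs = np.full((B, H, A), 1.0 / A)               # B x H x A action distribution
act_mask = np.eye(A)[np.zeros((B, H), int)]       # B x H x A one-hot chosen actions
turn_mask = np.ones((B, H))                       # B x H valid-turn mask
reward = np.array([1.0, -1.0])                    # per-episode return
ment = 0.1

log_probs = np.log(probs)
act_logp = (log_probs * act_mask).sum(axis=2)            # B x H
ep_logp = (act_logp * turn_mask).sum(axis=1)             # B
entropy = -(probs * log_probs).sum(axis=2).sum(axis=1)   # B
loss = 0. - np.mean(ep_logp * reward + ment * entropy)
print(loss)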
Example #36
import theano
from theano import tensor as T
from theano import function
import numpy as np

a = np.array([[[1, 2, 3], [3, 4, 5]], [[7, 8, 9], [45, 345, 12]]])

x0 = T.ftensor3()


def create_atom_context(atom_vector):
    # type_vector = T.fvector()
    types_array = atom_vector[0]
    dists = atom_vector[1]
    w = [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5]]

    # outputs_info = T.as_tensor_variable(np.asarray(0, dtype=np.float32))
    # types, updates = theano.scan(fn=lambda atm_type: atm_type,
    #                              outputs_info=None,
    #                              sequences=type_vector)
    # mult = type_vector*2
    # f = function(inputs=[type_vector], outputs=mult)
    # print(f([1, 2, 3]))

    # f = function(inputs=[type_vector], outputs=types)
    # return T.concatenate([types_array], [dists])
    print(len(types_array))
    ls = []
    for tp in types_array:
        ls.append(tp)
    return ls
Example #37
def build_fn(args, embeddings):
    """
        Build training and testing functions.
    """
    if args.para_shared_model is not None:
        dic = utils.load_params(args.para_shared_model)
        params_shared = dic['params']
        params_name = [
            'W', 'o_layer1.W_in_to_updategate', 'o_layer1.W_hid_to_updategate',
            'o_layer1.b_updategate', 'o_layer1.W_in_to_resetgate',
            'o_layer1.W_hid_to_resetgate', 'o_layer1.b_resetgate',
            'o_layer1.W_in_to_hidden_update',
            'o_layer1.W_hid_to_hidden_update', 'o_layer1.b_hidden_update',
            'o_layer1.hid_init', 'o_back_layer1.W_in_to_updategate',
            'o_back_layer1.W_hid_to_updategate', 'o_back_layer1.b_updategate',
            'o_back_layer1.W_in_to_resetgate',
            'o_back_layer1.W_hid_to_resetgate', 'o_back_layer1.b_resetgate',
            'o_back_layer1.W_in_to_hidden_update',
            'o_back_layer1.W_hid_to_hidden_update',
            'o_back_layer1.b_hidden_update', 'o_back_layer1.hid_init',
            'd_layer1.W_in_to_updategate', 'd_layer1.W_hid_to_updategate',
            'd_layer1.b_updategate', 'd_layer1.W_in_to_resetgate',
            'd_layer1.W_hid_to_resetgate', 'd_layer1.b_resetgate',
            'd_layer1.W_in_to_hidden_update',
            'd_layer1.W_hid_to_hidden_update', 'd_layer1.b_hidden_update',
            'd_layer1.hid_init', 'd_back_layer1.W_in_to_updategate',
            'd_back_layer1.W_hid_to_updategate', 'd_back_layer1.b_updategate',
            'd_back_layer1.W_in_to_resetgate',
            'd_back_layer1.W_hid_to_resetgate', 'd_back_layer1.b_resetgate',
            'd_back_layer1.W_in_to_hidden_update',
            'd_back_layer1.W_hid_to_hidden_update',
            'd_back_layer1.b_hidden_update', 'd_back_layer1.hid_init',
            'q_layer1.W_in_to_updategate', 'q_layer1.W_hid_to_updategate',
            'q_layer1.b_updategate', 'q_layer1.W_in_to_resetgate',
            'q_layer1.W_hid_to_resetgate', 'q_layer1.b_resetgate',
            'q_layer1.W_in_to_hidden_update',
            'q_layer1.W_hid_to_hidden_update', 'q_layer1.b_hidden_update',
            'q_layer1.hid_init', 'q_back_layer1.W_in_to_updategate',
            'q_back_layer1.W_hid_to_updategate', 'q_back_layer1.b_updategate',
            'q_back_layer1.W_in_to_resetgate',
            'q_back_layer1.W_hid_to_resetgate', 'q_back_layer1.b_resetgate',
            'q_back_layer1.W_in_to_hidden_update',
            'q_back_layer1.W_hid_to_hidden_update',
            'q_back_layer1.b_hidden_update', 'q_back_layer1.hid_init',
            'W_bilinear', 'W_bilinear'
        ]
    in_x1 = T.imatrix('x1')
    in_x3 = T.imatrix('x3')
    in_mask1 = T.matrix('mask1')
    in_mask3 = T.matrix('mask3')
    in_y = T.ivector('y')

    #batch x word_num x mea_num
    in_x4 = T.ftensor3('x4')

    l_in1 = lasagne.layers.InputLayer((None, None), in_x1)
    l_mask1 = lasagne.layers.InputLayer((None, None), in_mask1)
    Embed_W = params_shared[params_name.index('W')]
    l_emb1 = lasagne.layers.EmbeddingLayer(l_in1,
                                           args.vocab_size,
                                           args.embedding_size,
                                           W=Embed_W)

    l_in3 = lasagne.layers.InputLayer((None, None), in_x3)
    l_mask3 = lasagne.layers.InputLayer((None, None), in_mask3)
    l_emb3 = lasagne.layers.EmbeddingLayer(l_in3,
                                           args.vocab_size,
                                           args.embedding_size,
                                           W=l_emb1.W)
    # x4 is the human attention
    l_in4 = lasagne.layers.InputLayer((None, None, args.mea_num), in_x4)

    if not args.tune_embedding:
        l_emb1.params[l_emb1.W].remove('trainable')
        l_emb3.params[l_emb3.W].remove('trainable')

    args.rnn_output_size = args.hidden_size * 2 if args.bidir else args.hidden_size
    assert args.model is None
    network1 = nn_layers.stack_rnn(l_emb1,
                                   l_mask1,
                                   args.num_layers,
                                   args.hidden_size,
                                   grad_clipping=args.grad_clipping,
                                   dropout_rate=args.dropout_rate,
                                   only_return_final=(args.att_func == 'last'),
                                   bidir=args.bidir,
                                   name='d',
                                   rnn_layer=args.rnn_layer)
    #weighted mean: passage embedding
    #    weight_mlp_np = np.array([[1.]])
    #    b_mlp = np.array([0.])
    #    l_weight = lasagne.layers.DenseLayer(l_in4, 1, num_leading_axes=-1,
    #                                         name='w_dense', W=weight_mlp_np, b=b_mlp)
    # pass a Linear layer and get human ATT  l_weight: batch x word_num x 1     activation -- sigmoid
    l_weight = lasagne.layers.DenseLayer(l_in4,
                                         1,
                                         num_leading_axes=-1,
                                         nonlinearity=nonlinearities.sigmoid,
                                         name='w_dense')

    att = nn_layers.WeightedAverageLayer([network1, l_weight, l_mask1],
                                         name='w_aver')
    if RAW:
        att = nn_layers.WeightedAverageLayer([network1, l_in4, l_mask1],
                                             name='w_aver')
    if SAG:
        # network1 1x1 conv
        # l_in4 1x1 conv

        pass

    #options
    network3 = nn_layers.stack_rnn(l_emb3,
                                   l_mask3,
                                   args.num_layers,
                                   args.hidden_size,
                                   grad_clipping=args.grad_clipping,
                                   dropout_rate=args.dropout_rate,
                                   only_return_final=True,
                                   bidir=args.bidir,
                                   name='o',
                                   rnn_layer=args.rnn_layer)
    network3 = lasagne.layers.ReshapeLayer(
        network3, (in_x1.shape[0], 4, args.rnn_output_size))
    #answer
    network = nn_layers.BilinearDotLayer([network3, att], args.rnn_output_size)
    # if not args.tune_embedding:
    #     network.params[network.W].remove('trainable')
    #parameter sharing
    params_initial = lasagne.layers.get_all_params(network)
    params_set = []
    for params_initial_tmp in params_initial:
        if str(params_initial_tmp) in ['w_dense.W', 'w_dense.b']:
            params_set = params_set + [params_initial_tmp.get_value()]
        elif str(params_initial_tmp) == 'W_bilinear':
            params_set = params_set + [params_shared[-1]]
        else:
            params_set = params_set + [
                params_shared[params_name.index(str(params_initial_tmp))]
            ]
    lasagne.layers.set_all_param_values(network, params_set)

    if args.pre_trained is not None:
        dic = utils.load_params(args.pre_trained)
        lasagne.layers.set_all_param_values(network, dic['params'])
        del dic['params']
        logging.info('Loaded pre-trained model: %s' % args.pre_trained)
        for dic_param in dic.iteritems():
            logging.info(dic_param)

    logging.info('#params: %d' %
                 lasagne.layers.count_params(network, trainable=True))
    logging.info('#fixed params: %d' %
                 lasagne.layers.count_params(network, trainable=False))
    for layer in lasagne.layers.get_all_layers(network):
        logging.info(layer)

    # Test functions
    test_prob = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.argmax(test_prob, axis=-1)
    acc = T.sum(T.eq(test_prediction, in_y))
    test_fn = theano.function([in_x1, in_mask1, in_x3, in_mask3, in_y, in_x4],
                              [acc, test_prediction],
                              on_unused_input='warn')

    # Train functions
    train_prediction = lasagne.layers.get_output(network)
    train_prediction = T.clip(train_prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.categorical_crossentropy(train_prediction,
                                                       in_y).mean()

    # TODO: lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2)
    #    params = lasagne.layers.get_all_params(network)#, trainable=True)
    params_init = lasagne.layers.get_all_params(network, trainable=True)
    params = lasagne.layers.get_all_params(network, trainable=True)
    if not (args.tune_sar):
        for params_tmp in params_init:
            if not (str(params_tmp) in ['w_dense.W', 'w_dense.b']):
                print(params_tmp)
                params.remove(params_tmp)
                print(len(params))
                print(params)
            else:
                print(params_tmp)


#                params.remove(params_tmp)
    all_params = lasagne.layers.get_all_params(network)
    if args.optimizer == 'sgd':
        updates = lasagne.updates.sgd(loss, params, args.learning_rate)
    elif args.optimizer == 'adam':
        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss,
                                          params,
                                          learning_rate=args.learning_rate)
    else:
        raise NotImplementedError('optimizer = %s' % args.optimizer)
    train_fn = theano.function([in_x1, in_mask1, in_x3, in_mask3, in_y, in_x4],
                               loss,
                               updates=updates,
                               on_unused_input='warn')

    return train_fn, test_fn, params, all_params
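# A rough NumPy sketch of a masked, weighted average over word positions, which is what the
# human-attention branch above feeds into nn_layers.WeightedAverageLayer; the exact behaviour
# of that layer is assumed here rather than taken from its source.
import numpy as np

B, T_len, D = 2, 5, 3
h = np.random.rand(B, T_len, D)       # token representations from the passage RNN
w = np.random.rand(B, T_len, 1)       # per-token weight (sigmoid output of the w_dense layer)
mask = np.ones((B, T_len, 1))         # 1 for real tokens, 0 for padding
mask[1, 3:] = 0.0

weights = w * mask
weights = weights / (weights.sum(axis=1, keepdims=True) + 1e-8)
passage_repr = (weights * h).sum(axis=1)   # B x D weighted passage embedding
print(passage_repr.shape)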
Example #38
def train(
        batch_size=64,
        n_epochs=25,
        ):
    # 1 denotes a positive rule, 0 a negative one
    rules = [["sweatshirts", "activewear pants", 1], ["cashmere", "leather", 1], ["tank tops", "shorts", 1]]

    extract_rule.extract(rules)
    rule_num = len(rules)

    # parameters of text
    non_static = False
    filter_hs = [2, 3, 4, 5]
    hidden_units = [100, 2]
    conv_non_linear = "relu"
    img_w = 300

    print "loading w2v data...",
    x = cPickle.load(open("./cloth.binary.p", "rb"))
    revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]
    print "data loaded!"
    if non_static == True:
        print "model architecture: CNN-non-static"
        print "using: random vectors"
        U = W2
    elif non_static == False:
        print "model architecture: CNN-static"
        print "using: word2vec vectors, dim=%d" % W.shape[1]
        U = W

    # make text data
    datasets = make_idx_data(revs, word_idx_map, max_l=55, k=300, filter_h=filter_hs[-1])
    train_text_i, train_text_j, train_text_k = datasets[0], datasets[1], datasets[2]
    valid_text_i, valid_text_j, valid_text_k = datasets[3], datasets[4], datasets[5]
    test_text_i, test_text_j, test_text_k = datasets[6], datasets[7], datasets[8]

    # load visual data
    print 'loading visual data'
    print('now():' + str(datetime.now()))
    with open("./data_mm/AUC_new_dataset_train_811_norm.pkl", "rb") as f:
        train_set = np.asarray(cPickle.load(f), dtype='float32')
    with open("./data_mm/AUC_new_dataset_valid_811_norm.pkl", "rb") as f:
        valid_set = np.asarray(cPickle.load(f), dtype='float32')
    with open("./data_mm/AUC_new_dataset_test_811_norm.pkl", "rb") as f:
        test_set = np.asarray(cPickle.load(f), dtype='float32')
    print 'visual data loaded'
    print('now():' + str(datetime.now()))

    print 'loading rule ind'
    print 'loading train ind'
    with open("./rule_ind/train_rules_ind.pkl", "rb") as f:
        train_rules_ind = np.asarray(cPickle.load(f), dtype='float32')

    print 'loading valid ind'
    with open("./rule_ind/valid_rules_ind.pkl", "rb") as f:
        valid_rules_ind = np.asarray(cPickle.load(f), dtype='float32')

    print 'loading test ind'
    with open("./rule_ind/test_rules_ind.pkl", "rb") as f:
        test_rules_ind = np.asarray(cPickle.load(f), dtype='float32')
    print 'rules ind loaded'

    train_set_size = train_set[0].shape[0]
    valid_set_size = valid_set[0].shape[0]
    test_set_size = test_set[0].shape[0]

    train_set_i, train_set_j, train_set_k = train_set[0], train_set[1], train_set[2]
    valid_set_i, valid_set_j, valid_set_k = valid_set[0], valid_set[1], valid_set[2]
    test_set_i, test_set_j, test_set_k = test_set[0], test_set[1], test_set[2]

    train_rules_ind = train_rules_ind[0]
    valid_rules_ind = valid_rules_ind[0]
    test_rules_ind = test_rules_ind[0]

    np.random.seed(3435)
    # training data
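    # Pad the training set to an exact multiple of batch_size by re-appending its first
    # `extra_data_num` examples; the rule-indicator tensors get the same padding below.
    # (The disabled permutation block would shuffle the data before padding.)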
    if train_set_size % batch_size > 0:
        extra_data_num = batch_size - train_set_size % batch_size
        '''
        permutation_order = np.random.permutation(train_set_size)
        train_set_i = train_set_i[permutation_order]
        train_set_j = train_set_j[permutation_order]
        train_set_k = train_set_k[permutation_order]
        train_text_i = train_text_i[permutation_order]
        train_text_j = train_text_j[permutation_order]
        train_text_k = train_text_k[permutation_order]
        '''
        extra_data_i = train_set_i[:extra_data_num]
        extra_data_j = train_set_j[:extra_data_num]
        extra_data_k = train_set_k[:extra_data_num]
        extra_text_i = train_text_i[:extra_data_num]
        extra_text_j = train_text_j[:extra_data_num]
        extra_text_k = train_text_k[:extra_data_num]

        train_set_i = np.append(train_set_i, extra_data_i, axis=0)
        train_set_j = np.append(train_set_j, extra_data_j, axis=0)
        train_set_k = np.append(train_set_k, extra_data_k, axis=0)
        train_text_i = np.append(train_text_i, extra_text_i, axis=0)
        train_text_j = np.append(train_text_j, extra_text_j, axis=0)
        train_text_k = np.append(train_text_k, extra_text_k, axis=0)

        new_train_rules_ind = np.zeros(
            (len(train_rules_ind), len(train_rules_ind[0]) + extra_data_num, len(train_rules_ind[0][0])))
        for i in range(len(train_rules_ind)):
            #train_rules_ind[i] = train_rules_ind[i][permutation_order]
            extra_rules_ind_i = train_rules_ind[i][:extra_data_num]
            train_rules_ind_i = np.append(train_rules_ind[i], extra_rules_ind_i, axis=0)
            new_train_rules_ind[i] = train_rules_ind_i
        #print(len(new_train_rules_ind[0]))
        train_rules_ind = new_train_rules_ind

    train_set_size = train_set_i.shape[0]
    train_set_i = shared_dataset_x(train_set_i)
    train_set_j = shared_dataset_x(train_set_j)
    train_set_k = shared_dataset_x(train_set_k)
    train_text_i = shared_dataset_x(train_text_i)
    train_text_j = shared_dataset_x(train_text_j)
    train_text_k = shared_dataset_x(train_text_k)
    train_rules_ind = theano.shared(np.asarray(train_rules_ind,dtype=theano.config.floatX),borrow=True)

    # valid data
    if valid_set_size % batch_size > 0:
        extra_data_num = batch_size - valid_set_size % batch_size
        '''
        permutation_order = np.random.permutation(valid_set_size)
        valid_set_i = valid_set_i[permutation_order]
        valid_set_j = valid_set_j[permutation_order]
        valid_set_k = valid_set_k[permutation_order]
        valid_text_i = valid_text_i[permutation_order]
        valid_text_j = valid_text_j[permutation_order]
        valid_text_k = valid_text_k[permutation_order]
        '''
        extra_data_i = valid_set_i[:extra_data_num]
        extra_data_j = valid_set_j[:extra_data_num]
        extra_data_k = valid_set_k[:extra_data_num]
        extra_text_i = valid_text_i[:extra_data_num]
        extra_text_j = valid_text_j[:extra_data_num]
        extra_text_k = valid_text_k[:extra_data_num]

        valid_set_i = np.append(valid_set_i, extra_data_i, axis=0)
        valid_set_j = np.append(valid_set_j, extra_data_j, axis=0)
        valid_set_k = np.append(valid_set_k, extra_data_k, axis=0)
        valid_text_i = np.append(valid_text_i, extra_text_i, axis=0)
        valid_text_j = np.append(valid_text_j, extra_text_j, axis=0)
        valid_text_k = np.append(valid_text_k, extra_text_k, axis=0)

        new_valid_rules_ind = np.zeros(
            (len(valid_rules_ind), len(valid_rules_ind[0]) + extra_data_num, len(valid_rules_ind[0][0])))
        for i in range(len(valid_rules_ind)):
            # valid_rules_ind[i] = valid_rules_ind[i][permutation_order]
            extra_rules_ind_i = valid_rules_ind[i][:extra_data_num]
            valid_rules_ind_i = np.append(valid_rules_ind[i], extra_rules_ind_i, axis=0)
            new_valid_rules_ind[i] = valid_rules_ind_i
        # print(len(new_valid_rules_ind[0]))
        valid_rules_ind = new_valid_rules_ind


    valid_set_size = valid_set_i.shape[0]
    valid_set_i = shared_dataset_x(valid_set_i)
    valid_set_j = shared_dataset_x(valid_set_j)
    valid_set_k = shared_dataset_x(valid_set_k)
    valid_text_i = shared_dataset_x(valid_text_i)
    valid_text_j = shared_dataset_x(valid_text_j)
    valid_text_k = shared_dataset_x(valid_text_k)
    valid_rules_ind = theano.shared(np.asarray(valid_rules_ind,dtype=theano.config.floatX),borrow=True)

    # test data
    if test_set_size % batch_size > 0:
        extra_data_num = batch_size - test_set_size % batch_size
        '''
        permutation_order = np.random.permutation(test_set_size)
        test_set_i = test_set_i[permutation_order]
        test_set_j = test_set_j[permutation_order]
        test_set_k = test_set_k[permutation_order]
        test_text_i = test_text_i[permutation_order]
        test_text_j = test_text_j[permutation_order]
        test_text_k = test_text_k[permutation_order]
        '''
        extra_data_i = test_set_i[:extra_data_num]
        extra_data_j = test_set_j[:extra_data_num]
        extra_data_k = test_set_k[:extra_data_num]
        extra_text_i = test_text_i[:extra_data_num]
        extra_text_j = test_text_j[:extra_data_num]
        extra_text_k = test_text_k[:extra_data_num]

        test_set_i = np.append(test_set_i, extra_data_i, axis=0)
        test_set_j = np.append(test_set_j, extra_data_j, axis=0)
        test_set_k = np.append(test_set_k, extra_data_k, axis=0)
        test_text_i = np.append(test_text_i, extra_text_i, axis=0)
        test_text_j = np.append(test_text_j, extra_text_j, axis=0)
        test_text_k = np.append(test_text_k, extra_text_k, axis=0)

        new_test_rules_ind = np.zeros(
            (len(test_rules_ind), len(test_rules_ind[0]) + extra_data_num, len(test_rules_ind[0][0])))
        for i in range(len(test_rules_ind)):
            # test_rules_ind[i] = test_rules_ind[i][permutation_order]
            extra_rules_ind_i = test_rules_ind[i][:extra_data_num]
            test_rules_ind_i = np.append(test_rules_ind[i], extra_rules_ind_i, axis=0)
            new_test_rules_ind[i] = test_rules_ind_i
        # print(len(new_test_rules_ind[0]))
        test_rules_ind = new_test_rules_ind

    test_set_size = test_set_i.shape[0]
    test_set_i = shared_dataset_x(test_set_i)
    test_set_j = shared_dataset_x(test_set_j)
    test_set_k = shared_dataset_x(test_set_k)
    test_text_i = shared_dataset_x(test_text_i)
    test_text_j = shared_dataset_x(test_text_j)
    test_text_k = shared_dataset_x(test_text_k)
    test_rules_ind = theano.shared(np.asarray(test_rules_ind,dtype=theano.config.floatX),borrow=True)

    print 'train size: %d, valid size: %d, test size: %d' % (train_set_size, valid_set_size, test_set_size)
    n_train_batches = train_set_size / batch_size
    n_valid_batches = valid_set_size / batch_size
    n_test_batches = test_set_size / batch_size

    iteration = 0
    best_val_q_perf = 0.0
    best_test_q_perf = 0.0
    ret_test_q_perf = 0.0
    ret_test_p_perf = 0.0
    ret_iteration = 0
    ret_dropout_rate = 0.0
    ret_mu_param = 0.0

    #_attention_hidden = 512
    #_learning_rate = 0.05

    for _learning_rate in [0.05]:
        for _mu_param in [[0.01, 0.1],[0.001,0.05]]:
            for _attention_hidden in [256,512]:
                # parameters of classifier
                n_hidden = 1024
                n_in = 4096
                n_out = n_hidden
                n2_in = 400
                n2_out = n_hidden
                dropout_rate_v = 0.0
                dropout_rate_t = 0.4

                # parameters of logicnn
                pi_params = [0.95, 0]       #pi = [1.0, 0] C=0 is train p only
                learning_rate = _learning_rate
                momentum = 0.9
                C = 3.0
                mu_param = _mu_param    #weight of Sqr
                attention_hidden = _attention_hidden  #hidden num of attention

                index = T.lscalar()
                input1 = T.matrix('input1')
                input2 = T.matrix('input2')
                input3 = T.matrix('input3')
                input1_t = T.matrix('input1_t')
                input2_t = T.matrix('input2_t')
                input3_t = T.matrix('input3_t')
                rules_ind = T.ftensor3('rules_ind')

                # convolution setup
                rng = np.random.RandomState(3435)
                img_h = len(datasets[0][0])
                filter_w = img_w
                feature_maps = hidden_units[0]
                filter_shapes = []
                pool_sizes = []
                for filter_h in filter_hs:
                    filter_shapes.append((feature_maps, 1, filter_h, filter_w))
                    pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
                parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes),
                              ("hidden_units", hidden_units),
                              ("conv_non_linear", conv_non_linear)]
                print parameters
                Words = theano.shared(value=U, name="Words")
                zero_vec_tensor = T.vector()
                zero_vec = np.zeros(img_w)
                set_zero = theano.function([zero_vec_tensor],
                                           updates=[(Words, T.set_subtensor(Words[0, :], zero_vec_tensor))],
                                           allow_input_downcast=True)
                layer0_input_i = Words[T.cast(input1_t.flatten(), dtype="int32")].reshape(
                    (input1_t.shape[0], 1, input1_t.shape[1], Words.shape[1]))
                layer0_input_j = Words[T.cast(input2_t.flatten(), dtype="int32")].reshape(
                    (input2_t.shape[0], 1, input2_t.shape[1], Words.shape[1]))
                layer0_input_k = Words[T.cast(input3_t.flatten(), dtype="int32")].reshape(
                    (input3_t.shape[0], 1, input3_t.shape[1], Words.shape[1]))

                layer0_input = [layer0_input_i, layer0_input_j, layer0_input_k]

                # convolution
                conv_layers = []
                layer1_inputs_i = []
                layer1_inputs_j = []
                layer1_inputs_k = []
                for i in xrange(len(filter_hs)):
                    filter_shape = filter_shapes[i]
                    pool_size = pool_sizes[i]

                    conv_layer = matching_attention_classes.LeNetConvPoolLayer(rng, input=layer0_input,
                                                    image_shape=(batch_size, 1, img_h, img_w),
                                                    filter_shape=filter_shape, poolsize=pool_size,
                                                    non_linear=conv_non_linear)
                    layer1_input_i = conv_layer.output_i.flatten(2)
                    layer1_input_j = conv_layer.output_j.flatten(2)
                    layer1_input_k = conv_layer.output_k.flatten(2)
                    conv_layers.append(conv_layer)
                    layer1_inputs_i.append(layer1_input_i)
                    layer1_inputs_j.append(layer1_input_j)
                    layer1_inputs_k.append(layer1_input_k)

                layer1_input_i = T.concatenate(layer1_inputs_i, 1)
                layer1_input_j = T.concatenate(layer1_inputs_j, 1)
                layer1_input_k = T.concatenate(layer1_inputs_k, 1)

                network = matching_attention_classes.MLP(rng,
                              input1=input1,
                              input2=input2,
                              input3=input3,
                              input1_t=layer1_input_i,
                              input2_t=layer1_input_j,
                              input3_t=layer1_input_k,
                              dropout_rate_v=dropout_rate_v,
                              dropout_rate_t=dropout_rate_t,
                              n_in=n_in,
                              n_out=n_out,
                              n2_in=n2_in,
                              n2_out=n2_out
                                )
                rules = []
                for i in range(rule_num):
                    rules.append(matching_attention_classes.Rule(rules_ind[i]))

                new_pi = get_pi(cur_iter=0, params=pi_params)
                logic_nn = matching_attention_classes.LogicNN(input1=input1,
                                   input2=input2,
                                   input3=input3,
                                   network=network,
                                   rules=rules,
                                   rule_num=rule_num,
                                   n_hidden=n_hidden,
                                   attention_hidden=attention_hidden,
                                   C=C,
                                   pi=new_pi,
                                   mu_param=mu_param)

                # parameters to update
                params = logic_nn.params
                for conv_layer in conv_layers:
                    params += conv_layer.params
                if non_static:
                    params += [Words]

                cost = logic_nn.cost()
                dropout_cost = logic_nn.dropout_cost()

                # momentum
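                # Classical momentum: each parameter p keeps a velocity mparam_i that is
                # updated as v <- momentum * v - learning_rate * grad, and the parameter
                # then moves along the velocity, p <- p + v.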
                gparams = T.grad(dropout_cost, params)
                updates = []
                for p, g in zip(params, gparams):
                    mparam_i = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
                    v = momentum * mparam_i - learning_rate * g
                    updates.append((mparam_i, v))
                    updates.append((p, p + v))

                train_model = theano.function([index], cost, updates=updates,
                                              givens={
                                                input1: train_set_i[index * batch_size:(index + 1) * batch_size],
                                                input2: train_set_j[index * batch_size:(index + 1) * batch_size],
                                                input3: train_set_k[index * batch_size:(index + 1) * batch_size],
                                                input1_t: train_text_i[index * batch_size:(index + 1) * batch_size],
                                                input2_t: train_text_j[index * batch_size:(index + 1) * batch_size],
                                                input3_t: train_text_k[index * batch_size:(index + 1) * batch_size],
                                                rules_ind: train_rules_ind[:,index * batch_size:(index + 1) * batch_size]
                                              },
                                              allow_input_downcast=True,
                                              on_unused_input='warn')

                train_test_model = theano.function([index], logic_nn.sup(),
                                                   givens={
                                                    input1: train_set_i[index * batch_size:(index + 1) * batch_size],
                                                    input2: train_set_j[index * batch_size:(index + 1) * batch_size],
                                                    input3: train_set_k[index * batch_size:(index + 1) * batch_size],
                                                    input1_t: train_text_i[index * batch_size:(index + 1) * batch_size],
                                                    input2_t: train_text_j[index * batch_size:(index + 1) * batch_size],
                                                    input3_t: train_text_k[index * batch_size:(index + 1) * batch_size],
                                                    rules_ind: train_rules_ind[:,index * batch_size:(index + 1) * batch_size]
                                                   },
                                                   allow_input_downcast=True,
                                                   on_unused_input='warn')

                val_model = theano.function([index], logic_nn.sup(),
                                            givens={
                                                input1: valid_set_i[index * batch_size:(index + 1) * batch_size],
                                                input2: valid_set_j[index * batch_size:(index + 1) * batch_size],
                                                input3: valid_set_k[index * batch_size:(index + 1) * batch_size],
                                                input1_t: valid_text_i[index * batch_size:(index + 1) * batch_size],
                                                input2_t: valid_text_j[index * batch_size:(index + 1) * batch_size],
                                                input3_t: valid_text_k[index * batch_size:(index + 1) * batch_size],
                                                rules_ind: valid_rules_ind[:,index * batch_size:(index + 1) * batch_size]
                                            },
                                            allow_input_downcast=True,
                                            on_unused_input='warn')

                test_model = theano.function([index], logic_nn.sup(),
                                             givens={
                                                 input1: test_set_i[index * batch_size:(index + 1) * batch_size],
                                                 input2: test_set_j[index * batch_size:(index + 1) * batch_size],
                                                 input3: test_set_k[index * batch_size:(index + 1) * batch_size],
                                                 input1_t: test_text_i[index * batch_size:(index + 1) * batch_size],
                                                 input2_t: test_text_j[index * batch_size:(index + 1) * batch_size],
                                                 input3_t: test_text_k[index * batch_size:(index + 1) * batch_size],
                                                 rules_ind: test_rules_ind[:,index * batch_size:(index + 1) * batch_size]
                                             },
                                             allow_input_downcast=True,
                                             on_unused_input='warn')
                test_mijk = theano.function([index], logic_nn.mijk(),
                                            givens={
                                                input1: test_set_i[index * batch_size:(index + 1) * batch_size],
                                                input2: test_set_j[index * batch_size:(index + 1) * batch_size],
                                                input3: test_set_k[index * batch_size:(index + 1) * batch_size],
                                                input1_t: test_text_i[index * batch_size:(index + 1) * batch_size],
                                                input2_t: test_text_j[index * batch_size:(index + 1) * batch_size],
                                                input3_t: test_text_k[index * batch_size:(index + 1) * batch_size],
                                                rules_ind: test_rules_ind[:,index * batch_size:(index + 1) * batch_size]
                                            },
                                            allow_input_downcast=True,
                                            on_unused_input='warn')

                print 'training...'
                fi = open('mm_attention_color_coatdress.txt', 'a+')
                epoch = 0
                batch = 0
                iteration += 1
                best_val_p_iter = 0.0
                best_test_q_iter = 0.0
                print 'iteration: %i' % iteration
                fi.write('################iteration: %f\n' % iteration)
                fi.write('parameters: hidden%.5f, attention_hidden%f, lr: %.4f, mu: %.5f %.5f\n' % (n_hidden, attention_hidden, learning_rate,mu_param[0],mu_param[1]))
                fi.flush()


                while (epoch < n_epochs):
                    start_time = time.time()
                    epoch = epoch + 1
                    if epoch > 5:
                        learning_rate = 0.02
                    cost = 0.0
                    L_sup = 0.0
                    L_p_q = 0.0
                    L_sqr = 0.0


                    # train
                    for minibatch_index in xrange(n_train_batches):
                        batch = batch + 1
                        new_pi = get_pi(cur_iter=batch * 1. / n_train_batches, params=pi_params)
                        logic_nn.set_pi(new_pi)
                        set_zero(zero_vec)
                        cost_batch = train_model(minibatch_index)
                        cost += cost_batch[0]
                        L_sup += cost_batch[1]
                        L_p_q += cost_batch[2]
                        L_sqr += cost_batch[3]


                    print 'epoch: %i, cost: %.4f, L_sup: %.4f, L_p_q: %.4f, L_sqr: %.4f' % (
                    epoch, cost, L_sup, L_p_q, L_sqr)


                    # training result
                    train_sup = [train_test_model(i) for i in xrange(n_train_batches)]
                    train_sup = np.array(train_sup)
                    train_q_sup = train_sup[:, 0]
                    train_p_sup = train_sup[:, 1]
                    count_q = 0.0
                    count_p = 0.0
                    for i in range(train_q_sup.shape[0]):
                        for j in range(train_q_sup.shape[1]):
                            if train_q_sup[i, j, 0] > 0.5:
                                count_q += 1
                            if train_p_sup[i, j, 0] > 0.5:
                                count_p += 1
                    train_q_perf = count_q / (train_q_sup.shape[0] * train_q_sup.shape[1])
                    train_p_perf = count_p / (train_p_sup.shape[0] * train_p_sup.shape[1])
                    print('training time: %.2f secs; q_train perf: %.4f %% ,p_train perf: %.4f %% ' % \
                          (time.time() - start_time, train_q_perf * 100., train_p_perf * 100.))


                    # valid result
                    valid_sup = [val_model(i) for i in xrange(n_valid_batches)]
                    valid_sup = np.array(valid_sup)
                    valid_q_sup = valid_sup[:, 0]
                    valid_p_sup = valid_sup[:, 1]
                    count_q = 0.0
                    count_p = 0.0
                    for i in range(valid_q_sup.shape[0]):
                        for j in range(valid_q_sup.shape[1]):
                            if valid_q_sup[i, j, 0] > 0.5:
                                count_q += 1
                            if valid_p_sup[i, j, 0] > 0.5:
                                count_p += 1
                    val_q_perf = count_q / (valid_q_sup.shape[0] * valid_q_sup.shape[1])
                    val_p_perf = count_p / (valid_p_sup.shape[0] * valid_p_sup.shape[1])


                    # testing result
                    test_sup = [test_model(i) for i in xrange(n_test_batches)]
                    test_sup = np.array(test_sup)
                    test_q_sup = test_sup[:, 0]
                    test_p_sup = test_sup[:, 1]
                    count_q = 0.0
                    count_p = 0.0
                    for i in range(test_q_sup.shape[0]):
                        for j in range(test_q_sup.shape[1]):
                            if test_q_sup[i, j, 0] > 0.5:
                                count_q += 1
                            if test_p_sup[i, j, 0] > 0.5:
                                count_p += 1
                    test_q_perf = count_q / (test_q_sup.shape[0] * test_q_sup.shape[1])
                    test_p_perf = count_p / (test_p_sup.shape[0] * test_p_sup.shape[1])


                    print 'valid perf: q %.4f %%, p %.4f %%' % (val_q_perf * 100., val_p_perf * 100.)
                    print 'test perf: q %.4f %%, p %.4f %%' % (test_q_perf * 100., test_p_perf * 100.)
                    fi.write('%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n' % (
                        cost, L_sup, L_p_q, L_sqr, train_q_perf * 100., train_p_perf * 100., val_q_perf * 100.,
                        val_p_perf * 100., test_q_perf * 100., test_p_perf * 100.))
                    fi.flush()


                    # select
                    if test_q_perf > best_test_q_iter:
                        best_test_q_iter = test_q_perf
                        iter_test_q_perf = test_q_perf
                        iter_test_p_perf = test_p_perf
                    if test_q_perf > best_test_q_perf:
                        best_test_q_perf = test_q_perf
                        ret_test_q_perf = test_q_perf
                        ret_test_p_perf = test_p_perf
                        ret_iteration = iteration
                        ret_dropout_rate = dropout_rate_v

                        best_w1 = network.W1.get_value()
                        best_w2 = network.W2.get_value()
                        best_w1t = network.W1t.get_value()
                        best_w2t = network.W2t.get_value()
                        best_b1 = network.b1.get_value()
                        best_b2 = network.b2.get_value()
                        count = 0
                        for conv_layer in conv_layers:
                            if count == 0:
                                wc0 = conv_layer.W.get_value()
                                bc0 = conv_layer.b.get_value()
                            if count == 1:
                                wc1 = conv_layer.W.get_value()
                                bc1 = conv_layer.b.get_value()
                            if count == 2:
                                wc2 = conv_layer.W.get_value()
                                bc2 = conv_layer.b.get_value()
                            if count == 3:
                                wc3 = conv_layer.W.get_value()
                                bc3 = conv_layer.b.get_value()
                            count += 1
                        mij = []
                        mik = []
                        qmij = []
                        qmik = []
                        rule_lambda = []
                        masks = []
                        raw_rule_lambda = []
                        for batch_index in xrange(n_test_batches):
                            mijk_batch = test_mijk(batch_index)
                            mij.append(np.array(mijk_batch[0]))
                            mik.append(np.array(mijk_batch[1]))
                            qmij.append(np.array(mijk_batch[2]))
                            qmik.append(np.array(mijk_batch[3]))
                            rule_lambda.append(np.array(mijk_batch[4]))
                            masks.append(np.array(mijk_batch[5]))
                            raw_rule_lambda.append(np.array(mijk_batch[6]))
                        mij = np.array(mij)
                        mik = np.array(mik)
                        qmij = np.array(qmij)
                        qmik = np.array(qmik)
                        rule_lambda = np.array(rule_lambda,dtype="float32")
                        masks = np.array(masks,dtype="float32")
                        raw_rule_lambda = np.array(raw_rule_lambda,dtype="float32")

                print '### iteration: %i: test q perf: %.4f%%, test p perf: %.4f%%' % (
                    iteration, iter_test_q_perf * 100., iter_test_p_perf * 100.)
                fi.write('### iteration: %i, test q perf: %.4f %%, test p perf: %.4f %%\n' % (
                    iteration, iter_test_q_perf * 100., iter_test_p_perf * 100.))
                fi.flush()
                fi.close()
                
    print '##best q perf: %.4f%%, p perf: %.4f%%' % (ret_test_q_perf * 100., ret_test_p_perf * 100.)
    print 'in iteration: %i, dropout_rate: %.4f' % (
        ret_iteration, ret_dropout_rate)

    np.savetxt('./parameters/mij_Ar.csv', mij)
    np.savetxt('./parameters/mik_Ar.csv', mik)
    np.savetxt('./parameters/qmij_Ar.csv', qmij)
    np.savetxt('./parameters/qmik_Ar.csv', qmik)
    cPickle.dump(rule_lambda, open('./parameters/rule_lambda.pkl', "wb"))
    cPickle.dump(masks, open('./parameters/masks.pkl', "wb"))
    cPickle.dump(raw_rule_lambda, open('./parameters/raw_rule_lambda.pkl', "wb"))

    np.savetxt('./parameters/W1.csv', best_w1)
    np.savetxt('./parameters/W2.csv', best_w2)
    np.savetxt('./parameters/W1t.csv', best_w1t)
    np.savetxt('./parameters/W2t.csv', best_w2t)
    np.savetxt('./parameters/b1.csv', best_b1)
    np.savetxt('./parameters/b2.csv', best_b2)
    cPickle.dump(wc0, open("./parameters/Wc0.pkl", "wb"))
    cPickle.dump(wc1, open("./parameters/Wc1.pkl", "wb"))
    cPickle.dump(wc2, open("./parameters/Wc2.pkl", "wb"))
    cPickle.dump(wc3, open("./parameters/Wc3.pkl", "wb"))
    cPickle.dump(bc0, open("./parameters/bc0.pkl", "wb"))
    cPickle.dump(bc1, open("./parameters/bc1.pkl", "wb"))
    cPickle.dump(bc2, open("./parameters/bc2.pkl", "wb"))
    cPickle.dump(bc3, open("./parameters/bc3.pkl", "wb"))
def build_model(options, tparams):
    """Build up the whole computation graph
    The input is the features extracted from GoogLeNet.
    """

    last_n = options['last_n']
    actionNum = options['actions']
    decay_c = options['decay_c']
    use_dropout = options['use_dropout']
    use_wta = options['use_wta']

    location_dim = options['locations']
    feature_dim = options['featureMaps']

    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))
    """combine model"""
    x = T.ftensor3('x')
    n_steps = x.shape[0]
    n_samples = x.shape[1]
    mask = T.fmatrix('mask')
    y = T.ftensor3('y')
    # one hot vector,n_steps*n_samples*actionNum

    _x = x.reshape([n_steps * n_samples, location_dim, feature_dim])
    feature = _x.mean(1)
    # feature=feature/feature.max(1,keepdims=True);
    feature = feature.reshape([n_steps, n_samples, feature_dim])
    feature = feature + use_noise * trng.normal(
        feature.shape, avg=1, std=0.05, dtype=feature.dtype)
    #noisy

    if use_dropout: feature = dropout_layer(feature, use_noise, trng)
    if use_wta: feature = WTA_Layer(feature, 4, 2, ndim=options['featureMaps'])

    f1 = ff_build(tparams,
                  feature,
                  prefix="recog",
                  name='fullconn',
                  active="tanh")
    if use_dropout: f1 = dropout_layer(f1, use_noise, trng)
    lin = ff_build(tparams, f1, prefix="recog", name='output', active="linear")
    # n_steps*n_samples*actionNum
    probs = T.nnet.softmax(lin.reshape([-1, actionNum]))
    probs = probs.reshape([n_steps, n_samples, actionNum])
    """compute cost"""
    cost = 0
    # cross entropy
    entropy_cost = -y * T.log(probs + 1e-8)
    entropy_cost = (entropy_cost.sum(2) * mask).mean(0).sum() * 100

    cost += entropy_cost
    # weight decay
    weight_decay = 0.
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    """Predictions"""
    preds = T.sum(probs[-last_n:, :, :], axis=0)
    preds = T.argmax(preds, axis=1)
    # n_samples
    # preds=T.argmax(probs[-last_n:,:,:],axis=2);

    return cost, preds, [], [x, mask, y], use_noise
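# NumPy sketch of the cost and prediction defined above: the per-step cross entropy is masked
# by the valid-frame mask, averaged over time and summed over samples (times 100), and the
# prediction averages the class probabilities of the last `last_n` steps. Sizes are made up.
import numpy as np

n_steps, n_samples, actionNum, last_n = 4, 2, 3, 2
probs = np.random.dirichlet(np.ones(actionNum), size=(n_steps, n_samples))
y = np.eye(actionNum)[np.random.randint(actionNum, size=(n_steps, n_samples))]
mask = np.ones((n_steps, n_samples))

entropy_cost = -(y * np.log(probs + 1e-8)).sum(axis=2)
cost = (entropy_cost * mask).mean(axis=0).sum() * 100

preds = probs[-last_n:].sum(axis=0).argmax(axis=1)   # one predicted class per sample
print(cost, preds)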
Example #40
    def __init__(self, nam, maxlen=0, load=False, training=False):

        # Create 2 LSTM units (parameters W, U, b), store them in a dictionary,
        # and initialize the parameters with a Gaussian initialization
        # Type: Dictionary
        self.maxlen = maxlen
        newp = creatrnnx() 
        self.model_name = nam
        # Make the initial weights (W, U, b) of both LSTM units identical
        for i in newp.keys():
            if i[0] == '1':
                newp['2' + i[1:]] = newp[i]

        # Create 5 symbolic tensor variables (y, mask11, mask21, emb11, emb21)
        # Here, config.floatX = 'float32'
        y = T.vector('y', dtype = config.floatX)
        mask11 = T.matrix('mask11', dtype = config.floatX)
        mask21 = T.matrix('mask21', dtype = config.floatX)
        emb11 = T.ftensor3('emb11')
        emb21 = T.ftensor3('emb21') # 3-D float-type tensor

        # Load the existed model (pre-trained weights) if needed
        if load == True:
            newp = pickle.load(open(nam,'rb'))

        # Convert 'newp' to shared-tensor-type dictionary 'tnewp'
        # Shared tensor variables
        self.tnewp = init_tparams(newp)

        # Set tensor-type noise 
        use_noise = theano.shared(numpy_floatX(0.))

        # Set tensor-type random number generator
        # rng -> random number generator
        trng = RandomStreams(1234)

        # ??? rrng?
        # create a 3-D random tensor for "dropout"?
        rate = 0.5
        rrng = trng.binomial(emb11.shape, p = 1 - rate, n = 1, dtype = emb11.dtype)
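        # rrng is a dropout mask with the same shape as emb11: each entry is drawn from a
        # binomial with n = 1 and keep probability 1 - rate, i.e. 1 with probability 0.5
        # and 0 otherwise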
        # print "rrng:"
        # print "type of rrng:", type(rrng)
        # print rrng
        
        # Build the LSTM structure and parameters (the core part); proj holds the LSTM outputs for one mini-batch
        # Implement the LSTM module;
        # Here 'False' -> NOT apply DROPOUT layers;
        # Since the input is in the format: (Max No. of words in batch, No. of Samples, 300) 
        # Note that the 1st and 2nd dimensions are exchanged!
        # Only the last timestep of the scan loop is kept (hence the [-1]); the intermediate LSTM outputs are discarded
        # proj11[-1] -> (No. of samples[N], Hidden unit dimension[timesteps]) -> (N, 50)
        # proj11 takes the inputs as embedding matrix emb1 and gives the o/p of the LSTM_A
        proj11 = getpl2(emb11, '1lstm1', mask11, False, rrng, 50, self.tnewp)[-1]
        proj21 = getpl2(emb21, '2lstm1', mask21, False, rrng, 50, self.tnewp)[-1]

        # Define the cost function
        dif = (proj21 - proj11).norm(L = 1, axis = 1)
        s2 = T.exp(-dif)
        sim = T.clip(s2, 1e-7, 1.0-1e-7) # Similarity
        lr = tensor.scalar(name = 'lr') # learning rate
        ys = T.clip((y-1.0) / 4.0, 1e-7, 1.0-1e-7)
        cost = T.mean((sim - ys) ** 2)
        ns=emb11.shape[1]
        self.f2sim = theano.function([emb11, mask11, emb21, mask21], sim, allow_input_downcast = True)
        self.f_proj11 = theano.function([emb11, mask11], proj11, allow_input_downcast = True) # NOT used
        self.f_cost = theano.function([emb11, mask11, emb21, mask21, y], cost, allow_input_downcast = True) # NOT used

        # Prepare for the backpropogation and gradiant descend
        if training == True:
            
            # Compute the gradients of the cost w.r.t. the parameters of both LSTMs
            # gradi is the list of gradients w.r.t. the cost that will be used to update the weights
            # We average out the gradients by appending to another list grads[]
            # So, we average out the gradients : wrt LSTM_A and wrt LSTM_B 
            # i.e, gradient= (grad(wrt(LSTM_A)+grad(wrt(LSTM_B))/2.0 to maintain the symmetricity between either LSTMs
            # wrt: (variable or list of variables) – term[s] for which we want gradients
            gradi = tensor.grad(cost, wrt = self.tnewp.values()) # T.grad -> differential
            grads = []
            l = len(gradi)
            for i in range(0, l / 2):
                gravg = (gradi[i] + gradi[i + l / 2]) / (4.0)
                # print i, i+9
                grads.append(gravg)
            for i in range(0, len(self.tnewp.keys()) / 2):
                grads.append(grads[i])
            
            # Here, the f_grad_shared and f_update are theano functions
            self.f_grad_shared, self.f_update = adadelta(lr, self.tnewp, grads, emb11, mask11, emb21, mask21, y, cost)
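# NumPy sketch of the similarity and cost defined above (a Manhattan-LSTM style objective):
# the similarity of two sentence encodings is exp(-L1 distance), clipped into (0, 1), and it is
# regressed with squared error onto the relatedness score y in [1, 5] rescaled to [0, 1].
import numpy as np

proj11 = np.random.rand(3, 50)    # encodings of sentence A, one row per pair
proj21 = np.random.rand(3, 50)    # encodings of sentence B
y = np.array([1.0, 3.0, 5.0])     # gold relatedness scores

dif = np.abs(proj21 - proj11).sum(axis=1)
sim = np.clip(np.exp(-dif), 1e-7, 1.0 - 1e-7)
ys = np.clip((y - 1.0) / 4.0, 1e-7, 1.0 - 1e-7)
cost = np.mean((sim - ys) ** 2)
print(sim, cost)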
Example #41
    def __init__(self, 
        glimpse_shape, glimpse_times, 
        dim_hidden, dim_fc, dim_out, 
        reward_base, 
        rng_std=1.0, activation=T.tanh, bptt_truncate=-1, 
        lmbd=0.1 # gdupdate + lmbd*rlupdate
        ): 
        if reward_base is None:
            reward_base = np.zeros((glimpse_times)).astype('float32')
            reward_base[-1] = 1.0
        x = T.ftensor3('x')  # N * W * H 
        y = T.ivector('y')  # label 
        lr = T.fscalar('lr')
        reward_base = theano.shared(name='reward_base', value=np.array(reward_base).astype(theano.config.floatX), borrow=True) # Time (vector)
        reward_bias = T.fvector('reward_bias')
        rng = MRG_RandomStreams(np.random.randint(9999999))
#       rng = theano.tensor.shared_randomstreams.RandomStreams(np.random.randint(9999999))
    
        i = InputLayer(x)
        au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng, rng_std, activation, bptt_truncate)
#       All hidden states are put into decoder
#       layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))]
#       dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out]
#       Only the last hidden states
        layers = [i, au, InputLayer(au.output[:,-1,:])]
        dim_fc = [dim_hidden] + dim_fc + [dim_out]
        for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
            fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
            layers.append(fc)
        sm = SoftmaxLayer(layers[-1].output)
        layers.append(sm)

        output = sm.output       # N * classes 
        hidoutput = au.output    # N * dim_output 
        location = au.location   # N * T * dim_hidden
        prediction = output.argmax(1) # N

        # calc
        equalvec = T.eq(prediction, y) # [0, 1, 0, 0, 1 ...]
        correct = T.cast(T.sum(equalvec), 'float32')
#       noequalvec = T.neq(prediction, y)
#       nocorrect = T.cast(T.sum(noequalvec), 'float32')
        logLoss = T.log(output)[T.arange(y.shape[0]), y] # 
        reward_biased = T.outer(equalvec, reward_base)-reward_bias.dimshuffle('x', 0)
            # N * Time
            # (R_t - b_t), where b = E[R]
        
        # gradient descent
        gdobjective = logLoss.sum()/x.shape[0]  # correct * dim_output (only has value on the correctly predicted sample)
        gdparams = reduce(lambda x, y: x+y.params, layers, []) 
        gdupdates = map(lambda x: (x, x+lr*T.grad(gdobjective, x)), gdparams)

        # reinforce learning
        rlobjective = (reward_biased.dimshuffle(0, 1, 'x') * T.log(au.location_p)).sum() / x.shape[0]
            # location_p: N * Time * 2
            # location_logp: N * Time
            # reward_biased: N * 2
        rlparams = au.reinforceParams 
        rlupdates = map(lambda x: (x, x+lr*lmbd*T.grad(rlobjective, x)), rlparams)

        # Hidden state keeps unchange in time
        deltas = T.stack(*[((au.output[:,i,:].mean(0)-au.output[:,i+1,:].mean(0))**2).sum()  for i in xrange(glimpse_times-1)])
            # N * Time * dim_hidden
         
        print 'compile step()'
        self.step = theano.function([x, y, lr, reward_bias], [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)], updates=gdupdates+rlupdates)
    #       print 'compile gdstep()'
    #       self.gdstep = theano.function([x, y, lr], [gdobjective, correct, location], updates=gdupdates)
    #       print 'compile rlstep()'
    #       self.rlstep = theano.function([x, y, lr], [rlobjective], updates=rlupdates)
        print 'compile predict()'
        self.predict = theano.function([x], prediction)
#       print 'compile forward()'
#       self.forward = theano.function([x], map(lambda x: x.output, layers)) #[layers[-3].output, fc.output])
#       print 'compile error()'
#       self.error = theano.function([x, y], gdobjective)
        print 'compile locate()'
        self.locate = theano.function([x], [au.location_mean, location]) #[layers[-3].output, fc.output])
        print 'compile debug()'
        self.debug = theano.function([x, y, lr, reward_bias], [deltas, au.location_p], on_unused_input='warn')

        # self.xxx
        self.glimpse_times = glimpse_times
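# NumPy sketch of the REINFORCE term above: only correctly classified samples receive the
# per-glimpse reward R_t (non-zero only at the last glimpse), a baseline b_t is subtracted to
# reduce variance, and the result weights the log-probability of the sampled glimpse locations.
# The baseline value used here is an assumption for illustration.
import numpy as np

N, T_glimpse = 4, 3
reward_base = np.zeros(T_glimpse, dtype='float32')
reward_base[-1] = 1.0                               # reward only at the final glimpse
correct = np.array([1., 0., 1., 1.])                # 1 if prediction == label
reward_bias = np.full(T_glimpse, correct.mean())    # baseline b_t ~ E[R_t] (assumed)
location_p = np.full((N, T_glimpse), 0.5)           # prob of the sampled locations

reward_biased = np.outer(correct, reward_base) - reward_bias          # N x T, (R_t - b_t)
rl_objective = (reward_biased[:, :, None] * np.log(location_p[:, :, None])).sum() / N
print(rl_objective)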
Example #42
                                      batch_size=args.batch_size,
                                      use_ivectors=True)
    valid_datastream = get_datastream(path=args.data_path,
                                      which_set=args.valid_dataset,
                                      batch_size=args.batch_size,
                                      use_ivectors=True)
    test_datastream = get_datastream(path=args.data_path,
                                     which_set=args.test_dataset,
                                     batch_size=args.batch_size,
                                     use_ivectors=True)

    #################
    # build network #
    #################
    print('Building and compiling network')
    input_data = T.ftensor3('input_data')
    input_cond = T.ftensor3('input_cond')
    input_mask = T.fmatrix('input_mask')
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')
    network_output = deep_projection_ivector_ln_model_fix(
        input_var=input_data,
        cond_var=input_cond,
        mask_var=input_mask,
        num_inputs=input_dim,
        num_outputs=output_dim,
        num_layers=args.num_layers,
        num_conds=args.num_conds,
        num_factors=args.num_factors,
        num_units=args.num_units,
        grad_clipping=args.grad_clipping,
Example #43
    def setup(self):
        """
        Set up the model to train.
        """

        # input_words: shape (n_batch, n_sentence, sentence_len)
        input_words = T.itensor3()
        n_batch, n_sentences, sentence_len = input_words.shape
        # query_words: shape (n_batch, query_len)
        query_words = T.imatrix()
        # correct_output: shape (n_batch, ?, num_output_words)
        correct_output = T.ftensor3()

        # graph_num_new_nodes: shape(n_batch, n_sentence)
        graph_num_new_nodes = T.imatrix()
        # graph_new_node_strengths: shape(n_batch, n_sentence, new_nodes_per_iter)
        graph_new_node_strengths = T.ftensor3()
        # graph_new_node_ids: shape(n_batch, n_sentence, new_nodes_per_iter, num_node_ids)
        graph_new_node_ids = T.ftensor4()
        # graph_new_edges: shape(n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types)
        graph_new_edges = T.TensorType('floatX', (False, ) * 5)()

        def _build(with_correct_graph, snap_to_best, using_dropout,
                   evaluate_accuracy):
            info = {}
            # Process each sentence, flattened to (?, sentence_len)
            flat_input_words = input_words.reshape([-1, sentence_len])
            flat_input_reprs, flat_ref_matrices = self.input_transformer.process(
                flat_input_words)
            # flat_input_reprs of shape (?, input_repr_size)
            # flat_ref_matrices of shape (?, num_node_ids, input_repr_size)
            input_reprs = flat_input_reprs.reshape(
                [n_batch, n_sentences, self.input_repr_size])
            ref_matrices = flat_ref_matrices.reshape([
                n_batch, n_sentences, self.num_node_ids, self.input_repr_size
            ])

            query_repr, query_ref_matrix = self.input_transformer.process(
                query_words)

            if using_dropout:
                iter_dropouts = []
                states_mask = util.make_dropout_mask(
                    (self.node_state_size, ), self.dropout_keep, self.srng)
                if self.nodes_mutable:
                    iter_dropouts.extend(
                        self.node_state_updater.dropout_masks(
                            self.srng, states_mask))
                if len(self.word_node_mapping) > 0:
                    iter_dropouts.extend(
                        self.direct_reference_updater.dropout_masks(
                            self.srng, states_mask))
                if self.intermediate_propagate != 0:
                    iter_dropouts.extend(
                        self.intermediate_propagator.dropout_masks(
                            self.srng, states_mask))
                if self.dynamic_nodes:
                    iter_dropouts.extend(
                        self.new_node_adder.dropout_masks(self.srng))
                iter_dropouts.extend(
                    self.edge_state_updater.dropout_masks(self.srng))
            else:
                iter_dropouts = []
                states_mask = None

            def _iter_fn(input_repr,
                         ref_matrix,
                         gstate,
                         correct_num_new_nodes=None,
                         correct_new_strengths=None,
                         correct_new_node_ids=None,
                         correct_edges=None,
                         dropout_masks=None):
                # If necessary, update node state
                if self.nodes_mutable:
                    gstate, dropout_masks = self.node_state_updater.process(
                        gstate, input_repr, dropout_masks)

                if len(self.word_node_mapping) > 0:
                    gstate, dropout_masks = self.direct_reference_updater.process(
                        gstate, ref_matrix, dropout_masks)

                # If necessary, propagate node state
                if self.intermediate_propagate != 0:
                    gstate, dropout_masks = self.intermediate_propagator.process_multiple(
                        gstate, self.intermediate_propagate, dropout_masks)

                node_loss = None
                node_accuracy = None
                # Propose and vote on new nodes
                if self.dynamic_nodes:
                    new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates(
                        gstate, input_repr, self.new_nodes_per_iter,
                        dropout_masks)
                    # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter)
                    # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids)
                    if with_correct_graph:
                        perm_idxs = np.array(
                            list(
                                itertools.permutations(
                                    range(self.new_nodes_per_iter))))
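                        # perm_idxs enumerates every ordering of the new-node slots,
                        # so the matching loss below is invariant to the arbitrary
                        # order in which the new nodes were proposed.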
                        permuted_correct_str = correct_new_strengths[:,
                                                                     perm_idxs]
                        permuted_correct_ids = correct_new_node_ids[:,
                                                                    perm_idxs]
                        # due to advanced indexing, we should have shape (n_batch, permutation, new_nodes_per_iter, num_node_ids)
                        ext_new_str = T.shape_padaxis(new_strengths, 1)
                        ext_new_ids = T.shape_padaxis(new_ids, 1)
                        strength_ll = permuted_correct_str * T.log(
                            ext_new_str +
                            util.EPSILON) + (1 - permuted_correct_str) * T.log(
                                1 - ext_new_str + util.EPSILON)
                        ids_ll = permuted_correct_ids * T.log(ext_new_ids +
                                                              util.EPSILON)
                        reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum(
                            ids_ll, axis=[2, 3])
                        if self.best_node_match_only:
                            node_loss = -T.max(reduced_perm_lls, 1)
                        else:
                            full_ll = util.reduce_log_sum(reduced_perm_lls, 1)
                            # Note that some of these permutations are identical, since we likely did not add the maximum
                            # amount of nodes. Thus we will have added repeated elements here.
                            # We have log(x+x+...+x) = log(kx), where k is the repetition factor and x is the probability we want
                            # log(kx) = log(k) + log(x)
                            # Our repetition factor k is given by (new_nodes_per_iter - correct_num_new_nodes)!
                            # Recall that n! = gamma(n+1)
                            # so log(x) = log(kx) - log(gamma(k+1))
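                            # Illustrative check with assumed values: if
                            # new_nodes_per_iter = 3 and correct_num_new_nodes = 1,
                            # then k = (3 - 1)! = 2 and log_rep_factor = gammaln(3) = log(2),
                            # so scaled_ll = log(2x) - log(2) = log(x) as intended.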
                            log_rep_factor = T.gammaln(
                                T.cast(
                                    self.new_nodes_per_iter -
                                    correct_num_new_nodes + 1, 'floatX'))
                            scaled_ll = full_ll - log_rep_factor
                            node_loss = -scaled_ll
                        if evaluate_accuracy:
                            best_match_idx = T.argmax(reduced_perm_lls, 1)
                            # should be of shape (n_batch), indexing the best permutation
                            best_correct_str = permuted_correct_str[
                                T.arange(n_batch), best_match_idx]
                            best_correct_ids = permuted_correct_ids[
                                T.arange(n_batch), best_match_idx]
                            snapped_strengths = util.independent_best(
                                new_strengths)
                            snapped_ids = util.categorical_best(
                                new_ids) * T.shape_padright(snapped_strengths)
                            close_strengths = T.all(
                                T.isclose(best_correct_str, snapped_strengths),
                                (1))
                            close_ids = T.all(
                                T.isclose(best_correct_ids, snapped_ids),
                                (1, 2))
                            node_accuracy = T.and_(close_strengths, close_ids)
                        # now substitute in the correct nodes
                        gstate = gstate.with_additional_nodes(
                            correct_new_strengths, correct_new_node_ids)
                    elif snap_to_best:
                        snapped_strengths = util.independent_best(
                            new_strengths)
                        snapped_ids = util.categorical_best(new_ids)
                        gstate = gstate.with_additional_nodes(
                            snapped_strengths, snapped_ids)
                    else:
                        gstate = gstate.with_additional_nodes(
                            new_strengths, new_ids)

                # Update edge state
                gstate, dropout_masks = self.edge_state_updater.process(
                    gstate, input_repr, dropout_masks)
                if with_correct_graph:
                    cropped_correct_edges = correct_edges[:, :gstate.n_nodes, :
                                                          gstate.n_nodes, :]
                    edge_lls = cropped_correct_edges * T.log(
                        gstate.edge_strengths +
                        util.EPSILON) + (1 - cropped_correct_edges) * T.log(
                            1 - gstate.edge_strengths + util.EPSILON)
                    # edge_lls currently penalizes for edges connected to nodes that do not exist
                    # we do not want it to do this, so we mask it with node strengths
                    mask_src = util.shape_padaxes(gstate.node_strengths,
                                                  [2, 3])
                    mask_dest = util.shape_padaxes(gstate.node_strengths,
                                                   [1, 3])
                    masked_edge_lls = edge_lls * mask_src * mask_dest
                    edge_loss = -T.sum(masked_edge_lls, axis=[1, 2, 3])
                    if evaluate_accuracy:
                        snapped_edges = util.independent_best(
                            gstate.edge_strengths)
                        close_edges = T.isclose(cropped_correct_edges,
                                                snapped_edges)
                        ok_mask = 1 - T.cast(
                            mask_src * mask_dest, 'int8'
                        )  # it's OK for things not to match if node strengths are NOT both 1
                        edge_accuracy = T.all(T.or_(close_edges, ok_mask),
                                              (1, 2, 3))
                        overall_accuracy = edge_accuracy if node_accuracy is None else T.and_(
                            node_accuracy, edge_accuracy)
                    else:
                        overall_accuracy = None
                    gstate = gstate.with_updates(
                        edge_strengths=cropped_correct_edges)
                    return gstate, node_loss, edge_loss, overall_accuracy
                elif snap_to_best:
                    snapped_edges = util.independent_best(
                        gstate.edge_strengths)
                    gstate = gstate.with_updates(edge_strengths=snapped_edges)
                    return gstate
                else:
                    return gstate

            # Scan over each sentence
            def _scan_fn(
                input_repr, *stuff
            ):  # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size)
                stuff = list(stuff)

                if len(self.word_node_mapping) > 0:
                    ref_matrix = stuff[0]
                    stuff = stuff[1:]
                else:
                    ref_matrix = None

                if with_correct_graph:
                    c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[:
                                                                                      4]
                    stuff = stuff[4:]

                if using_dropout:
                    dropout_masks = stuff[:len(iter_dropouts)]
                    stuff = stuff[len(iter_dropouts):]
                else:
                    dropout_masks = None

                flat_graph_state = stuff[:-1]
                pad_graph_size = stuff[-1]
                gstate = GraphState.unflatten_from_const_size(flat_graph_state)

                if with_correct_graph:
                    gstate, node_loss, edge_loss, overall_accuracy = _iter_fn(
                        input_repr,
                        ref_matrix,
                        gstate,
                        c_num_new_nodes,
                        c_new_strengths,
                        c_new_node_ids,
                        c_edges,
                        dropout_masks=dropout_masks)
                else:
                    gstate = _iter_fn(input_repr,
                                      ref_matrix,
                                      gstate,
                                      dropout_masks=dropout_masks)

                retvals = gstate.flatten_to_const_size(pad_graph_size)
                if with_correct_graph:
                    if self.dynamic_nodes:
                        retvals.append(node_loss)
                    retvals.append(edge_loss)
                    if evaluate_accuracy:
                        retvals.append(overall_accuracy)
                return retvals

            if self.dynamic_nodes:
                initial_gstate = GraphState.create_empty(
                    n_batch, self.num_node_ids, self.node_state_size,
                    self.num_edge_types)
            else:
                initial_gstate = GraphState.create_full_unique(
                    n_batch, self.num_node_ids, self.node_state_size,
                    self.num_edge_types)

            # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness
            if self.dynamic_nodes:
                pad_graph_size = n_sentences * self.new_nodes_per_iter + 1
            else:
                pad_graph_size = self.num_node_ids
            outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size)
            prepped_input = input_reprs.dimshuffle([1, 0, 2])
            sequences = [prepped_input]
            if len(self.word_node_mapping) > 0:
                sequences.append(ref_matrices.dimshuffle([1, 0, 2, 3]))
            if with_correct_graph:
                sequences.append(graph_num_new_nodes.swapaxes(0, 1))
                sequences.append(graph_new_node_strengths.swapaxes(0, 1))
                sequences.append(graph_new_node_ids.swapaxes(0, 1))
                sequences.append(graph_new_edges.swapaxes(0, 1))

                if self.dynamic_nodes:
                    outputs_info.extend([None])
                if evaluate_accuracy:
                    outputs_info.extend([None])
                outputs_info.extend([None])
            if using_dropout:
                sequences.extend(iter_dropouts)
            all_scan_out, _ = theano.scan(_scan_fn,
                                          sequences=sequences,
                                          outputs_info=outputs_info,
                                          non_sequences=[pad_graph_size])
            graph_accurate_list = None
            if with_correct_graph:
                if evaluate_accuracy:
                    full_graph_accuracy = all_scan_out[-1]
                    all_scan_out = all_scan_out[:-1]
                    graph_accurate_list = T.all(full_graph_accuracy, 0)
                    info["graph_accuracy"] = T.sum(graph_accurate_list,
                                                   dtype='floatX') / T.cast(
                                                       n_batch, 'floatX')
                if self.dynamic_nodes:
                    all_flat_gstates = all_scan_out[:-2]
                    node_loss, edge_loss = all_scan_out[-2:]
                    reduced_node_loss = T.sum(node_loss) / T.cast(
                        n_batch, 'floatX')
                    reduced_edge_loss = T.sum(edge_loss) / T.cast(
                        n_batch, 'floatX')
                    avg_graph_loss = (reduced_node_loss +
                                      reduced_edge_loss) / T.cast(
                                          input_words.shape[1], 'floatX')
                    info["node_loss"] = reduced_node_loss
                    info["edge_loss"] = reduced_edge_loss
                else:
                    all_flat_gstates = all_scan_out[:-1]
                    edge_loss = all_scan_out[-1]
                    reduced_edge_loss = T.sum(edge_loss) / T.cast(
                        n_batch, 'floatX')
                    avg_graph_loss = reduced_edge_loss / T.cast(
                        input_words.shape[1], 'floatX')
                    info["edge_loss"] = reduced_edge_loss
            else:
                all_flat_gstates = all_scan_out

            if self.sequence_representation:
                # Each part of all_flat_gstates is of shape (n_sentences, n_batch, ...)
                # except for the last one, which we handle separately
                # Swap to (n_batch, n_sentences, ...)
                # Then flatten to (n_batch*n_sentences, ...) for further processing
                final_flat_gstate = [
                    x.swapaxes(0, 1).reshape(T.concatenate([[-1],
                                                            x.shape[2:]]),
                                             ndim=(x.ndim - 1))
                    for x in all_flat_gstates[:-1]
                ]
                # As for the last one, we need to get a single scalar value. The last one will be the biggest
                # so we will take that. Note that this will introduce a bunch of zero-nodes, but that's
                # OK and we can process that later. (We REQUIRE that padding in graph_state makes zero strength
                # nodes here!)
                final_flat_gstate.append(all_flat_gstates[-1][-1])
                # We also need to repeat query_repr and query_ref_matrix so that they broadcast together
                query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0)
                query_ref_matrix = T.extra_ops.repeat(query_ref_matrix,
                                                      n_sentences, 0)
            else:
                # Extract last timestep
                final_flat_gstate = [x[-1] for x in all_flat_gstates]
            final_gstate = GraphState.unflatten_from_const_size(
                final_flat_gstate)

            if self.train_with_query:
                if self.wipe_node_state:
                    final_gstate = final_gstate.with_updates(
                        node_states=T.zeros_like(final_gstate.node_states))

                qnsu_dropout_masks = self.query_node_state_updater.dropout_masks(
                    self.srng, states_mask)
                query_gstate, _ = self.query_node_state_updater.process(
                    final_gstate, query_repr, qnsu_dropout_masks)

                if len(self.word_node_mapping) > 0:
                    qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks(
                        self.srng, states_mask)
                    query_gstate, _ = self.query_direct_reference_updater.process(
                        query_gstate, query_ref_matrix, qdru_dropout_masks)

                fp_dropout_masks = self.final_propagator.dropout_masks(
                    self.srng, states_mask)
                propagated_gstate, _ = self.final_propagator.process_multiple(
                    query_gstate, self.final_propagate, fp_dropout_masks)

                agg_dropout_masks = self.aggregator.dropout_masks(self.srng)
                aggregated_repr, _ = self.aggregator.process(
                    propagated_gstate,
                    agg_dropout_masks)  # shape (n_batch, output_repr_size)

                if self.sequence_representation:
                    # aggregated_repr is of shape (n_batch*n_sentences, repr_width)
                    # We want to split back to timesteps: (n_batch, n_sentences, repr_width)
                    agg_repr_seq = aggregated_repr.reshape(
                        [n_batch, n_sentences, -1])
                    # Now collapse it to a summary representation
                    aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks(
                        self.srng)
                    aggregated_repr, _ = self.aggregate_summarizer.process(
                        agg_repr_seq, aggsum_dropout_masks)
                    # At this point aggregated_repr is (n_batch, repr_width) as desired

                max_seq_len = correct_output.shape[1]
                if self.output_format == ModelOutputFormat.sequence:
                    final_output = self.output_processor.process(
                        aggregated_repr,
                        max_seq_len)  # shape (n_batch, ?, num_output_words)
                else:
                    final_output = self.output_processor.process(
                        aggregated_repr)

                if snap_to_best:
                    final_output = self.output_processor.snap_to_best(
                        final_output)

                if self.output_format == ModelOutputFormat.subset:
                    elemwise_loss = T.nnet.binary_crossentropy(
                        final_output, correct_output)
                    query_loss = T.sum(elemwise_loss)
                else:
                    flat_final_output = final_output.reshape(
                        [-1, self.num_output_words])
                    flat_correct_output = correct_output.reshape(
                        [-1, self.num_output_words])
                    timewise_loss = T.nnet.categorical_crossentropy(
                        flat_final_output, flat_correct_output)
                    query_loss = T.sum(timewise_loss)
                query_loss = query_loss / T.cast(n_batch, 'floatX')
                info["query_loss"] = query_loss
            else:
                final_output = T.zeros([])

            full_loss = np.array(0.0, np.float32)
            if with_correct_graph:
                full_loss = full_loss + avg_graph_loss
            if self.train_with_query:
                full_loss = full_loss + query_loss

            if self.train_with_query:
                adjusted_query_gstates = [
                    x.reshape(T.concatenate([[n_batch, n_sentences],
                                             x.shape[1:]]),
                              ndim=(x.ndim + 1))
                    if self.sequence_representation else T.shape_padaxis(x, 1)
                    for x in query_gstate.flatten()
                ]
                adjusted_prop_gstates = [
                    x.reshape(T.concatenate([[n_batch, n_sentences],
                                             x.shape[1:]]),
                              ndim=(x.ndim + 1))
                    if self.sequence_representation else T.shape_padaxis(x, 1)
                    for x in propagated_gstate.flatten()
                ]
                full_flat_gstates = [
                    T.concatenate([a.swapaxes(0, 1), b, c], 1) for a, b, c in
                    zip(all_flat_gstates[:-1], adjusted_query_gstates,
                        adjusted_prop_gstates)
                ]
            else:
                full_flat_gstates = [
                    a.swapaxes(0, 1) for a in all_flat_gstates[:-1]
                ]
                max_seq_len = T.iscalar()
            return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info

        train_loss, _, _, _, _, train_info = _build(self.train_with_graph,
                                                    False, True, False)
        adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var)

        self.info_keys = list(train_info.keys())

        print("Compiling...")

        optimizer = theano.compile.predefined_optimizers[
            'fast_run' if self.check_mode ==
            'debug' else theano.config.optimizer]
        optimizer = optimizer.excluding(
            "scanOp_pushout_output", "remove_constants_and_unused_inputs_scan")
        if self.check_mode == 'nan':
            mode = NanGuardMode(optimizer=optimizer,
                                nan_is_error=True,
                                inf_is_error=True,
                                big_is_error=True)
        elif self.check_mode == 'debug':
            mode = DebugMode(optimizer=optimizer,
                             check_isfinite=False,
                             check_py_code=False,
                             stability_patience=1)
            theano.tensor.TensorType.filter_checks_isfinite = False
        else:
            mode = theano.Mode(optimizer=optimizer)
        self.train_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ], [train_loss] + list(train_info.values()),
                                        updates=adam_updates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build(
            self.train_with_graph, False, False, True)
        self.eval_info_keys = list(eval_info.keys())
        self.eval_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ], [eval_loss, graph_accurate_list] + list(eval_info.values()),
                                       allow_input_downcast=True,
                                       on_unused_input='ignore',
                                       mode=mode)

        self.debug_test_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ],
                                             full_flat_gstates,
                                             allow_input_downcast=True,
                                             on_unused_input='ignore',
                                             mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(
            False, False, False, False)
        self.fuzzy_test_fn = theano.function(
            [input_words, query_words] +
            ([max_seq_len] if self.output_format == ModelOutputFormat.sequence
             else []), [final_output] + full_flat_gstates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(
            False, True, False, False)
        self.snap_test_fn = theano.function(
            [input_words, query_words] +
            ([max_seq_len] if self.output_format == ModelOutputFormat.sequence
             else []), [final_output] + full_flat_gstates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=mode)
Example #44
    def __init__(self, nh, nc, ne, de, cs):
        """
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        """

        #
        # parameters of the model
        #

        self.nh = nh
        self.nc = nc
        self.ne = ne
        self.de = de
        self.cs = cs

        # add one for PADDING at the end
        #self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de)).astype(theano.config.floatX))
        #self.emb = gensim.models.Word2Vec.load_word2vec_format('vectors.bin', binary=True)

        # parameters for the input layer
        self.Wx = theano.shared(
            0.2 *
            numpy.random.uniform(-1.0, 1.0,
                                 (de * cs, nh)).astype(theano.config.floatX))

        # parameters for stored histories in the hidden layer
        self.Wh = theano.shared(
            0.2 * numpy.random.uniform(-1.0, 1.0,
                                       (nh, nh)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

        # parameters for the output layer
        self.W = theano.shared(
            0.2 * numpy.random.uniform(-1.0, 1.0,
                                       (nh, nc)).astype(theano.config.floatX))
        self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

        # initial value of the stored histories in the hidden layer
        self.h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

        # bundle
        #self.params = [self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0]
        self.params = [self.Wx, self.Wh, self.W, self.bh, self.b, self.h0]
        self.names = ['Wx', 'Wh', 'W', 'bh', 'b', 'h0']
        idxs = T.ftensor3(
        )  # as many rows as words in the sentence, as many columns as the context window size
        #self.x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        self.x = idxs.reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')  # label

        # x_t:   the input at time t
        # s_t:   the output of the output layer (real output) at time t
        # h_tm1: the output of the hidden layer at time (t - 1)
        def recurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=self.x,
                                outputs_info=[self.h0, None],
                                n_steps=self.x.shape[0])

        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.mean(T.log(p_y_given_x_lastword)[y])
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)
        self.test = theano.function(inputs=[idxs],
                                    outputs=p_y_given_x_sentence)
        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)
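        # A minimal usage sketch (assumed names: the enclosing class is called
        # ElmanRNN here purely for illustration; the inputs are random stand-ins).
        # Each sentence is passed as a float32 tensor of shape (n_words, cs, de):
        #     rnn = ElmanRNN(nh=100, nc=5, ne=1000, de=50, cs=3)
        #     sentence = numpy.random.uniform(size=(7, 3, 50)).astype('float32')
        #     word_labels = rnn.classify(sentence)  # one predicted class per word
        #     nll = rnn.train(sentence, 4, 0.01)    # gold label of last word, learning rate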
Example #45
def create_model(n_in, n_out, n_enc, n_hid, n_cyc):

    x = T.ftensor3('x')     # x <batch_size, sequence_len, n_in>
    y = T.imatrix('y')      # y <batch_size, sequence_len>

    # Layers

    c0 = Conv2D(16, 1, 3, 3)
    c1 = Conv2D(32, 16, 3, 3)
    #c1 = Conv2D(32, 32, 3, 3)
    #c1 = Conv2D(32, 1, 3, 3)
    #g0 = GRU(n_in, n_enc)
    #g1 = GRU(32*12, n_enc)
    #g2 = GRU(32*12, n_enc)
    g1 = GRU(32*7, n_enc)
    g2 = GRU(32*7, n_enc)

    d2 = TimeDistributedDense(n_enc*2, n_enc)

    #att = AttentionARSGy(n_enc, n_hid, n_cyc)
    att = AttentionARSGy(n_enc, n_out, n_hid, n_cyc)
    #do = Dense(n_cyc, n_out)
    #do = TimeDistributedDense(n_cyc, n_out)

    params = [
        c0.params,
        c1.params,
        #g0.params,
        g1.params,
        g2.params,
        att.params,
        #do.params,
        #d_0.params,
        #d_1.params,
        d2.params,
    ]

    # Logic
    x0 = x.reshape((x.shape[0], 1, x.shape[1], x.shape[2]))

    xc = relu(c0.apply(x0))
    xc = max_pool_2d(xc, (2,2)) 

    xc = relu(c1.apply(xc))
    #xc = max_pool_2d(xc, (2,2)) 

    #xc = relu(c1.apply(xc))

    x1 = xc.dimshuffle(0,2,1,3)
    x1 = x1.reshape((x1.shape[0], x1.shape[1], -1))

    #x0 = g0.apply(x0)
    #x1 = x0[:, ::skip_rate[0]]
    #x1 = d_0.apply(x1)
    x2_f = g1.apply(x1) #, truncate_gradient=30)
    x2_b = g2.apply(x1[:,::-1]) #, truncate_gradient=30)
    x2 = T.concatenate([x2_f, x2_b[:,::-1]], axis=2)
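    # Bidirectional encoding: the backward GRU runs on the time-reversed input and
    # its states are reversed back before being concatenated with the forward states.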

    #x2 = x2[:, ::skip_rate[0]]
    #x2 = d_1.apply(x2)
    x3 = d2.apply(x2)

    xe = x3

    Y = []
    A = []

    # extract glimplse

    H, alphas, out = att.apply(xe, y.shape[1]) 
    # H: batch_size, y_len, n_hid
    # alphas: batch_size, x_len
    o_shp = out.shape
    o = T.reshape(out, (-1, o_shp[2]))

    loss = T.nnet.categorical_crossentropy(o, y.flatten()).mean()

    params = [p for pp in params for p in pp]


    return [x, y], out, loss, params, alphas
Example #46
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined theano
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimization was not applied which
        # made the scan node compute large and unnecessary outputs which
        # brought memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000

        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = tensor.ftensor3(name="xin")
        yout = tensor.ftensor3(name="yout")

        # Initialize the network parameters
        U = theano.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
        V = theano.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
        W = theano.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = tensor.dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + tensor.dot(last_step, V)

        zero_output = tensor.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = tensor.dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = tensor.sum((l2_out - yout) ** 2)
        grads = tensor.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the theano function
        feval_backprop = theano.function(
            [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
        )

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the theano function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )
Example #47
    def __init__(self, config):

        self.config = config

        batch_size = config['batch_size']
        num_seq = config['num_seq']
        self.n_timesteps = config['num_timesteps']

        num_joints = config['num_joints']
        classes_num = config['classes_num']
        # ##################### BUILD NETWORK ##########################
        mask = T.fvector('mask')
        y = T.lvector('y')
        target = T.ftensor3('target')
        rand = T.fvector('rand')
        trng = RandomStreams(1234)
        use_noise = T.fscalar('use_noise')

        print '... building the model'
        self.layers = []
        params = []
        weight_types = []

        conv_fea = T.ftensor4('conv_fea')  #(49, 16, 8, 1024)

        lstm_att_layer15 = JointAttentionLstmLayer(config,
                                                   num_joints,
                                                   conv_fea=conv_fea,
                                                   mask=mask,
                                                   batch_size=batch_size,
                                                   num_seq=num_seq,
                                                   trng=trng,
                                                   use_noise=use_noise,
                                                   n_in=1024 * 5,
                                                   n_out=1024,
                                                   dim_part=32)

        self.layers.append(lstm_att_layer15)
        params += lstm_att_layer15.params
        weight_types += lstm_att_layer15.weight_type
        self.conv_fea = conv_fea

        softmax_input = lstm_att_layer15.output

        softmax_layer15 = SoftmaxLayer(input=softmax_input,
                                       n_in=1024,
                                       n_out=21)
        self.layers.append(softmax_layer15)
        params += softmax_layer15.params
        weight_types += softmax_layer15.weight_type

        # #################### NETWORK BUILT #######################
        self.cost_nll = softmax_layer15.negative_log_likelihood(y, mask)
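        # Attention supervision: half the squared difference between the predicted
        # attention maps and the supplied target, summed over the two non-batch axes
        # and averaged over the batch; it is added to the classification NLL below.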
        self.cost_jhmdb_attention = T.mean(T.sum(T.sum(
            0.5 * (lstm_att_layer15.attention - target)**2, axis=1),
                                                 axis=1),
                                           axis=0,
                                           dtype=theano.config.floatX)
        self.cost = self.cost_nll + self.cost_jhmdb_attention
        self.errors_video = softmax_layer15.errors_video(
            y, mask, batch_size, num_seq)
        self.params = params
        self.prob = softmax_layer15.p_y_given_x

        self.mask = mask
        self.y = y
        self.target = target
        self.rand = rand
        self.weight_types = weight_types
        self.batch_size = batch_size
        self.num_seq = num_seq
        self.use_noise = use_noise
Example #48
    def build_theano_functions(self):
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor3('y')
        z = T.ftensor3('z')

        layers_input = [x]
        dims = np.array([self.input_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                #weights_init=Uniform(mean=data_mean, std=1),
                weights_init=IsotropicGaussian(mean=1., std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(Orthogonal().generate(
                np.random,
                lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=IsotropicGaussian(mean=0., std=1),
            biases_init=Constant(0),
            #use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05
        mus = y_hat[:, :, self.output_dim / 2:]

        # sum likelihood with targets
        # sum inside log across mixtures, sum outside log across time
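        # Written out (same computation as below, one diagonal Gaussian per output bin):
        #   LL = -sum_t log( sum_k 1/(sqrt(2*pi)*sig_tk) * exp(-(y_tk - mu_tk)^2 / (2*sig_tk^2)) )
        # evaluated with the max-subtraction (log-sum-exp) trick for numerical stability.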
        inside_expo = -0.5 * ((y - mus)**2) / sig**2
        expo = T.exp(inside_expo)
        coeff = 1. / (T.sqrt(2. * np.pi) * sig)
        inside_log = T.log(coeff * expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(
            T.sum(T.exp(inside_log - inside_log_max), axis=2,
                  keepdims=True))).sum()

        #zinside_expo = -0.5*((z-mus)**2)/sig**2
        #zexpo = T.exp(zinside_expo)
        #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
        #zinside_log = (zcoeff*zexpo).sum(axis=2)
        #zLL = -(T.log(zinside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            gradf = theano.function([x, y, lr], [LL, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [sig, mus])

        return gradf, f
Example #49
def RelationStackMaker(chips,
                       params,
                       graph=False,
                       weighted=False,
                       batched=False):
    if batched:
        emb_input = T.itensor3('emb_input')
        entities_tv = [
            T.fmatrix('enidx_' + str(i)).astype(theano.config.floatX)
            for i in range(params['num_entity'])
        ]
        sample_weights = T.fvector('sample_weight')
        if graph:
            if weighted:
                masks = T.ftensor4('child_mask')
            else:
                masks = T.ftensor3('child_mask')
        else:
            masks = T.fmatrix('batch_mask')

    else:
        emb_input = T.imatrix('emb_input')
        entities_tv = [
            T.fvector('enidx_' + str(i)).astype(theano.config.floatX)
            for i in range(params['num_entity'])
        ]
        sample_weights = T.fvector('sample_weight')
        if graph:
            if weighted:
                masks = T.ftensor3('child_mask')
            else:
                masks = T.fmatrix('child_mask')
        else:
            masks = None
    #print masks, type(masks), masks.ndim
    current_chip = Start(params['voc_size'], emb_input)
    print '\n', 'Building Stack now', '\n', 'Start: ', params[
        'voc_size'], 'out_tv dim:', current_chip.output_tv.ndim
    instantiated_chips = stackLayers(chips,
                                     current_chip,
                                     params,
                                     entity_size=params['num_entity'])
    regularizable_params = computeLayers(instantiated_chips,
                                         current_chip,
                                         params,
                                         entities_input=entities_tv,
                                         mask=masks,
                                         sample_weights=sample_weights)
    ### Debug use: Get the attention co-efficiency and visualize. ###
    for c in instantiated_chips:
        if c[1].endswith('Entity_Att'):
            assert hasattr(c[0], 'att_wt_arry')
            assert hasattr(c[0], 'entity_tvs')
            attention_weights = c[0].att_wt_arry
            entity_tvs = c[0].entity_tvs

    current_chip = instantiated_chips[-1][0]
    if current_chip.output_tv.ndim == 2:
        pred_y = current_chip.output_tv  #T.argmax(current_chip.output_tv, axis=1)
    else:
        pred_y = current_chip.output_tv  #T.argmax(current_chip.output_tv) #, axis=1)
    gold_y = (current_chip.gold_y if hasattr(current_chip, 'gold_y') else None)
    # Show all parameters that would be needed in this system
    params_needed = calculate_params_needed(instantiated_chips)
    print "Parameters Needed", params_needed
    for k in params_needed:
        assert k in params, k
        print k, params[k]
    assert hasattr(current_chip, 'score')
    cost = current_chip.score  #/ params['nsentences']
    cost_arr = [cost]
    for layer in instantiated_chips[:-1]:
        if hasattr(layer[0], 'score'):
            print layer[1]
            cost += params['cost_coef'] * layer[0].score
            cost_arr.append(params['cost_coef'] * layer[0].score)

    grads = T.grad(cost, wrt=regularizable_params)
    #[params[k] for k in params if (hasattr(params[k], 'is_regularizable') and params[k].is_regularizable)])
    print 'Regularizable parameters:'
    for k, v in params.items():
        if hasattr(v, 'is_regularizable'):
            print k, v, v.is_regularizable
    if graph or batched:
        #return (emb_input, masks, entities_tv, attention_weights, entity_tvs, gold_y, pred_y, cost, grads, regularizable_params)
        return (emb_input, masks, entities_tv, sample_weights, gold_y, pred_y,
                cost, grads, regularizable_params)
    else:
        return (emb_input, entities_tv, sample_weights, gold_y, pred_y, cost,
                grads, regularizable_params, sample_weights)
Example #50
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info('Experiment starts')
    logging.info('Saving experiment data to %s', conf['out_path'])

    logging.info("Setting random state...")
    start_time = time.clock()
    rng = conf['rng']
    srng = theano.tensor.shared_randomstreams.RandomStreams(
        rng.randint(999999))
    np.random.seed(conf['seed'])
    logging.info("...done %f" % (time.clock() - start_time))

    logging.info("Creating the model...")
    start_time = time.clock()

    x = T.ftensor3('x')
    label = T.fvector('label')
    #ldm_index = T.ivector('ldm_index')
    disk = Tsp.csr_matrix('disk')
    layer_begin = T.imatrix('layer_begin')
    layer_end = T.imatrix('layer_end')

    model = LSCNN(rng, conf['layers'], conf['drop'])

    model.inputs = [x, label, disk, layer_begin, layer_end]
    model.fwd_inputs = [x, label, disk, layer_begin, layer_end]
    model.w_constraints = eval(conf['w_constraints'])
    logging.info("...done %f" % (time.clock() - start_time))

    logging.info("Checking if there is already a best model to load...")
    start_time = time.clock()
Example #51
import theano
from theano import tensor
import numpy as np

import mkl_gru_op_v

x = tensor.ftensor3('x')
x_m = tensor.ftensor3('x_m')
h_init = tensor.fmatrix('h_init')

W_h = tensor.fmatrix('W_h')
W_hzr = tensor.fmatrix('W_hzr')
W_hh = tensor.fmatrix('W_hh')
W_x = tensor.fmatrix('W_x')
b = tensor.ftensor3('b')

o = mkl_gru_op_v.GRU(units=1000, timesteps=10, batch_size=80,
                     input_dim=620)(x, x_m, h_init, W_h, W_x, b)
f = theano.function([x, x_m, h_init, W_h, W_x, b], o)

units = 1000
timesteps = 10
batch_size = 80
input_dim = 620
x = np.random.rand(timesteps, input_dim, batch_size).astype(np.float32)
x_m = np.random.rand(timesteps, units,
                     batch_size).astype(np.float32) - np.random.rand(
                         timesteps, units, batch_size).astype(np.float32)
h_init = np.random.rand(units, batch_size).astype(np.float32) - np.random.rand(
    units, batch_size).astype(np.float32)
W_x = np.random.rand(units * 3, input_dim).astype(np.float32) - np.random.rand(
Example #52
        sys.exit(1)

    if not os.path.exists(args.model):
        print('File not found: {}'.format(args.model))
        sys.exit(1)

    if not os.path.exists(args.hmrnn_model):
        print('File not found: {}'.format(args.hmrnn_model))
        sys.exit(1)

    print('Loading an hmrnn model')

    hmrnn = HMRNNModel(args)
    hmrnn.load(args.hmrnn_model)

    input_data = T.ftensor3('input_data')
    input_mask = T.fmatrix('input_mask')

    ivector_data = None
    if args.use_ivector_input:
        ivector_data = T.ftensor3('ivector_data')

    network = build_deep_lstm(input_var=input_data,
                              mask_var=input_mask,
                              input_dim=args.input_dim,
                              num_layers=args.num_layers,
                              num_units=args.num_units,
                              num_proj_units=args.num_proj_units,
                              output_dim=args.output_dim,
                              grad_clipping=args.grad_clipping,
                              is_bidir=not args.uni,
Example #53
def build_evpi_model(word_embeddings,
                     len_voc,
                     word_emb_dim,
                     N,
                     args,
                     freeze=False):

    # input theano vars
    posts = T.imatrix()
    post_masks = T.fmatrix()
    ques_list = T.itensor3()
    ques_masks_list = T.ftensor3()
    ans_list = T.itensor3()
    ans_masks_list = T.ftensor3()
    labels = T.imatrix()

    utility_posts = T.imatrix()
    utility_post_masks = T.fmatrix()
    utility_labels = T.ivector()

    post_out, post_lstm_params = build_lstm_posts(posts, post_masks, args.post_max_len, \
                 word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ques_out, ques_lstm_params = build_lstm(ques_list, ques_masks_list, N, args.ques_max_len, \
              word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ans_out, ans_lstm_params = build_lstm(ans_list, ans_masks_list, N, args.ans_max_len, \
              word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)

    # pqa_preds = [None]*N
    # post_ques_ans = T.concatenate([post_out, ques_out[0], ans_out[0]], axis=1)
    # l_post_ques_ans_in = lasagne.layers.InputLayer(shape=(args.batch_size, 3*args.hidden_dim), input_var=post_ques_ans)
    # l_post_ques_ans_dense = lasagne.layers.DenseLayer(l_post_ques_ans_in, num_units=args.hidden_dim,\
    # 												  nonlinearity=lasagne.nonlinearities.rectify)
    # l_post_ques_ans_dense2 = lasagne.layers.DenseLayer(l_post_ques_ans_dense, num_units=1,\
    # 												   nonlinearity=lasagne.nonlinearities.sigmoid)
    # pqa_preds[0] = lasagne.layers.get_output(l_post_ques_ans_dense2)
    # loss = T.sum(lasagne.objectives.binary_crossentropy(pqa_preds[0], labels[:,0]))
    # for i in range(1, N):
    # 		post_ques_ans = T.concatenate([post_out, ques_out[i], ans_out[i]], axis=1)
    # 		l_post_ques_ans_in_ = lasagne.layers.InputLayer(shape=(args.batch_size, 3*args.hidden_dim), input_var=post_ques_ans)
    # 		l_post_ques_ans_dense_ = lasagne.layers.DenseLayer(l_post_ques_ans_in_, num_units=args.hidden_dim,\
    # 														nonlinearity=lasagne.nonlinearities.rectify,\
    # 														W=l_post_ques_ans_dense.W,\
    # 														b=l_post_ques_ans_dense.b)
    # 		l_post_ques_ans_dense2_ = lasagne.layers.DenseLayer(l_post_ques_ans_dense_, num_units=1,\
    # 														nonlinearity=lasagne.nonlinearities.sigmoid,\
    # 														W=l_post_ques_ans_dense2.W,\
    # 														b=l_post_ques_ans_dense2.b)
    # 		pqa_preds[i] = lasagne.layers.get_output(l_post_ques_ans_dense2_)
    # 		loss += T.sum(lasagne.objectives.binary_crossentropy(pqa_preds[i], labels[:,i]))
    #
    # post_ques_ans_dense2_params = lasagne.layers.get_all_params(l_post_ques_ans_dense2, trainable=True)

    pq_out = [None] * N
    post_ques = T.concatenate([post_out, ques_out[0]], axis=1)
    l_post_ques_in = lasagne.layers.InputLayer(shape=(args.batch_size,
                                                      2 * args.hidden_dim),
                                               input_var=post_ques)
    l_post_ques_dense = lasagne.layers.DenseLayer(l_post_ques_in, num_units=args.hidden_dim,\
                  nonlinearity=lasagne.nonlinearities.rectify)
    l_post_ques_dense2 = lasagne.layers.DenseLayer(l_post_ques_dense, num_units=1,\
                   nonlinearity=lasagne.nonlinearities.sigmoid)
    pq_out[0] = lasagne.layers.get_output(l_post_ques_dense2)
    for i in range(1, N):
        post_ques = T.concatenate([post_out, ques_out[i]], axis=1)
        l_post_ques_in_ = lasagne.layers.InputLayer(shape=(args.batch_size, 2 *
                                                           args.hidden_dim),
                                                    input_var=post_ques)
        l_post_ques_dense_ = lasagne.layers.DenseLayer(l_post_ques_in_, num_units=args.hidden_dim,\
                    nonlinearity=lasagne.nonlinearities.rectify,\
                    W=l_post_ques_dense.W,\
                    b=l_post_ques_dense.b)
        l_post_ques_dense2_ = lasagne.layers.DenseLayer(l_post_ques_dense_, num_units=1,\
                    nonlinearity=lasagne.nonlinearities.sigmoid,\
                    W=l_post_ques_dense2.W,\
                    b=l_post_ques_dense2.b)
        pq_out[i] = lasagne.layers.get_output(l_post_ques_dense2_)

    post_ques_dense2_params = lasagne.layers.get_all_params(l_post_ques_dense2,
                                                            trainable=True)

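    # For every (question i, answer j) pair, all_sq_errors records the per-example
    # squared error between the post+question representation pq_out[i] and the
    # answer encoding ans_out[j]; each pair's contribution to the loss is weighted
    # by the label of question i (labels[:, i]).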
    all_sq_errors = [None] * (N * N)
    loss = 0.0
    for i in range(N):
        for j in range(N):
            all_sq_errors[i * N + j] = T.sum(lasagne.objectives.squared_error(
                pq_out[i], ans_out[j]),
                                             axis=1)
            loss += T.sum(
                lasagne.objectives.squared_error(pq_out[i], ans_out[j]) *
                labels[:, i, None])

    utility_preds, utility_post_ans_preds, utility_params = build_utility_lstm(utility_posts, utility_post_masks, \
                       posts, post_masks, ans_list, ans_masks_list, \
                       N, args.post_max_len, args.ans_max_len, \
                       word_embeddings, word_emb_dim, args.hidden_dim, len_voc)

    utility_loss = T.sum(
        lasagne.objectives.binary_crossentropy(utility_preds, utility_labels))

    # for i in range(N):
    # 	loss += T.sum(lasagne.objectives.binary_crossentropy(utility_post_ans_preds[i], labels[:,i]))

    #all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_ans_dense2_params + utility_params
    all_params = post_lstm_params + ques_lstm_params + post_ques_dense2_params

    loss += args.rho * sum(T.sum(l**2) for l in all_params)
    utility_loss += args.rho * sum(T.sum(l**2) for l in utility_params)

    updates = lasagne.updates.adam(loss,
                                   all_params,
                                   learning_rate=args.learning_rate)
    utility_updates = lasagne.updates.adam(utility_loss,
                                           utility_params,
                                           learning_rate=args.learning_rate)

    train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
            [loss] + utility_post_ans_preds + all_sq_errors, updates=updates)
    dev_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
            [loss] + utility_post_ans_preds + all_sq_errors,)
    # train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
    # 								[loss] + pqa_preds + utility_post_ans_preds, updates=updates)
    # dev_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels], \
    # 								[loss] + pqa_preds + utility_post_ans_preds,)
    utility_train_fn = theano.function([utility_posts, utility_post_masks, utility_labels], \
            [utility_preds, utility_loss], updates=utility_updates)
    utility_dev_fn = theano.function([utility_posts, utility_post_masks, utility_labels], \
            [utility_preds, utility_loss],)

    return train_fn, dev_fn, utility_train_fn, utility_dev_fn
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test(
        sent_len, claim_len, cand_size, word2id)
    dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))
    dev_sents = np.asarray(dev_sents, dtype='int32')
    dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32')
    joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))
    dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX)
    dev_3th_sent_masks = np.asarray(dev_3th_sent_masks,
                                    dtype=theano.config.floatX)
    joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))
    dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32')
    dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32')
    joint_dev_sent_labels = np.concatenate(
        (dev_sent_labels, dev_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))
    dev_claims = np.asarray(dev_claims, dtype='int32')
    dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32')
    joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))
    dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX)
    dev_3th_claim_mask = np.asarray(dev_3th_claim_mask,
                                    dtype=theano.config.floatX)
    joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))
    dev_labels = np.asarray(dev_labels, dtype='int32')
    dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32')
    joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    joint_dev_size = len(joint_dev_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    dev_size = len(dev_claims)
    test_3th_size = len(test_3th_claims)
    dev_3th_size = len(dev_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size
    print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array "rand_values" into a Theano shared variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out the features produced by UNK/padding
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out the features produced by UNK/padding
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    # scoring layer: maps each feature vector of size LR_input_size to a single evidence score
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # weight vector of shape (LR_input_size, 1)
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  # (batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
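    # Differentiable ("soft") F1: sents_labels are the gold 0/1 evidence labels and
    # inter_matrix holds predicted probabilities, so batch_overlap acts as a soft
    # true-positive count. For example, gold [1, 0, 1] with predictions [0.9, 0.2, 0.6]
    # gives overlap 1.5, recall 0.75, precision ~0.88 and F1 ~0.81.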
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out the features produced by UNK/padding
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out the features produced by UNK/padding
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)
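    # The three steps above implement an (unscaled) dot-product self-attention over all
    # cand_size*sent_len token embeddings of a claim's candidate sentences: pairwise dot
    # products -> softmax weights -> weighted sum, added back to the original embeddings
    # as extra context before the attentive convolution.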

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  # for the classification task we use negative log likelihood as the loss; the lower the better
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)
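    # At test time the premise embedding is a weighted sum of candidate-sentence
    # embeddings, with the predicted evidence probabilities (masked by sents_labels) as
    # weights, mirroring how joint_premise_emb uses the gold labels during training.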

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)
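    # Same unscaled dot-product self-attention context as in task 2 above, recomputed on
    # the task-1 sentence embeddings for the test-time attentive convolution.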

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')
    dev_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                allow_input_downcast=True,
                                on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_dev_3th_batches = dev_3th_size / batch_size
    dev_3th_batch_start = list(np.arange(n_dev_3th_batches) *
                               batch_size) + [dev_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_test_acc = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  # shuffle the training set each epoch; this usually helps performance but is not guaranteed to
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            # every 100 batches, evaluate the model on the full test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a ranking instead of a binary decision;
                                the threshold triple[0] > 0.5 controls the recall and therefore the strict_acc
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                if f1 > max_test_f1 or strict_score > max_test_acc:
                    if f1 > max_test_f1:
                        max_test_f1 = f1
                    if strict_score > max_test_acc:
                        max_test_acc = strict_score
                    #test
                    print '....................\n'
                    f1_sum = 0.0
                    error_sum = 0.0
                    full_evi = 0
                    predictions = []
                    fine_grained_sent_predictions = {
                        1: [],
                        2: [],
                        3: [],
                        4: [],
                        5: []
                    }
                    fine_grained_page_predictions = {
                        1: [],
                        2: [],
                        3: [],
                        4: []
                    }
                    for dev_batch_id in dev_batch_start:  # for each test batch
                        batch_prob, error_i, pred_i = dev_model(
                            dev_sents[dev_batch_id:dev_batch_id + batch_size],
                            dev_sent_masks[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_sent_labels[dev_batch_id:dev_batch_id +
                                            batch_size],
                            dev_claims[dev_batch_id:dev_batch_id + batch_size],
                            dev_claim_mask[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_labels[dev_batch_id:dev_batch_id + batch_size])
                        error_sum += error_i
                        batch_sent_labels = dev_sent_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_sent_names = dev_sent_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_names = dev_ground_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_labels = dev_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(
                                batch_ground_labels[i])
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            pred_sent_names = []
                            gold_sent_names = batch_ground_names[i]
                            zipped = [(batch_prob[i,
                                                  k], batch_sent_labels[i][k],
                                       batch_sent_names[i][k])
                                      for k in range(cand_size)]
                            sorted_zip = sorted(zipped,
                                                key=lambda x: x[0],
                                                reverse=True)
                            for j in range(cand_size):
                                triple = sorted_zip[j]
                                if triple[1] == 1.0:
                                    '''
                                    we should consider a ranking instead of a binary decision;
                                    the threshold triple[0] > 0.5 controls the recall and therefore the strict_acc
                                    '''
                                    if triple[0] > 0.5:
                                        # pred_sent_names.append(batch_sent_names[i][j])
                                        pred_sent_names.append(triple[2])
                                    # if len(pred_sent_names) == max_pred_pick:
                                    #     break
                            instance_i['predicted_evidence'] = pred_sent_names
                            # print 'pred_sent_names:',pred_sent_names
                            # print 'gold_sent_names:',gold_sent_names
                            new_gold_names = []
                            for gold_name in gold_sent_names:
                                new_gold_names.append([None, None] + gold_name)
                            instance_i['evidence'] = [new_gold_names]
                            predictions.append(instance_i)

                            evi_sent_size, evi_page_size = count_sent_page(
                                gold_sent_names)
                            fine_grained_sent_predictions.get(
                                evi_sent_size).append(instance_i)
                            fine_grained_page_predictions.get(
                                evi_page_size).append(instance_i)
                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                    print '......sent...\n'
                    for i in range(1, 6):
                        predictions_i = fine_grained_sent_predictions.get(i)
                        if len(predictions_i) > 0:
                            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                                predictions_i)
                            print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1
                        else:
                            print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0
                    print '......page...\n'
                    for i in range(1, 5):
                        predictions_i = fine_grained_page_predictions.get(i)
                        if len(predictions_i) > 0:
                            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                                predictions_i)
                            print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1
                        else:
                            print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0

                    for dev_batch_id in dev_3th_batch_start:  # for each test batch
                        _, error_i, pred_i = dev_model(
                            dev_3th_sents[dev_batch_id:dev_batch_id +
                                          batch_size],
                            dev_3th_sent_masks[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_sent_labels[dev_batch_id:dev_batch_id +
                                                batch_size],
                            dev_3th_claims[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_3th_claim_mask[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_labels[dev_batch_id:dev_batch_id +
                                           batch_size])
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(2)
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            instance_i['predicted_evidence'] = []
                            instance_i['evidence'] = []
                            predictions.append(instance_i)

                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_test_acc
lstm_b[42:63] = (5. * np.ones((21, ))).astype(theano.config.floatX)
lstm_b[63:] = bh

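# empirical frequency of each of the 21 classes in y1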
rat = np.zeros((21, ))
for i in range(21):
    rat[i] = 1. * sum(y1.flatten() == i) / y1.size

print rat

# endregion

# region Build Model
print 'Building Model...'
np.random.seed(seed=123)

tX = T.ftensor3('tX')
tH = T.fmatrix('tH')
tC = T.fmatrix('tC')
tm = T.fmatrix('tm')
ty = T.imatrix('ty')

classifier = LSTM_A(seqs=tX,
                    h0s=tH,
                    c0s=tC,
                    masks=tm,
                    dim_x=909,
                    dim_h=21,
                    dim_y=21,
                    wt_y=wt_y,
                    lstm_W=lstm_W,
                    lstm_U=lstm_U,
Example #56
0
def main(options):
    print 'Build and compile network'
    input_data = T.ftensor3('input_data')
    input_mask = T.fmatrix('input_mask')
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')
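    # input_data is a 3D float tensor (presumably batch x time x features); the masks are
    # 2D float matrices and target_data holds the integer frame labels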

    network = build_network(input_data=input_data,
                            input_mask=input_mask,
                            num_inputs=options['num_inputs'],
                            num_units_list=options['num_units_list'],
                            num_outputs=options['num_outputs'],
                            dropout_ratio=options['dropout_ratio'],
                            weight_noise=options['weight_noise'],
                            use_layer_norm=options['use_layer_norm'],
                            peepholes=options['peepholes'],
                            learn_init=options['learn_init'],
                            grad_clipping=options['grad_clipping'],
                            gradient_steps=options['gradient_steps'],
                            use_projection=options['use_projection'])

    network_params = get_all_params(network, trainable=True)

    print("number of parameters in model: %d" % count_params(network, trainable=True))

    if options['reload_model']:
        print('Loading Parameters...')
        pretrain_network_params_val,  pretrain_update_params_val, pretrain_total_batch_cnt = pickle.load(open(options['reload_model'], 'rb'))

        print('Applying Parameters...')
        set_model_param_value(network_params, pretrain_network_params_val)
    else:
        pretrain_update_params_val = None
        pretrain_total_batch_cnt = 0

    print 'Build network trainer'
    training_fn, trainer_params = set_network_trainer(input_data=input_data,
                                                      input_mask=input_mask,
                                                      target_data=target_data,
                                                      target_mask=target_mask,
                                                      num_outputs=options['num_outputs'],
                                                      network=network,
                                                      updater=options['updater'],
                                                      learning_rate=options['lr'],
                                                      grad_max_norm=options['grad_norm'],
                                                      l2_lambda=options['l2_lambda'],
                                                      load_updater_params=pretrain_update_params_val)

    print 'Build network predictor'
    predict_fn = set_network_predictor(input_data=input_data,
                                       input_mask=input_mask,
                                       target_data=target_data,
                                       target_mask=target_mask,
                                       num_outputs=options['num_outputs'],
                                       network=network)


    print 'Load data stream'
    train_datastream = get_datastream(path=options['data_path'],
                                      which_set='train_si84',
                                      batch_size=options['batch_size'])

    print 'Start training'
    if os.path.exists(options['save_path'] + '_eval_history.npz'):
        evaluation_history = numpy.load(options['save_path'] + '_eval_history.npz')['eval_history'].tolist()
    else:
        evaluation_history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]]
    early_stop_flag = False
    early_stop_cnt = 0
    total_batch_cnt = 0

    try:
        # for each epoch
        for e_idx in range(options['num_epochs']):
            # for each batch
            for b_idx, data in enumerate(train_datastream.get_epoch_iterator()):
                total_batch_cnt += 1
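                # when resuming from a reloaded model, skip batches that were already consumed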
                if pretrain_total_batch_cnt>=total_batch_cnt:
                    continue

                # get input, target data
                input_data = data[0].astype(floatX)
                input_mask = data[1].astype(floatX)

                # get target data
                target_data = data[2]
                target_mask = data[3].astype(floatX)

                # get output
                train_output = training_fn(input_data,
                                           input_mask,
                                           target_data,
                                           target_mask)
                train_predict_cost = train_output[0]
                network_grads_norm = train_output[1]

                # show intermediate result
                if total_batch_cnt%options['train_disp_freq'] == 0 and total_batch_cnt!=0:
                    best_idx = numpy.asarray(evaluation_history)[:, 1, 2].argmin()
                    print '============================================================================================'
                    print 'Model Name: ', options['save_path'].split('/')[-1]
                    print '============================================================================================'
                    print 'Epoch: ', str(e_idx), ', Update: ', str(total_batch_cnt)
                    print '--------------------------------------------------------------------------------------------'
                    print 'Prediction Cost: ', str(train_predict_cost)
                    print 'Gradient Norm: ', str(network_grads_norm)
                    print '--------------------------------------------------------------------------------------------'
                    print 'Train NLL: ', str(evaluation_history[-1][0][0]), ', BPC: ', str(evaluation_history[-1][0][1]), ', FER: ', str(evaluation_history[-1][0][2])
                    print 'Valid NLL: ', str(evaluation_history[-1][1][0]), ', BPC: ', str(evaluation_history[-1][1][1]), ', FER: ', str(evaluation_history[-1][1][2])
                    print '--------------------------------------------------------------------------------------------'
                    print 'Best NLL: ', str(evaluation_history[best_idx][1][0]), ', BPC: ', str(evaluation_history[best_idx][1][1]), ', FER: ', str(evaluation_history[best_idx][1][2])

                # evaluation
                if total_batch_cnt%options['train_eval_freq'] == 0 and total_batch_cnt!=0:
                    train_eval_datastream = get_datastream(path=options['data_path'],
                                                           which_set='train_si84',
                                                           batch_size=options['eval_batch_size'])
                    valid_eval_datastream = get_datastream(path=options['data_path'],
                                                           which_set='test_dev93',
                                                           batch_size=options['eval_batch_size'])
                    train_nll, train_bpc, train_fer = network_evaluation(predict_fn,
                                                                         train_eval_datastream)
                    valid_nll, valid_bpc, valid_fer = network_evaluation(predict_fn,
                                                                         valid_eval_datastream)

                    # check over-fitting
                    if valid_fer>numpy.asarray(evaluation_history)[:, 1, 2].min():
                        early_stop_cnt += 1.
                    else:
                        early_stop_cnt = 0.
                        best_network_params_vals = get_model_param_values(network_params)
                        pickle.dump(best_network_params_vals,
                                    open(options['save_path'] + '_best_model.pkl', 'wb'))

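                    # stop once validation FER has failed to improve for more than 10 evaluations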
                    if early_stop_cnt>10:
                        early_stop_flag = True
                        break

                    # save results
                    evaluation_history.append([[train_nll, train_bpc, train_fer],
                                               [valid_nll, valid_bpc, valid_fer]])
                    numpy.savez(options['save_path'] + '_eval_history',
                                eval_history=evaluation_history)

                # save network
                if total_batch_cnt%options['train_save_freq'] == 0 and total_batch_cnt!=0:
                    cur_network_params_val = get_model_param_values(network_params)
                    cur_trainer_params_val = get_update_params_values(trainer_params)
                    cur_total_batch_cnt = total_batch_cnt
                    pickle.dump([cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt],
                                open(options['save_path'] + '_last_model.pkl', 'wb'))

            if early_stop_flag:
                break

    except KeyboardInterrupt:
        print 'Training Interrupted'
        cur_network_params_val = get_model_param_values(network_params)
        cur_trainer_params_val = get_update_params_values(trainer_params)
        cur_total_batch_cnt = total_batch_cnt
        pickle.dump([cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt],
                    open(options['save_path'] + '_last_model.pkl', 'wb'))
Example #57
0
	def __init__(self, We, params):

		lstm_layers_num = 1
		emb_size = We.shape[1]
		self.eta = params.eta
		self.num_labels = params.num_labels
		self.en_hidden_size = params.en_hidden_size
		self.de_hidden_size = params.de_hidden_size

		self.lstm_layers_num = params.lstm_layers_num
		self._train = None
		self._utter = None
		self.params = []
		self.encoder_lstm_layers = []
		self.decoder_lstm_layers = []
		self.hos = []
		self.Cos = []

		encoderInputs = tensor.imatrix()
		decoderInputs, decoderTarget = tensor.imatrices(2)
		encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4)


		self.lookuptable = theano.shared(We)

		#### the last entry is for the start symbol
		self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform(self.num_labels +1, self.de_hidden_size), borrow=True)
			
		self.linear = theano.shared(name="Linear", value = init_xavier_uniform(self.de_hidden_size+2*self.en_hidden_size, self.num_labels), borrow= True)
		self.linear_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, )*0., dtype=theano.config.floatX), borrow=True)                     
  
		#self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True)

		#self.hidden_bias = theano.shared(
                #        name="Hidden to Bias",
                #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
                #        borrow=True
                #        )		

	
		#self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
		self.params += [self.linear, self.linear_bias , self.de_lookuptable]    #the initial hidden state of decoder lstm is zeros
		#(max_sent_size, batch_size, hidden_size)
		state_below = self.lookuptable[encoderInputs.flatten()].reshape((encoderInputs.shape[0], encoderInputs.shape[1], emb_size))
		for _ in range(self.lstm_layers_num):
			
			enclstm_f = LSTM(emb_size, self.en_hidden_size)
			enclstm_b = LSTM(emb_size, self.en_hidden_size, True)
			self.encoder_lstm_layers.append(enclstm_f)    #append
			self.encoder_lstm_layers.append(enclstm_b)    #append
			self.params += enclstm_f.params + enclstm_b.params   #concatenate
			
			hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
			hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)
			
			hs = tensor.concatenate([hs_f, hs_b], axis=2)
			Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
			hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
			Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1) 			
			#self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
			#self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
			self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size),			
			self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size),
			state_below = hs

		Encoder = state_below
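		# "Encoder" holds the top bidirectional layer's states, shape
		# (max_sent_size, batch_size, 2*en_hidden_size); hos/Cos are zero-initialised
		# decoder start states (the commented lines above projected encoder states instead)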

		ei, di, dt = tensor.imatrices(3)    #place holders
		em, dm, tf, di0 = tensor.fmatrices(4)


		self.encoder_function = theano.function(inputs=[ei, em], outputs=Encoder, givens={encoderInputs:ei, encoderMask:em})
		
		#####################################################
		#####################################################
		state_below = self.de_lookuptable[decoderInputs.flatten()].reshape((decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size))
		for i in range(self.lstm_layers_num):
			declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
			self.decoder_lstm_layers += declstm,    #append
			self.params += declstm.params    #concatenate
			ho, Co = self.hos[i], self.Cos[i]
			state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)
		
		
		##### Here we include the representation from the decoder	
		decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2)

		linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
		softmax_outputs, _  = theano.scan(
			fn=lambda x: tensor.nnet.softmax(x),
			sequences=[linear_outputs],
			)
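		# T.nnet.softmax operates on 2-D matrices, so the scan applies it
		# independently to each time step of linear_outputs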

		def _NLL(pred, y, m):
			return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y])

		costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
		loss = costs.sum() / decoderMask.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params)

		updates = lasagne.updates.adam(loss, self.params, self.eta)
		#updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

		###################################################
		#### using the ground truth when training
		##################################################
		self._train = theano.function(
			inputs=[ei, em, di, dm, dt],
			outputs=[loss, softmax_outputs],
			updates=updates,
			givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt}
			)


		#########################################################################
		### For schedule sampling
		#########################################################################
	
		
		###### always use the previous prediction as the next input
		def _step2(ctx_, state_, hs_, Cs_):
			### ctx_: b x h
			### state_: b x h
			### hs_: 1 x b x h    the first dimension is the number of the decoder layers
			### Cs_: 1 x b x h    the first dimension is the number of the decoder layers

			hs, Cs = [], []
			token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
			msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
			msk_ = msk_.dimshuffle('x', 0)
			state_below0 = self.de_lookuptable[token_idxs].reshape((1, ctx_.shape[0], self.de_hidden_size))
			for i, lstm in enumerate(self.decoder_lstm_layers):
				h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])    #mind msk
				hs += h[-1],
				Cs += C[-1],
				state_below0 = h

			hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
			state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
			state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
			newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
			state_below = tensor.nnet.softmax(newpred)

			##### the begin symbol probability is 0
			extra_p = tensor.zeros_like(hs[:, :, 0])
			state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

			return state_below, hs, Cs
		 
		ctx_0, state_0 = tensor.fmatrices(2)
		hs_0 = tensor.ftensor3()
		Cs_0 = tensor.ftensor3()

		state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
		self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0], [state_below_tmp, hs_tmp, Cs_tmp], name='f_next')
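		# f_next is intended to be called step by step at decode time, e.g. (illustrative only):
		#   state, hs, Cs = self.f_next(ctx_t, state, hs, Cs)
		# feeding each prediction back in as the next decoder input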

		hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0")
		train_outputs, _ = theano.scan(
			fn=_step2,
			sequences=[Encoder],
			outputs_info=[decoderInputs0, hs0, Cs0],
			n_steps=encoderInputs.shape[0]
			)
		
		train_predict = train_outputs[0]
		train_costs, _ = theano.scan(fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])
		
		train_loss = train_costs.sum() / decoderMask.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params)

		#from adam import adam
		#train_updates = adam(train_loss, self.params, self.eta)
		#train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
		#train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta)
		#train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)
		from momentum import momentum
		train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9)
		
		self._train2 = theano.function(
			inputs=[ei, em, di0, dm, dt],
			outputs=[train_loss, train_predict],
			updates=train_updates,
			givens={encoderInputs:ei, encoderMask:em, decoderInputs0:di0, decoderMask:dm, decoderTarget:dt}
			#givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
			)
		
		listof_token_idx = train_predict.argmax(axis=-1)
		self._utter = theano.function(
			inputs=[ei, em, di0],
			outputs=listof_token_idx,
			givens={encoderInputs:ei, encoderMask:em, decoderInputs0:di0}
			)
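A minimal NumPy sketch of the masked NLL cost computed by _NLL and theano.scan above (illustrative only, not part of the original class; assumes time-major arrays):

import numpy as np

def masked_nll(softmax_outputs, targets, mask):
    # softmax_outputs: (T, batch, n_labels); targets, mask: (T, batch)
    T_, B = targets.shape
    logp = np.log(softmax_outputs[np.arange(T_)[:, None],
                                  np.arange(B)[None, :],
                                  targets])
    # average over the unmasked (non-padding) positions only
    return -(mask * logp).sum() / mask.sum()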
Example #58
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # T x batch_size
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]

    # get the transformed image feature
    h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))

    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    h_from_lstm, c_encode = lstm_layer(shared_params, input_emb, input_mask,
                                    h_0, c_0, options, prefix='sent_lstm')
    # pick the last one as encoder
    
    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act',
                                                   'tanh'))
    r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    h_encode = wbw_attention_layer(shared_params, 
                                   image_feat_down,
                                   h_from_lstm, 
                                   input_mask,
                                   r_0, options,
                                   prefix='wbw_attention'  )
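    # keep only the final step of the attention-weighted encoding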
    h_encode = h_encode[-1]
    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    h_encode_attention_1 = fflayer(shared_params, h_encode, options,
                                   prefix='sent_att_mlp_1',
                                   act_func=options.get('sent_att_mlp_act',
                                                        'tanh'))
    combined_feat_attention_1 = image_feat_attention_1 + \
                                h_encode_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)
    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=options.get(
                                            'combined_att_mlp_act',
                                            'tanh'))
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])
    image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1)

    combined_hidden_1 = image_feat_ave_1 + h_encode
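    # combined_hidden_1 (attended image feature + question encoding) serves as
    # the query for the second attention hop below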

    # second layer attention model
    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    h_encode_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                   prefix='sent_att_mlp_2',
                                   act_func=options.get('sent_att_mlp_act',
                                                        'tanh'))
    combined_feat_attention_2 = image_feat_attention_2 + \
                                h_encode_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=options.get(
                                            'combined_att_mlp_act', 'tanh'))
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])

    image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + h_encode
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1

    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d'%(i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng,
                                            drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func=options.get('combined_mlp_act_%d'%(i),
                                                           'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu
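A small NumPy sketch of the attention pooling used twice in build_model (illustrative only; assumes image_feat_down has shape (batch, n_regions, n_dim) and scores has shape (batch, n_regions)):

import numpy as np

def attention_pool(scores, region_feats):
    # softmax over the region axis
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    prob = e / e.sum(axis=1, keepdims=True)
    # probability-weighted average of the region features,
    # mirroring prob_attention_* and image_feat_ave_* above
    return (prob[:, :, None] * region_feats).sum(axis=1)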
Example #59
import lasagne
from recognizer import Recognizer

BATCH_SIZE = 16
EMB_DIM = 256
SPEAKER_DIM = 128
ENC_DIM = 128
V = 43 + 1
NB_EPOCHS = 40
N_SPEAKERS = 21 + 1
OUTPUT_DIM = 63
LR = 0.001
SAVE_FILE_NAME = 'blizzard_cnn_mapper.pkl'
WEIGHTNORM = True

X = T.ftensor3()
mask = T.fmatrix()
ctx = T.imatrix()
learn_rate = T.fscalar()


def RecurrentMapper(ctx):
    emb_ctx = lib.ops.Embedding('Mapper.Generator.Embedding_Context', V,
                                ENC_DIM, ctx)
    batch_size = T.shape(ctx)[0]
    seq_len = T.shape(ctx)[1]
    out = lib.ops.BiGRU('Mapper.Generator.BiGRU', emb_ctx, ENC_DIM, 256)
    readout = lib.ops.Linear('Mapper.Generator.FC', out, 512, EMB_DIM)
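    # the 512 input size presumably matches the concatenated forward/backward
    # GRU states (2 * 256) produced by BiGRU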
    return readout

Example #60
    # Constants
    num_train = 5000
    num_test = 500
    arch_size = [None, 110, 2]

    #  Set filename
    if args.exp=='task2':
        comb_filename = '{}_task2_{}_bn_{}_{}'.format(args.filename, args.model_type, args.batch_norm, args.seed)
    else:
        comb_filename = '{}_task1_{}_bn_{}_reg_samp_{}_samp_res_{}_{}'.format(args.filename, args.model_type,
            args.batch_norm, args.sample_regularly, args.sample_res, args.seed)
    if args.run_id != '':
        comb_filename += '_{}'.format(args.run_id)

    # Create symbolic vars
    input_var       = T.ftensor3('my_input_var')
    mask_var        = T.bmatrix('my_mask')
    target_var      = T.ivector('my_targets')
    time_var        = T.fmatrix('my_timevar')
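    # presumably: input_var holds the feature sequences, mask_var marks the valid
    # time steps, target_var the class labels, and time_var the sampling times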

    # Build model
    print("Building network ...")
    #   Get input dimensions
    network = get_rnn(input_var, mask_var, time_var, arch_size, args.grad_clip,
        bn=args.batch_norm, model_type=args.model_type)
    # Instantiate log
    log = defaultdict(list)
    print("Built.")

    # Resume if desired
    if args.resume: