Example #1
    def __init__(self,
                 emb_dim,
                 vocab_size,
                 num_slots,
                 max_sent_len,
                 optimiser=lasagne.updates.adam):
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.num_slots = num_slots  # number of hidden state units
        self.cell = RenCell(self.emb_dim, self.num_slots)
        self.optimiser = optimiser

        # Placeholders for input
        self.Stories = T.ltensor3(name='Stories')  # Num_stories x T x K_max
        self.Queries = T.ltensor3(
            name='Queries')  # Num_stories X Num_queries X K_max
        self.Indices = T.lmatrix(name="Indices")  # Num_Stories X Num_queries
        self.Answers = T.lmatrix(name='Answers')  # Num_stories X Num_queries

        # Data Set dimensions
        self.N = T.shape(self.Stories)[0]
        self.K = max_sent_len

        # Combine cell parameters with all the other parameters and init
        self.params = self.cell.params
        self.params.update(self._initialise_weights())

        # Build the Computation Graph and get the training function
        self._create_network(self.params)
        self.train_func = self._get_train_func()
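A note on the placeholders above: T.ltensor3 and T.lmatrix only declare symbolic int64 variables; concrete integer arrays are supplied when a compiled function is called. A minimal, self-contained sketch of that pattern (the toy graph and shapes below are illustrative, not part of the model above):

import numpy as np
import theano
import theano.tensor as T

stories = T.ltensor3('Stories')   # stories x sentences x words (int64 ids)
answers = T.lmatrix('Answers')    # stories x queries (int64 ids)

# Toy graph: count, per story, how many word ids exceed its first answer id.
hits = (stories > answers[:, 0].dimshuffle(0, 'x', 'x')).sum(axis=2).sum(axis=1)
count_fn = theano.function([stories, answers], hits)

stories_val = np.random.randint(0, 10, size=(2, 3, 4)).astype('int64')
answers_val = np.random.randint(0, 10, size=(2, 1)).astype('int64')
print(count_fn(stories_val, answers_val))   # one count per story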
Example #2
    def build_graph(self):
        # theano variables
        iw_b = T.lmatrix('iw_b')
        ic_b = T.ltensor3('ic_b')
        it_b = T.lmatrix('it_b')
        il_b = T.lmatrix('il_b')
        v_b = T.lmatrix('v_b')  # valid action mask
        y_b = T.lvector('y_b')  # index of the correct action from oracle

        steps = T.lscalar('steps')  # number of steps
        lr = self.args.learn_rate * self.args.decay**T.cast(
            T.floor(steps / 2000.), 'float32')

        iw, ic, it, il, self.actor = self.get_actor(False)
        iw_avg, ic_avg, it_avg, il_avg, self.actor_avg = self.get_actor(True)

        actor_prob = L.get_output(self.actor_avg, {
            iw_avg: iw_b,
            ic_avg: ic_b,
            it_avg: it_b,
            il_avg: il_b
        },
                                  deterministic=True)
        actor_rest = actor_prob * T.cast(
            v_b, theano.config.floatX
        )  # mask the probabilities of invalid actions to 0
        actor_pred = T.argmax(actor_rest, 1)
        self.actor_predict = theano.function([v_b, iw_b, ic_b, it_b, il_b],
                                             actor_pred,
                                             on_unused_input='ignore')

        y_hat = L.get_output(self.actor, {
            iw: iw_b,
            ic: ic_b,
            it: it_b,
            il: il_b
        },
                             deterministic=False)
        xent = T.mean(lasagne.objectives.categorical_crossentropy(y_hat, y_b))
        reg = lasagne.regularization.regularize_network_params(
            L.get_all_layers(self.actor), lasagne.regularization.l2)
        cost = xent + self.args.reg_rate * reg
        correct = T.eq(T.argmax(y_hat, 1), y_b).sum()

        params = L.get_all_params(self.actor)
        avg_params = L.get_all_params(self.actor_avg)
        grads = T.grad(cost, params)
        if self.args.grad_norm:
            grads, norm = lasagne.updates.total_norm_constraint(
                grads, self.args.grad_norm, return_norm=True)

        updates = lasagne.updates.momentum(grads, params, lr,
                                           self.args.momentum)
        updates = apply_moving_average(params, avg_params, updates, steps,
                                       0.9999)

        inputs = [steps, y_b, v_b, iw_b, ic_b, it_b, il_b]
        self.train_actor_supervised = theano.function(inputs, [correct, cost],
                                                      updates=updates,
                                                      on_unused_input='ignore')
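The invalid-action handling above (multiply the softmax output by a 0/1 validity mask, then argmax) can be reproduced in isolation. A minimal sketch, assuming the probabilities and the mask are given directly rather than produced by the lasagne actor network:

import numpy as np
import theano
import theano.tensor as T

probs = T.matrix('probs')    # batch x num_actions, action probabilities
valid = T.lmatrix('valid')   # batch x num_actions, 1 = action allowed

masked = probs * T.cast(valid, theano.config.floatX)   # invalid actions -> 0
pred = T.argmax(masked, axis=1)                         # best valid action per row
predict = theano.function([probs, valid], pred)

p = np.array([[0.6, 0.3, 0.1]], dtype=theano.config.floatX)
v = np.array([[0, 1, 1]], dtype='int64')                # action 0 is not allowed
print(predict(p, v))                                    # -> [1]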
Example #3
File: test.py  Project: gumaojie/morphlm
def test10():
    src = T.ltensor3("src")
    tgt = T.lmatrix("tgt")
    mask = T.matrix("mask")
    prd = T.matrix("prd")
    n_hids, vocab_size = 3, 60
    hs = HierarchicalSoftmax(src, n_hids, vocab_size)
    #prd = hs.test()
    res = hs.cost(tgt, mask)
    x = [
            [[1,1,1],[2,2,2],[3,3,3],[4,4,4]],
            [[3,3,3],[4,4,4],[5,5,5],[6,6,6]]
        ]
    y = [
            [1,1,1,1],
            [1,1,1,1]
        ]
    m = [
            [1,1,0,0],
            [1,1,0,0]
        ]
    fn3 = theano.function(inputs=[src,tgt,mask], outputs=[res], on_unused_input='ignore')
    res = fn3(x,y,m)
    print res, res[0].shape
    x_a = np.array(x)
    print x_a.shape, x_a[y]
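The mask passed to hs.cost follows the usual padding convention (1 for real tokens, 0 for padded positions), so padded positions should not contribute to the cost. HierarchicalSoftmax itself is project code, but the masking idea can be sketched on its own with a generic per-token cost:

import numpy as np
import theano
import theano.tensor as T

per_token_cost = T.matrix('per_token_cost')   # (batch, seq_len) loss per position
mask = T.matrix('mask')                       # (batch, seq_len), 1 = real, 0 = pad

masked_mean = (per_token_cost * mask).sum() / mask.sum()
cost_fn = theano.function([per_token_cost, mask], masked_mean)

c = np.array([[1., 2., 3., 4.], [1., 1., 9., 9.]], dtype=theano.config.floatX)
m = np.array([[1., 1., 0., 0.], [1., 1., 0., 0.]], dtype=theano.config.floatX)
print(cost_fn(c, m))                          # -> 1.25; the padded 3, 4, 9, 9 are ignored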
Example #4
    def __init__(self, data, config, fast_predict=False):
        self.embedding_shapes = data.embedding_shapes
        self.lstm_type = config.lstm_cell
        self.lstm_hidden_size = int(config.lstm_hidden_size)
        self.num_lstm_layers = int(config.num_lstm_layers)
        self.max_grad_norm = float(config.max_grad_norm)

        self.vocab_size = data.word_dict.size()
        self.label_space_size = data.label_dict.size()
        self.unk_id = data.unk_id

        # Initialize layers and parameters
        self.embedding_layer = EmbeddingLayer(data.embedding_shapes,
                                              data.embeddings)
        self.params = [p for p in self.embedding_layer.params]

        self.rnn_layers = [None] * self.num_lstm_layers
        for l in range(self.num_lstm_layers):
            input_dim = self.embedding_layer.output_size if l == 0 else self.lstm_hidden_size
            input_dropout = config.input_dropout_prob if (
                config.per_layer_dropout or l == 0) else 0.0
            recurrent_dropout = config.recurrent_dropout_prob

            self.rnn_layers[l] = get_rnn_layer(self.lstm_type)(
                input_dim,
                self.lstm_hidden_size,
                input_dropout_prob=input_dropout,
                recurrent_dropout_prob=recurrent_dropout,
                fast_predict=fast_predict,
                prefix='lstm_{}'.format(l))
            self.params.extend(self.rnn_layers[l].params)

        self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size,
                                          self.label_space_size)
        self.params.extend(self.softmax_layer.params)

        # Build model
        # Shape of x: [seq_len, batch_size, num_features]
        self.x0 = tensor.ltensor3('x')
        self.y0 = tensor.lmatrix('y')
        self.mask0 = tensor.matrix('mask', dtype=floatX)
        self.is_train = tensor.bscalar('is_train')

        self.x = self.x0.dimshuffle(1, 0, 2)
        self.y = self.y0.dimshuffle(1, 0)
        self.mask = self.mask0.dimshuffle(1, 0)

        self.inputs = [None] * (self.num_lstm_layers + 1)
        self.inputs[0] = self.embedding_layer.connect(self.x)
        self.rev_mask = self.mask[::-1]

        for l, rnn in enumerate(self.rnn_layers):
            outputs = rnn.connect(self.inputs[l],
                                  self.mask if l % 2 == 0 else self.rev_mask,
                                  self.is_train)
            self.inputs[l + 1] = outputs[::-1]

        self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
        self.pred0 = self.pred.reshape([self.x.shape[0],
                                        self.x.shape[1]]).dimshuffle(1, 0)
    def get_distribution_by_ctx_emb_function(self):
        """ Return predictions and scores of shape [batch_size, time_steps, label space size].
        Used at test time.
    """
        inputs_0 = tensor.ltensor3('inputs_0')

        self.inputs = [None] * (self.num_lstm_layers + 1)
        self.inputs[0] = inputs_0
        self.rev_mask = self.mask[::-1]

        for l, rnn in enumerate(self.rnn_layers):
            outputs = rnn.connect(self.inputs[l],
                                  self.mask if l % 2 == 0 else self.rev_mask,
                                  self.is_train)
            self.inputs[l + 1] = outputs[::-1]

        self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
        self.pred0 = self.pred.reshape(
            [self.mask.shape[0], self.mask.shape[1]]).dimshuffle(1, 0)

        # (sent_len, batch_size, label_space_size) --> (batch_size, sent_len, label_space_size)
        scores0 = self.scores.reshape([
            self.inputs[0].shape[0], self.inputs[0].shape[1],
            self.label_space_size
        ]).dimshuffle(1, 0, 2)

        return theano.function([inputs_0, self.mask0], [self.pred0, scores0],
                               name='f_ctx_gemb_pred',
                               allow_input_downcast=True,
                               on_unused_input='warn',
                               givens=({
                                   self.is_train: numpy.cast['int8'](0)
                               }))
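The dimshuffle(1, 0, 2) / dimshuffle(1, 0) calls in the constructor above reorder the batch-major inputs into the time-major layout the recurrent layers consume. A standalone sketch of just that reordering, with made-up toy shapes:

import numpy as np
import theano
import theano.tensor as T

x0 = T.ltensor3('x')        # [batch, seq_len, num_features], int64 ids
mask0 = T.matrix('mask')    # [batch, seq_len]

x = x0.dimshuffle(1, 0, 2)      # -> [seq_len, batch, num_features]
mask = mask0.dimshuffle(1, 0)   # -> [seq_len, batch]
shapes = theano.function([x0, mask0], [x.shape, mask.shape])

xv = np.zeros((8, 20, 3), dtype='int64')
mv = np.ones((8, 20), dtype=theano.config.floatX)
print(shapes(xv, mv))           # -> [array([20,  8,  3]), array([20,  8])]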
Example #6
    def arch_memnet_selfsup(self):
        '''
        memory net with self supervision.
        '''
        contexts = T.ltensor3('contexts')
        querys = T.lmatrix('querys')
        yvs = T.lmatrix('yvs')

        params = []
        question_layer = Embed(self.vocab_size, self.hidden_dim)
        q = T.reshape(question_layer(querys.flatten()),
                      (self.batchsize, self.sen_maxlen, self.hidden_dim)
                      )
        if self.kwargs.get('position_encoding'):
            lmat = position_encoding(self.sen_maxlen, self.hidden_dim).dimshuffle('x', 0, 1)
            print '[memory network] use PE'
            q = q * lmat
        u = mean(q, axis=1)
        params.extend(question_layer.params)

        mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size, self.vocab_size, self.hidden_dim,
                                **self.kwargs)
        probs = mem_layer.get_probs(contexts, u).dimshuffle(0, 2)

        inputs = {
            'contexts': contexts,
            'querys': querys,
            'yvs': yvs,
            'cvs': T.lmatrix('cvs')
        }
        return (probs, inputs, params)
    def get_eval_with_gemb_function(self):
        inputs_0 = tensor.ltensor3('inputs_0')

        self.inputs = [None] * (self.num_lstm_layers + 1)
        self.inputs[0] = inputs_0
        self.rev_mask = self.mask[::-1]

        for l, rnn in enumerate(self.rnn_layers):
            outputs = rnn.connect(self.inputs[l],
                                  self.mask if l % 2 == 0 else self.rev_mask,
                                  self.is_train)
            self.inputs[l + 1] = outputs[::-1]

        self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
        self.pred0 = self.pred.reshape(
            [self.mask.shape[0], self.mask.shape[1]]).dimshuffle(1, 0)

        # (sent_len, batch_size, label_space_size) --> (batch_size, sent_len, label_space_size)
        scores0 = self.scores.reshape([
            self.inputs[0].shape[0], self.inputs[0].shape[1],
            self.label_space_size
        ]).dimshuffle(1, 0, 2)

        loss = CrossEntropyLoss().connect(self.scores, self.mask, self.y)
        return theano.function([inputs_0, self.mask0, self.y0],
                               [self.pred0, loss],
                               name='f_gemb_eval',
                               allow_input_downcast=True,
                               on_unused_input='warn',
                               givens=({
                                   self.is_train: numpy.cast['int8'](0)
                               }))
Example #8
 def make_node(self, prediction, prediction_mask, groundtruth,
               groundtruth_mask):
     prediction = tensor.as_tensor_variable(prediction)
     prediction_mask = tensor.as_tensor_variable(prediction_mask)
     groundtruth = tensor.as_tensor_variable(groundtruth)
     groundtruth_mask = tensor.as_tensor_variable(groundtruth_mask)
     return theano.Apply(
         self, [prediction, prediction_mask, groundtruth, groundtruth_mask],
         [tensor.ltensor3()])
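Here tensor.ltensor3() is called with no name simply to declare the output type of the custom Op: an anonymous 3-D int64 tensor variable. A tiny sketch of what that call produces (nothing beyond Theano itself is assumed):

import theano.tensor as tensor

out = tensor.ltensor3()        # anonymous variable of a 3-D int64 TensorType
print((out.dtype, out.ndim))   # -> ('int64', 3)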
Example #9
 def test_inc_wrong_rank(self):
     self.assertRaises(TypeError,
           sparse_gram_inc, self.base, self.amt, self.i0, tensor.lmatrix())
     self.assertRaises(TypeError,
           sparse_gram_inc, self.base, self.amt, tensor.lscalar(), self.i1)
     self.assertRaises(TypeError,
           sparse_gram_inc, self.base, tensor.ltensor3(), self.i0, self.i1)
     self.assertRaises(TypeError,
           sparse_gram_inc, tensor.vector(), self.amt, self.i0, self.i1)
def test_language_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]),
                        dtype=floatX)
        return data, mask
    words_val, mask_val = make_data_and_mask([['p', 'e', 'a', ], ['a', 'e', 'p',]])
    mask_val[1,2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val
    mask_def_emb_val = np.asarray([[0, 1], [0,0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(), 
        vocab=vocab, retrieval=retrieval,
        compose_type='transform_and_sum',
        weights_init=Uniform(width=0.1),
        biases_init=Uniform(width=0.1))
    lm.initialize()
    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)
    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)
    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)
    for v,p in zip(perplexities_v,perplexities):
        print p.name, ":", v
    assert(np.allclose(mask_def_v, mask_def_emb_val))
Example #11
    def arch_memnet_lexical(self):
        '''
        each memory slot is a lexical.
        '''
        contexts = T.ltensor3('contexts')
        querys = T.lmatrix('querys')
        yvs = T.lvector('yvs')
        hop = 1

        params = []
        question_layer = Embed(self.vocab_size, self.hidden_dim)
        q = T.reshape(question_layer(querys.flatten()),
                      (self.batchsize, self.sen_maxlen, self.hidden_dim)
                      )
        if self.kwargs.get('position_encoding'):
            lmat = position_encoding(self.sen_maxlen, self.hidden_dim).dimshuffle('x', 0, 1)
            print '[memory network] use PE'
            q = q * lmat
        u = mean(q, axis=1)
        params.extend(question_layer.params)

        mem_layers = []
        for hi in range(hop):
            mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size, self.vocab_size, self.hidden_dim,
                                    **self.kwargs)
            params.extend(mem_layer.params)
            mem_layers.append(mem_layer)
            o = mem_layer(contexts, u)
            u = u + o

        linear = LinearLayer(self.hidden_dim, self.vocab_size)
        params.extend(linear.params)
        probs = softmax(linear(u))
        inputs = {
            'contexts': contexts,
            'querys': querys,
            'yvs': yvs,
            'cvs': T.lmatrix('cvs')
        }
        return (probs, inputs, params)
Example #12
    def arch_lstmq(self, param_b=2):

        contexts = T.ltensor3('contexts')
        querys = T.lmatrix('querys')
        yvs = T.lvector('yvs')

        params = []
        question_layer = Embed(self.vocab_size, self.hidden_dim)
        params.extend(question_layer.params)
        q = T.reshape(question_layer(querys.flatten()),
                      (self.batchsize, self.sen_maxlen, self.hidden_dim)
                      )
        lmat = position_encoding(self.sen_maxlen, self.hidden_dim).dimshuffle('x', 0, 1)
        q = q * lmat
        u = mean(q, axis=1)


        embed_layer = Embed(self.vocab_size, self.hidden_dim)
        params.extend(embed_layer.params)
        lmat = position_encoding(self.unit_size, self.hidden_dim).dimshuffle('x', 'x', 0, 1)
        m = T.reshape(embed_layer(contexts.flatten()), (self.batchsize, self.mem_size, self.unit_size, self.hidden_dim))
        m = mean(m * lmat, axis=2)

        lstm = LSTMq(self.batchsize, self.hidden_dim)
        params.extend(lstm.params)
        o = lstm(m.dimshuffle(1, 0, 2), u)

        linear = LinearLayer(self.hidden_dim, self.vocab_size)
        params.extend(linear.params)
        probs = softmax(linear(o))

        inputs = {
            'contexts': contexts,
            'querys': querys,
            'yvs': yvs,
            'cvs': T.lmatrix('cvs')
        }
        return (probs, inputs, params)
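position_encoding is project code, but the dimshuffle('x', 0, 1) applied to its result is plain Theano: it inserts a broadcastable leading axis so that a (sen_maxlen, hidden_dim) weighting matrix multiplies every example in a (batch, sen_maxlen, hidden_dim) embedding tensor. A minimal sketch with an arbitrary weighting matrix standing in for the position encoding:

import numpy as np
import theano
import theano.tensor as T

q = T.tensor3('q')        # (batch, sen_maxlen, hidden_dim) embedded question
lmat = T.matrix('lmat')   # (sen_maxlen, hidden_dim) position weights

weighted = q * lmat.dimshuffle('x', 0, 1)   # broadcast over the batch axis
u = weighted.mean(axis=1)                   # (batch, hidden_dim) sentence vectors
encode = theano.function([q, lmat], u)

qv = np.ones((2, 4, 3), dtype=theano.config.floatX)
lv = np.arange(12, dtype=theano.config.floatX).reshape(4, 3)
print(encode(qv, lv).shape)                 # -> (2, 3)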
Example #13
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    #mode = 'TRAIN_DATA'
    #mode = 'TRAIN_NO_OVERLAP'
    #if len(sys.argv) > 1:
    #    mode = sys.argv[1]
    #    if not mode in ['TRAIN', 'TRAIN-ALL']:
    #        print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
    #        sys.exit(1)

    mode = 'k_time_data1'.upper()

    print "Running training in the {} setting".format(mode)

    position_num = 10
    select_model = "PSCM"
    if select_model == "PSCM":
        click_model_index = 4  #PSCM
    elif select_model == "UBM":
        click_model_index = 1
    else:
        raise "MODEL SELECT ERROR!"
    data_dir = mode

    add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy'))
    q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
    a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
    y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy'))
    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy'))
    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    #multi dim

    #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0]
    #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0]
    #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0]

    #y_train = y_train_tmp
    #y_dev = y_dev_tmp
    #y_test = y_test_tmp

    max_query_id = numpy.max([
        numpy.max(add_train[:, 0]),
        numpy.max(add_test[:, 0]),
        numpy.max(add_dev[:, 0])
    ])
    max_url_id = numpy.max([
        numpy.max(add_train[:, 1:]),
        numpy.max(add_test[:, 1:]),
        numpy.max(add_dev[:, 1:])
    ])

    print 'max_query_id', max_query_id
    print 'max_url_id', max_url_id

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[2]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    #ndim = 5
    #print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    #dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #print "Gaussian"
    #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    #x_q_overlap = T.lmatrix('q_overlap')
    #x_a = T.lmatrix('a')
    x_a_all = T.ltensor3('a_all')
    #x_a_overlap = T.lmatrix('a_overlap')
    #y = T.ivector('y')
    y = T.imatrix('y')
    add_info = T.dmatrix('add_info')

    #######
    n_outs = 2

    n_epochs = 15
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
    lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    #nnet_q.set_input((x_q, x_q_overlap))
    nnet_q.set_input([x_q])
    ######

    ###### ANSWER ######
    nnet_a_list = []
    #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    for i in xrange(position_num):
        #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
        #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

        # num_input_channels = len(lookup_table.layers)
        #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        input_shape = (batch_size, num_input_channels,
                       a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        conv_layers = []
        for filter_width in a_filter_widths:
            filter_shape = (nkernels, num_input_channels, filter_width, ndim)
            conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                         filter_shape=filter_shape,
                                         input_shape=input_shape)
            non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                        activation=activation)
            pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
            conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
                layers=[conv, non_linearity, pooling])
            conv_layers.append(conv2dNonLinearMaxPool)

        join_layer = nn_layers.ParallelLayer(layers=conv_layers)
        flatten_layer = nn_layers.FlattenLayer()

        nnet_a = nn_layers.FeedForwardNet(layers=[
            lookup_table,
            join_layer,
            flatten_layer,
        ])
        #nnet_a.set_input((x_a, x_a_overlap))
        nnet_a.set_input([x_a_all[:, i, :]])
        nnet_a_list.append(nnet_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    print "q_logistic_n_in, ", q_logistic_n_in
    print "a_logistic_n_in, ", a_logistic_n_in

    #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(
        q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num)
    pairwise_out_list = [nnet_q.output]
    for i in xrange(position_num):
        pairwise_out_list.append(nnet_a_list[i].output)
    pairwise_layer.set_input(pairwise_out_list)
    #pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num
    #n_in = 1 * position_num + position_num * (position_num - 1) / 2
    n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (
        position_num - 1) / 2
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    #classifier.set_input(hidden_layer.output)

    classifier = nn_layers.FeatureClickModelLayer(
        n_in=n_in,
        n_out=n_outs,
        max_q_id=max_query_id,
        max_u_id=max_url_id,
        dim=position_num,
        click_model_index=click_model_index)
    #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num)
    #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num)
    #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs)
    classifier.set_input([hidden_layer.output, add_info])

    #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
    #                                      name="Training nnet")
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q] + nnet_a_list +
        [pairwise_layer, hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    #print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        select_model, mode, ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    #total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    #print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred

    #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2]
    predictions_prob = test_nnet.layers[-1].p_y_given_x

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    #batch_x_a = T.lmatrix('batch_x_a')
    batch_x_a_all = T.ltensor3('batch_x_a_all')
    #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    #batch_y = T.ivector('batch_y')
    batch_y = T.imatrix('batch_y')
    batch_add_info = T.dmatrix('batch_add_info')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a_all,
        batch_add_info,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        add_info: batch_add_info,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a_all,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
        batch_add_info,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        add_info: batch_add_info,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return (real_preds[:batch_iterator.n_samples],
                inner_outputs[:batch_iterator.n_samples])

    def predict_prob_batch(batch_iterator):
        #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return (real_preds[:batch_iterator.n_samples],
                inner_outputs[:batch_iterator.n_samples])

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_train, a_train, add_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, add_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, add_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test[:, -1]))
    print 'labels', labels

    def perplexity_score(labels, preds):
        positionPerplexity = [0.0] * position_num
        positionPerplexityClickSkip = [[0.0, 0.0]
                                       for i in xrange(position_num)]
        counts = [0] * position_num
        countsClickSkip = [[0, 0] for i in xrange(position_num)]
        for label, pred in zip(labels, preds):
            for i in range(0, len(label)):
                click = 1 if label[i] else 0
                tmp_pred = max(min(pred[i], 0.99999), 0.00001)
                logProb = math.log(tmp_pred, 2)
                if click == 0:
                    logProb = math.log(1 - tmp_pred, 2)
                positionPerplexity[i] += logProb
                positionPerplexityClickSkip[i][click] += logProb
                counts[i] += 1
                countsClickSkip[i][click] += 1
        positionPerplexity = [
            2**(-x / count if count else x)
            for (x, count) in zip(positionPerplexity, counts)
        ]
        positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \
                for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)]
        perplexity = sum(positionPerplexity) / len(positionPerplexity)
        ret_str = "---------\n"
        ret_str += "Perplexity\t" + str(perplexity) + "\n"
        ret_str += "positionPerplexity"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexity[i])
        ret_str += "\n"

        ret_str += "positionPerplexitySkip"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[0][i])
        ret_str += "\n"

        ret_str += "positionPerplexityClick"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[1][i])
        ret_str += "\n------------\n"
        #print ret_str
        return perplexity, ret_str

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    best_dev_perp = numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, add, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator)
                #print "shape:"
                #print str(y_dev.shape)
                #print str(y_pred_dev.shape)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev[:, -1],
                                                y_pred_dev[:, -1]) * 100
                dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev)
                if dev_acc > best_dev_acc:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc

                if dev_perp < best_dev_perp:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    test_perplexity, test_perplexity_str = perplexity_score(
                        y_test, y_pred)
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc,
                                dev_perp, best_dev_perp))
                    print str(test_perplexity_str)
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    best_inner = y_inner
                    no_best_dev_update = 0
                    best_dev_perp = dev_perp
        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        numpy.savetxt(
            os.path.join(
                nnet_outdir,
                'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'
                .format(epoch, i, best_dev_perp)), best_inner)
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100
    test_perp, test_perp_str = perplexity_score(y_test, y_pred_test)
    print "FINAL ACCURACY"
    print str(test_acc)
    print "FINAL PERPLEXITY"
    print str(test_perp_str)
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred_test)
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'
            .format(epoch, i, best_dev_acc)), best_inner)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
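The perplexity_score helper above sums log2 of the predicted click probability (or of 1 - p for skips) per rank position and reports 2 to the power of the negated average. Reduced to a single position, the core computation looks like the following sketch (position_perplexity is a hypothetical name, not part of the code above):

import math

def position_perplexity(clicks, probs, eps=1e-5):
    # 2 ** (-mean log2 likelihood) of the observed clicks at one position
    total = 0.0
    for click, p in zip(clicks, probs):
        p = max(min(p, 1.0 - eps), eps)            # clamp exactly as above
        total += math.log(p if click else 1.0 - p, 2)
    return 2 ** (-total / len(clicks))

print(position_perplexity([1, 0, 1], [0.9, 0.2, 0.7]))   # ~ 1.26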
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
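The words.tag.test_value assignment above only has an effect when theano.config.compute_test_value is switched on; it lets Theano shape-check every expression on a concrete int64 batch while the graph is being built. A minimal, standalone sketch of the same mechanism:

import numpy as np
import theano
import theano.tensor as T

theano.config.compute_test_value = 'warn'   # evaluate test values while building

words = T.ltensor3('words')
words.tag.test_value = np.zeros((4, 5, 3), dtype='int64')   # batch x length x chars

lengths = (words >= 0).sum(axis=2)          # eagerly evaluated on the test value
print(lengths.tag.test_value.shape)         # -> (4, 5)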
Example #15
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "save_path", default="sine",
        help="The part to save PyLearn2 model")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Start training from scratch")
    args = parser.parse_args()

    num_states = ChainDataset.num_states

    if args.mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("Parameters:\n" +
                     pprint.pformat(
                         [(key, value.get_value().shape) for key, value
                          in Selector(generator).get_params().items()],
                         width=120))
        logger.debug("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.debug("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(args.save_path) and not args.reset:
            model = Pylearn2Model.load(args.save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: would be probably nicer to make cost part of the model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=args.save_path, save_freq=10)
        train.main_loop()
    elif args.mode == "sample":
        model = Pylearn2Model.load(args.save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
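The transition-frequency check at the end of the sampling branch (count consecutive output pairs, then row-normalise) is plain NumPy and works independently of the model. A short sketch with a toy output sequence:

import numpy as np

outputs = np.array([0, 1, 1, 2, 0, 1, 2, 2])
num_states = 3

trans_freqs = np.zeros((num_states, num_states))
for a, b in zip(outputs, outputs[1:]):              # count consecutive state pairs
    trans_freqs[a, b] += 1
trans_freqs /= trans_freqs.sum(axis=1)[:, None]     # row-normalise to probabilities
print(trans_freqs)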
Example #16
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=10000, emb_size=50,
                    margin=0.3, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=16450007, neg_size=60, test_neg_size=300,
                    comment=''):#L1Distance_
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/freebase-subsets/'
    rng = numpy.random.RandomState(1234)
#     triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,statistics=load_Train(triple_path+'freebase-FB5M2M-combined.txt', line_no, triple_path)
    train_h2t=statistics[0]
    train_t2h=statistics[1]
    train_r2t=statistics[2]
    train_r2h=statistics[3]
    train_r_replace_tail_prop=statistics[4]
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)

       


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
#     para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size), para_to_load)  #+'_hits10_63.616'
#     GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2]
#     GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2]
#     GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2]

#     w2v_entity_rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#    
#     w2v_relation_rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
# 
#     w2v_entity_rand_values=load_word2vec_to_init(w2v_entity_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_entityEmb50.txt')
#     w2v_relation_rand_values=load_word2vec_to_init(w2v_relation_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_relationEmb50.txt')
#     w2v_entity_rand_values=theano.shared(value=w2v_entity_rand_values, borrow=True)       
#     w2v_relation_rand_values=theano.shared(value=w2v_relation_rand_values, borrow=True)  
      
#     entity_E_ensemble=entity_E+norm_matrix(w2v_entity_rand_values)
#     relation_E_ensemble=relation_E+norm_matrix(w2v_relation_rand_values)
    
    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
    
        
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)

#     batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
#     batch_start=T.cast(batch_start, 'int64')   
    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    n_index_T = T.ltensor3('n_index_T')
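    # n_index_T is a 3-D int64 tensor of negative triples; the training loop
    # below feeds it an array of shape (neg_size, batch_size, 3), while
    # x_index_l receives the corresponding (batch_size, 3) positive triples.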
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    
    dist_tail=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
    
    
    loss__tail_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
    loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) 
#     loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is)     
#     loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is)    
    
    
#     loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is)   
#     binary_matrix_test=T.gt(loss_tail_i_test, 0)
#     sum_vector_test=T.sum(binary_matrix_test, axis=1)
#     binary_vector_hits10=T.gt(sum_vector_test, 10)
#     test_loss=T.sum(binary_vector_hits10)*1.0/batch_size  
#     loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) 
#     loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is)     
#     def neg_slice(neg_matrix):
#         dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
#         loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) 
#         loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) 
#         loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) 
#         return loss_tail_i, loss_relation_i, loss_head_i
#     
#     (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan(
#        neg_slice,
#        sequences=n_index_T,
#        outputs_info=None)  
    
    loss_tails=T.mean(T.sum(loss_tail_i, axis=1) )
#     loss_relations=T.mean(T.sum(loss_relation_i, axis=1) )
#     loss_heads=T.mean(T.sum(loss_head_i, axis=1) )
    loss=loss_tails#+loss_relations+loss_heads
    L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\
                      +(GRU_U** 2).sum()+(GRU_W** 2).sum(), 'L2_reg')
#     Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\
#         Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2])
    cost=loss+L2_weight*L2_loss#+div_reg*Div_loss
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     params_conv = [conv_W, conv_b]
    params_to_store=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9)))   #AdaGrad
        updates.append((acc_i, acc))    
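    # AdaGrad: each parameter keeps a running sum of squared gradients in
    # `accumulator`, and its effective step size becomes
    # learning_rate / sqrt(accumulated_squared_grads + 1e-9), so frequently
    # updated parameters take progressively smaller steps.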

   

#     grads = T.grad(cost, params)
#     updates = []
#     for param_i, grad_i in zip(params, grads):
#         updates.append((param_i, param_i - learning_rate * grad_i))   #AdaGrad 
        
    train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore')
#     test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore')
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
#     corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    best_train_loss=1000000
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
#         learning_rate/=epoch
#         print 'lr:', learning_rate
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        loss_sum=0.0
        for start in batch_start:
            if start%100000==0:
                print start, '...'
            pos_triples=triples[start:start+batch_size]
            all_negs=[]
#             count=0
            for pos_triple in pos_triples:
                neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size)
# #                 print 'neg_head_triples'
#                 neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3)
# #                 print 'neg_relation_triples'
#                 neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3)
#                 print 'neg_tail_triples'
                all_negs.append(neg_triples)
#                 print 'neg..', count
#                 count+=1
            neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2)
            loss, cost= train_model(pos_triples, neg_tensor)
            loss_sum+=loss
        loss_sum/=len(batch_start)
        print 'Training loss:', loss_sum, 'cost:', cost
        
        
#         loss_test=0.0
# 
#         for test_start in batch_start_test:
#             pos_triples=test_triples[test_start:test_start+batch_size]
#             all_negs=[]
#             for pos_triple in pos_triples:
#                 neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True)
#                 all_negs.append(neg_triples)
#                 
#             neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2)
#             loss_test+= test_model(pos_triples, neg_tensor)
#             
#             
#         loss_test/=n_batchs_test
#         print '\t\t\tUpdating epoch', epoch, 'finished! Test hits10:', 1.0-loss_test
        if loss_sum< best_train_loss:
            store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store)
#             store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store)
            best_train_loss=loss_sum
            print 'Finished storing best params'
#             exit(0)
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #17
    logger = logging.getLogger(__name__)

    configuration = getattr(configurations, args.proto)()
    # added by Zhaopeng Tu, 2016-05-12
    if args.state:
        configuration.update(eval(open(args.state).read()))
    logger.info("\nModel options:\n{}".format(pprint.pformat(configuration)))

    src = T.lmatrix()
    src_mask = T.matrix()
    trg = T.lmatrix()
    trg_mask = T.matrix()

    # added by Longyue
    src_hist = T.ltensor3()
    src_hist_mask = T.tensor3()
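    # src_hist / src_hist_mask are the extra history inputs: a 3-D int64
    # tensor of word indices (presumably the preceding source sentences) and
    # its floating-point mask; the exact axis order is defined by
    # EncoderDecoder.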


    # added by Zhaopeng Tu, 2016-07-13
    # for fast training of new parameters
    ite = T.fscalar()

    rng = numpy.random.RandomState(1234)

    enc_dec = EncoderDecoder(rng, **configuration)
    # modified by Zhaopeng Tu, 2016-07-13
    # for fast training of new parameters
    # enc_dec.build_trainer(src, src_mask, trg, trg_mask)
    enc_dec.build_trainer(src, src_mask, src_hist, src_hist_mask, trg, trg_mask, ite)
    enc_dec.build_sampler()
Example #18
 def make_node(self, groundtruth, recognized):
     recognized = tensor.as_tensor_variable(recognized)
     groundtruth = tensor.as_tensor_variable(groundtruth)
     return theano.Apply(
         self, [groundtruth, recognized],
         [tensor.ltensor3(), tensor.ltensor3()])
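     # make_node only declares the Op's interface here: tensor.ltensor3()
     # creates a TensorType variable with dtype 'int64' and three
     # non-broadcastable dimensions, so this Op promises two 3-D integer
     # outputs whose values are filled in by its perform() implementation
     # (not shown in this snippet).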
Example #19
def train(cfig, epochs, language, model_alias, models_name):
    sentence = T.lmatrix()
    sentence_mask = T.matrix()
    sentence_morph = T.ltensor3()
    sentence_morph_mask = T.tensor3()

    use_noise = T.iscalar()
    lm = rnnlm_quick(**cfig)
    if model_alias == 'lstm2layer':
        use_maxout = True
        getattr(lm, models_name['lstm2layer'])(sentence, sentence_mask, use_noise, use_maxout)
    elif model_alias == 'rnnlm':
        use_maxout = True
        getattr(lm, models_name['rnnlm'])(sentence, sentence_mask, use_noise, use_maxout)
    else:
        getattr(lm, models_name[model_alias])(sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise)

    cost_sum = lm.cost
    cost_mean = lm.cost/sentence.shape[1]

    params = lm.params
    regular = lm.L1 * 1e-5 + lm.L2 * 1e-5
    grads = T.grad(cost_mean, params)

    hard_clipping = cfig['hard_clipping']
    soft_clipping = T.fscalar()
    skip_nan_batch = 0
    grads, nan_num, inf_num = step_clipping(params, grads, soft_clipping, cfig['shrink_scale_after_skip_nan_grad'], cfig['skip_nan_grad'])

    updates = adadelta(params, grads, hard_clipping)
    vs , vs_morph , vs_morph_mask = DStream(datatype='valid', config=cfig)
    ts , ts_morph , ts_morph_mask = DStream(datatype='test', config=cfig)

    fn = theano.function([sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise, soft_clipping],
                         [cost_mean, nan_num, inf_num], updates=updates , on_unused_input='ignore')
    test_fn = theano.function([sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise],
                              [cost_sum] , on_unused_input='ignore')

    start_time = datetime.now()
    start_cpu_time = time.clock()
    cur_time = start_time
    cur_cpu_time = start_cpu_time
    print ('training start at {}'.format(start_time))

    valid_errs = []
    test_errs = []
    time_his, time_cpu_his = [], []
    patience = 200
    bad_counter = 0

    for epoch in range(epochs):
        ds , ds_morph , ds_morph_mask = DStream(datatype='train', config=cfig)
        for data_tuple , data_morph , mask_morph in zip(ds.get_epoch_iterator() , ds_morph , ds_morph_mask):
            data , mask = data_tuple
            #print data.shape , data_morph.shape , mask_morph.shape
            if cfig['drop_last_batch_if_small'] and (0.0 + len(data)) / cfig['batch_size'] < 0.95:
                #logger.info('drop batch with: {}/{} ratio'.format(len(data), cfig['batch_size']))
                pass # FIXME any idea to identify the last batch?
            else:
                cur_clip = soft_clipping_curve(epoch, cfig['soft_clipping_epoch'], cfig['soft_clipping_begin'], cfig['soft_clipping_end'])
                cur_batch_time = datetime.now()
                cur_batch_cpu_time = time.clock()
                data_morph = data_morph.transpose((1 , 0 , 2))
                mask_morph = mask_morph.transpose((1 , 0 , 2))
                c, grad_nan_num, grad_inf_num = fn(data.T, mask.T, data_morph , mask_morph , 1, cur_clip)
                batch_elasped_seconds = (datetime.now() - cur_batch_time).total_seconds()
                batch_elasped_cpu_seconds = (time.clock() - cur_batch_cpu_time)
                #print data.shape , data_morph.shape , mask_morph.shape
                logger.info('grad nan/inf num: {} {} at epoch {} cost {},{}'.format(grad_nan_num, grad_inf_num, epoch, batch_elasped_seconds, batch_elasped_cpu_seconds))
        valid_err = test(test_fn, vs , vs_morph , vs_morph_mask)
        test_err = test(test_fn, ts , ts_morph , ts_morph_mask)
        valid_errs.append(valid_err)
        test_errs.append(test_err)
        if valid_err <= numpy.array(valid_errs).min():
            bad_counter = 0
        if len(valid_errs) > patience and valid_err >= \
            numpy.array(valid_errs)[:-patience].min():
            bad_counter += 1
        valid_min = numpy.min(valid_errs)
        valid_min_idx = numpy.argmin(valid_errs)
        valid_min_test = test_errs[valid_min_idx]
        pre_time, pre_cpu_time = cur_time, cur_cpu_time
        cur_time, cur_cpu_time= datetime.now(), time.clock()
        elasped_minutes = (cur_time - start_time).total_seconds() / 60.
        elasped_cpu_minutes = (cur_cpu_time - start_cpu_time) / 60.
        batch_elasped_seconds = (cur_time - pre_time).total_seconds()
        batch_elasped_cpu_seconds = (cur_cpu_time - pre_cpu_time)
        print ('{:>3} epoch {:>2} bad t/v {:>5.2f} {:>5.2f} itr:{:>3} min_itr:{:>3} t/vbest:{:>5.2f} {:>5.2f} batch {:>4.0f}s, all {:>5.1f}m, cpu {:>4.0f}s, all{:>5.1f}m nan {} inf {}'.\
            format(epoch, bad_counter, test_err, valid_err, len(valid_errs)-1, valid_min_idx, valid_min_test, valid_min, batch_elasped_seconds, elasped_minutes, batch_elasped_cpu_seconds, elasped_cpu_minutes, grad_nan_num, grad_inf_num));
        time_his.append(batch_elasped_seconds)
        time_cpu_his.append(batch_elasped_cpu_seconds)
        sys.stdout.flush()
        if bad_counter > patience:
            print "Early Stop! outter loop"
            break
    valid_min = numpy.min(valid_errs)
    valid_min_idx = numpy.argmin(valid_errs)
    valid_min_test = test_errs[valid_min_idx]
    test_min = numpy.min(test_errs)
    test_min_idx = numpy.argmin(test_errs)
    cur_cfig = str(valid_min_idx)  + ':' + str(valid_min)
    cur_cfig += ',' + str(valid_min_test) + ','
    cur_cfig += str(test_min_idx)  + ':' + str(test_min)
    final_res = '#== epoch:valid_min,valid_min_test,epoch:test_min: ' + cur_cfig
    print final_res
    end_time = datetime.now()
    use_time = (end_time - start_time).total_seconds()
    end_cpu_time = time.clock()
    use_cpu_time = (end_cpu_time - start_cpu_time)
    print 'training cost time {}s'.format(use_time)
    print 'training cost cpu time {}s'.format(use_cpu_time)
    print 'epoch cost mean time {}'.format(numpy.mean(time_his))
    print 'epoch cost mean cpu time {}'.format(numpy.mean(time_cpu_his))
    vals = [ "#"*100,str(datetime.now()),language,model_alias,final_res,"#"*100]
    print '\n'.join(vals)
    result = [numpy.mean(time_his), numpy.mean(time_cpu_his), use_time, use_cpu_time, valid_min, valid_min_test, test_min]
    return result
Example #20
    def build_sampler(self):

        # added by Longyue
        x_hist = T.ltensor3()
        x_hist_mask = T.tensor3()
        annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask)
        annotations_1 = annotations_1[-1]
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None, annotations_3)
        #init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        # added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c, annotations_3]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        print 'Building compile_init_state_and_context function ...'
        self.compile_init_and_context = theano.function(
            [x, x_hist, x_hist_mask], outs, name='compile_init_and_context')
        print 'Done'

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        # for with_attention=False
        if self.with_attention and self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print 'Building compile_fertility ...'
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print 'Done'
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        # [next_state, ctxs] = self.decoder.apply(state_below=trg_emb,
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            # added by Zhaopeng Tu, 2016-06-09
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            hist=annotations_3,  # added by Longyue
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print 'Building compile_next_state_and_probs function ...'
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)

        # added by Longyue
        inps.append(annotations_3)

        outs = [next_probs, next_state, next_sample]
        # added by Zhaopeng Tu, 2016-06-09
        if self.with_attention:
            outs.append(alignment)
            # added by Zhaopeng Tu, 2016-04-29
            if self.with_coverage:
                inps.append(cov_before)
                if self.coverage_type == 'linguistic':
                    inps.append(fertility)
                outs.append(cov)

        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs')
        print 'Done'

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            print 'Building compile_inverse_init_state_and_context function ...'
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print 'Done'

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                # added by Zhaopeng Tu, 2016-06-09
                init_context=None
                if self.with_attention else inverse_init_context,
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs, inverse_alignment = inverse_results[
                    1], inverse_results[2]
            else:
                # if with_attention=False, we always use init_context as the source representation
                inverse_ctxs = init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print 'Building compile_inverse_next_state_and_probs function ...'
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample
            ]
            # added by Zhaopeng Tu, 2016-06-09
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print 'Done'
Example #22
def train_extractive_qa(new_training_job, config, save_path, params,
                        fast_start, fuel_server, seed):
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    root_path = os.path.join(save_path, 'training_state')
    extension = '.tar'
    tar_path = root_path + extension
    best_tar_path = root_path + '_best' + extension

    c = config
    data, qam = initialize_data_and_model(c)

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', shuffle=True, batch_size=4,
                            max_length=5).get_epoch_iterator(as_dict=True))
        for var in qam.input_vars.values():
            var.tag.test_value = test_value_data[var.name]

    costs = qam.apply_with_default_vars()
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(qam.contexts.shape[1], 'length')
    batch_size = rename(qam.contexts.shape[0], 'batch_size')
    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    exact_match, = VariableFilter(name='exact_match')(cg)
    exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio')
    context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg)
    monitored_vars = [
        length, batch_size, cost, exact_match_ratio, context_unk_ratio
    ]
    if c['dict_path']:
        def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg)
        num_definitions = rename(qam.input_vars['defs'].shape[0],
                                 'num_definitions')
        max_definition_length = rename(qam.input_vars['defs'].shape[1],
                                       'max_definition_length')
        monitored_vars.extend(
            [def_unk_ratio, num_definitions, max_definition_length])
        if c['def_word_gating'] == 'self_attention':
            def_gates = VariableFilter(name='def_gates')(cg)
            def_gates_min = tensor.minimum(*[x.min() for x in def_gates])
            def_gates_max = tensor.maximum(*[x.max() for x in def_gates])
            monitored_vars.extend([
                rename(def_gates_min, 'def_gates_min'),
                rename(def_gates_max, 'def_gates_max')
            ])
    text_match_ratio = TextMatchRatio(data_path=os.path.join(
        fuel.config.data_path[0], 'squad/dev-v1.1.json'),
                                      requires=[
                                          predicted_begins, predicted_ends,
                                          tensor.ltensor3('contexts_text'),
                                          tensor.lmatrix('q_ids')
                                      ],
                                      name='text_match_ratio')

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude  word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters if not p == qam.embeddings_var()
        ]
    if c['train_only_def_part']:
        def_reading_parameters = qam.def_reading_parameters()
        trained_parameters = [
            p for p in trained_parameters if p in def_reading_parameters
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    # apply dropout to the training cost and to all the variables
    # that we monitor during training
    train_cost = cost
    train_monitored_vars = list(monitored_vars)
    if c['dropout']:
        regularized_cg = ComputationGraph([cost] + train_monitored_vars)
        # Dima: the dropout that I implemented first
        bidir_outputs, = VariableFilter(bricks=[Bidirectional],
                                        roles=[OUTPUT])(cg)
        readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        dropout_vars = [bidir_outputs] + readout_layers
        logger.debug("applying dropout to {}".format(", ".join(
            [v.name for v in dropout_vars])))
        regularized_cg = apply_dropout(regularized_cg, dropout_vars,
                                       c['dropout'])
        # a new dropout with exactly same mask at different steps
        emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg)
        emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout'])
        if c['emb_dropout_type'] == 'same_mask':
            regularized_cg = apply_dropout2(regularized_cg,
                                            emb_vars,
                                            c['emb_dropout'],
                                            dropout_mask=emb_dropout_mask)
        elif c['emb_dropout_type'] == 'regular':
            regularized_cg = apply_dropout(regularized_cg, emb_vars,
                                           c['emb_dropout'])
        else:
            raise ValueError("unknown dropout type {}".format(
                c['emb_dropout_type']))
        train_cost = regularized_cg.outputs[0]
        train_monitored_vars = regularized_cg.outputs[1:]

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=train_cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)
    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      shuffle=True,
                                      max_length=c['max_length'])
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    extensions = [
        LoadNoUnpickling(tar_path, load_iteration_state=True,
                         load_log=True).set_conditions(
                             before_training=not new_training_job),
        StartFuelServer(original_training_stream,
                        os.path.join(save_path, 'stream.pkl'),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train']),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ]
    validation = DataStreamMonitoring(
        [text_match_ratio] + monitored_vars,
        data.get_stream('dev',
                        batch_size=c['batch_size_valid'],
                        raw_text=True,
                        q_ids=True),
        prefix="dev").set_conditions(before_training=not fast_start,
                                     after_epoch=True)
    dump_predictions = DumpPredictions(save_path,
                                       text_match_ratio,
                                       before_training=not fast_start,
                                       after_epoch=True)
    track_the_best_exact = TrackTheBest(
        validation.record_name(exact_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    track_the_best_text = TrackTheBest(
        validation.record_name(text_match_ratio),
        choose_best=max).set_conditions(before_training=True, after_epoch=True)
    extensions.extend([
        validation, dump_predictions, track_the_best_exact, track_the_best_text
    ])
    # We often use pretrained word embeddings and we don't want
    # to load and save them every time. To avoid that, we use
    # save_main_loop=False, we only save the trained parameters,
    # and we save the log and the iterations state separately
    # in the tar file.
    extensions.extend([
        Checkpoint(tar_path,
                   parameters=trained_parameters,
                   save_main_loop=False,
                   save_separately=['log', 'iteration_state'],
                   before_training=not fast_start,
                   every_n_epochs=c['save_freq_epochs'],
                   every_n_batches=c['save_freq_batches'],
                   after_training=not fast_start).add_condition(
                       ['after_batch', 'after_epoch'],
                       OnLogRecord(track_the_best_text.notification_name),
                       (best_tar_path, )),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        RetrievalPrintStats(retrieval=data._retrieval,
                            every_n_batches=c['mon_freq_train'],
                            before_training=not fast_start),
        Printing(after_epoch=True, every_n_batches=c['mon_freq_train']),
        FinishAfter(after_n_batches=c['n_batches'],
                    after_n_epochs=c['n_epochs']),
        Annealing(c['annealing_learning_rate'],
                  after_n_epochs=c['annealing_start_epoch']),
        LoadNoUnpickling(best_tar_path,
                         after_n_epochs=c['annealing_start_epoch'])
    ])

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
Example #23
def build_mlp(args, netid, input_var=None, mask_inputs=False):
    """Build MLP model"""
    # pylint: disable=bad-continuation
    # This creates an MLP of two hidden layers of 200 units each, followed by
    # a softmax output layer of 10 units. It applies dropout to the input
    # data and to the hidden layers, with rates taken from `args`.

    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and
    # linking it to the given Theano variable `input_var`, if any:

    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
                                     input_var=input_var,
                                     name="%d_%s" % (netid, "l_in"))

    mask_in = None
    if mask_inputs:
        mask_in = T.ltensor3()
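        # T.ltensor3() declares an unnamed 3-D int64 symbolic variable that
        # the custom dropout.DropoutLayer below uses as an externally
        # supplied dropout mask.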
    # Apply dropout to the input data (rate args.input_dropout_rate):
    l_in_drop = dropout.DropoutLayer(l_in,
                                     mask=mask_in,
                                     p=args.input_dropout_rate,
                                     name="%d_%s" % (netid, "l_in_drop"))

    # Add a fully-connected layer of 200 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
        l_in_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform(),
        name="%d_%s" % (netid, "l_hid1"))

    # We'll now add dropout (rate args.dropout_rate):
    mask_hid1 = None
    if mask_inputs:
        mask_hid1 = T.lvector()
    l_hid1_drop = dropout.DropoutLayer(l_hid1,
                                       mask=mask_hid1,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid1_drop"))

    # Another 200-unit layer:
    l_hid2 = lasagne.layers.DenseLayer(
        l_hid1_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        name="%d_%s" % (netid, "l_hid2"))

    # Dropout again:
    mask_hid2 = None
    if mask_inputs:
        mask_hid2 = T.lvector()
    l_hid2_drop = dropout.DropoutLayer(l_hid2,
                                       mask=mask_hid2,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid2_drop"))

    # Finally, we'll add the fully-connected output layer, of 10 softmax units:
    l_out = lasagne.layers.DenseLayer(
        l_hid2_drop,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        name="%d_%s" % (netid, "l_out"))

    masks = [mask_in, mask_hid1, mask_hid2]
    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out, masks
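# A hypothetical usage sketch: it assumes an `args` namespace providing
# `input_dropout_rate` and `dropout_rate`, plus the custom `dropout` module
# used above.
#
#     input_var = T.tensor4('inputs')
#     network, masks = build_mlp(args, netid=0, input_var=input_var,
#                                mask_inputs=True)
#     prediction = lasagne.layers.get_output(network, deterministic=True)
#     predict_fn = theano.function([input_var], prediction)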
Example #24
def generate_embeddings(config,
                        tar_path,
                        part,
                        dest_path,
                        format_,
                        average=False,
                        encoder_embeddings=None,
                        **kwargs):
    """
    generate embeddings for all the definitions, average them and serialize OR
    if encoder_embeddings, serialize the models' encoder embeddings

    config: name of the config of the model
    tar_path: tar path of the model parameters
    part: part of the dataset (should be either 'train', 'valid', 'test' or 'all')
    dest_path: directory where the serialized embeddings will be written
    format_: either 'dict' or 'glove'
    encoder_embeddings: None, 'only', 'mixed', 'if_missing'
      - None: don't include encoder embeddings
      - 'only': don't read any data, just serialize the encoder embeddings
      - 'mixed': add the encoder embeddings to the list of definition embeddings
      - 'if_missing': add the encoder embeddings when there is no corresponding def
    average: if true, multi-prototype embeddings will be averaged
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    c = config
    data, model = initialize_data_and_model(c, train_phase=False)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')
    keys = T.lmatrix('keys')
    n_identical_keys = T.lvector('n_identical_keys')
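    # symbolic inputs: `words` holds batches of definition word ids as a 3-D
    # int64 tensor, `words_mask` is its floating-point mask, `keys` is an
    # int64 matrix (each row is converted back to text with vec2str in the
    # loop below) and `n_identical_keys` is an int64 vector.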
    sym_args = [words, words_mask]

    if format_ not in ['dict', 'glove']:
        raise ValueError("format should be either: dict, glove")

    if not c['encoder'] and encoder_embeddings != 'only':
        raise ValueError('Error: this model does not have an encoder.')

    if use_keys(c):
        sym_args.append(keys)
    if use_n_identical_keys(c):
        sym_args.append(n_identical_keys)

    costs = model.apply(*sym_args, train_phase=False)

    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    if encoder_embeddings:
        if encoder_embeddings == 'only' and not c['encoder']:
            embeddings_array = model.get_def_embeddings_params('key').eval()
        else:
            embeddings_array = model.get_def_embeddings_params('main').eval()
        entries = model.get_embeddings_entries()
        enc_embeddings = {
            e: np.asarray(a)
            for e, a in zip(entries, embeddings_array)
        }
        if encoder_embeddings == 'only':
            serialize_embeddings(enc_embeddings, format_, dest_path,
                                 "encoder_embeddings")
            return 0

    embeddings_var, = VariableFilter(name='embeddings')(cg)
    compute = dict({"embeddings": embeddings_var})
    if c['proximity_coef'] != 0:
        prox_var, = VariableFilter(name='proximity_term')(cg)
        compute["proximity_term"] = prox_var
    print "sym args", sym_args
    predict_f = theano.function(sym_args, compute)
    batch_size = 256  # size of test_unseen
    stream = data.get_stream(part,
                             batch_size=batch_size,
                             max_length=c['max_length'],
                             remove_keys=False,
                             remove_n_identical_keys=False)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    vocab = model._vocab
    print "start computing"
    embeddings = defaultdict(list)
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i % 10 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        keys = input_data['keys']
        n_identical_keys = input_data['n_identical_keys']
        args = [words, words_mask]
        if use_keys(c):
            args.append(keys)
        if use_n_identical_keys(c):
            args.append(n_identical_keys)

        to_save = predict_f(*args)
        for k, h in zip(keys, to_save['embeddings']):
            key = vec2str(k)
            if encoder_embeddings == 'if_missing':
                try:
                    del enc_embeddings[key]
                except KeyError:
                    pass
            embeddings[key].append(h)
        i += 1

    if encoder_embeddings in ['mixed', 'if_missing']:
        for k, e in enc_embeddings.iteritems():
            embeddings[k].append(e)

    if encoder_embeddings == 'mixed':
        prefix_fname = 'mix_e_'
    elif encoder_embeddings == 'if_missing':
        prefix_fname = 'if_mis_e_'
    else:
        prefix_fname = ''

    # combine:
    if average:
        mean_embeddings = {}
        for k in embeddings.keys():
            mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0)
        serialize_embeddings(mean_embeddings, format_, dest_path,
                             prefix_fname + "mean_embeddings")
    else:
        serialize_embeddings(embeddings, format_, dest_path,
                             prefix_fname + "embeddings")
Example #25
def main(mode, save_path, steps, time_budget, reset):

    num_states = ChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(save_path) and not reset:
            model = Pylearn2Model.load(save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: would be probably nicer to make cost part of the model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=save_path, save_freq=10)
        train.main_loop(time_budget=time_budget)
    elif mode == "sample":
        model = Pylearn2Model.load(save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
Example #26
def evaluate_lm(config, tar_path, part, num_examples, dest_path, **kwargs):
    c = config

    if part not in ['valid', 'test_unseen', 'test']:
        raise ValueError()

    data, lm, _ = initialize_data_and_model(c)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')

    costs = lm.apply(words, words_mask)
    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_sums = [p.tag.aggregation_scheme.denominator for p in perplexities]
    CEs = [p.tag.aggregation_scheme.numerator for p in perplexities]

    proba_out, = VariableFilter(name='proba_out')(cg)
    unk_ratios = VariableFilter(name_regex='unk_ratio.*')(cg)
    #num_definitions, = VariableFilter(name='num_definitions')(cg)
    print perplexities
    print CEs
    print mask_sums
    name_to_aggregate = [p.name for p in perplexities]
    for CE, mask_sum, name in zip(CEs, mask_sums, name_to_aggregate):
        CE.name = name + "_num"
        mask_sum.name = name + "_denom"

    compute_l = CEs + mask_sums + unk_ratios
    if part == 'test_unseen':
        compute_l.append(proba_out)

    compute = dict({p.name: p for p in compute_l})
    print "to compute:", compute.keys()
    predict_f = theano.function([words, words_mask], compute)

    if part == 'test_unseen':
        batch_size = 1
    else:
        batch_size = 128  # size of test_unseen
    stream = data.get_stream(part, batch_size=batch_size, max_length=100)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    print "start computing"
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i and i % 100 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        to_save = predict_f(words, words_mask)
        to_save.update(input_data)
        raw_data.append(to_save)
        i += 1

    # aggregate in the log space
    aggregated = Counter()
    sum_mask_track = Counter()
    for d in raw_data:
        coef = d['words_mask'].sum()  # over timesteps and batches
        for name in name_to_aggregate:
            aggregated[name] += d[name + "_num"]
            sum_mask_track[name] += d[name + "_denom"]

    for k, v in aggregated.iteritems():
        print "k, v, m:", k, v, sum_mask_track[k]
        aggregated[k] = np.exp(v / sum_mask_track[k])

    n_params = sum([np.prod(p.shape.eval()) for p in cg.parameters])
    aggregated['n_params'] = n_params
    print "aggregated stats:", aggregated
    print "# of parameters {}".format(n_params)

    # TODO: check that a different batch_size yields the same validation error
    # as the end-of-training validation error.
    # TODO: I think the Blocks aggregation is simply a mean, which could break
    # when we use masks; investigate.

    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    if part == 'test_unseen':
        np.savez(
            os.path.join(dest_path, "predictions"),
            words=input_data['words'],
            words_mask=input_data['words_mask'],
            #unk_ratio = to_save['unk_ratio'],
            #def_unk_ratio = to_save['def_unk_ratio'],
            proba_out=to_save['languagemodel_apply_proba_out'],
            vocab_in=lm._vocab.words[:c['num_input_words']],
            vocab_out=lm._vocab.words[:c['num_output_words']])

    json.dump(aggregated,
              open(os.path.join(dest_path, "aggregates.json"), "w"),
              sort_keys=True,
              indent=2)
Example #27
def train_model(new_training_job, config, save_path, params, fast_start,
                fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, model = initialize_data_and_model(config, train_phase=True)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    keys = tensor.lmatrix('keys')
    n_identical_keys = tensor.lvector('n_identical_keys')
    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        #TODO
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    if use_keys(c) and use_n_identical_keys(c):
        costs = model.apply(words,
                            words_mask,
                            keys,
                            n_identical_keys,
                            train_phase=True)
    elif use_keys(c):
        costs = model.apply(words, words_mask, keys, train_phase=True)
    else:
        costs = model.apply(words, words_mask, train_phase=True)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    monitored_vars = [length, cost, perplexity]
    if c['proximity_coef']:
        proximity_term, = VariableFilter(name='proximity_term')(cg)
        monitored_vars.append(proximity_term)

    print "inputs of the model:", cg.inputs

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        if c['freeze_pretrained']:
            logger.debug(
                "Exclude pretrained encoder embeddings from the trained parameters"
            )
            to_freeze = 'main'
        elif c['provide_targets']:
            logger.debug(
                "Exclude pretrained targets from the trained parameters")
            to_freeze = 'target'
        trained_parameters = [
            p for p in trained_parameters
            if not p == model.get_def_embeddings_params(to_freeze)
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == model.get_def_embeddings_params(to_freeze)
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream(
        'train',
        batch_size=c['batch_size'],
        max_length=c['max_length'],
        seed=stream_seed,
        remove_keys=not use_keys(c),
        remove_n_identical_keys=not use_n_identical_keys(c))
    print "trainin_stream will contains sources:", training_stream.sources

    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)
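        # This stream only declares sources and batch format; the batches
        # themselves are produced by the separate fuel-server process launched by
        # the StartFuelServer extension added further down.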

    validate = c['mon_freq_valid'] > 0

    if validate:
        valid_stream = data.get_stream(
            'valid',
            batch_size=c['batch_size_valid'],
            max_length=c['max_length'],
            seed=stream_seed,
            remove_keys=not use_keys(c),
            remove_n_identical_keys=not use_n_identical_keys(c))
        validation = DataStreamMonitoring(
            monitored_vars, valid_stream,
            prefix="valid").set_conditions(before_first_epoch=not fast_start,
                                           on_resumption=True,
                                           every_n_batches=c['mon_freq_valid'])
        track_the_best = TrackTheBest(validation.record_name(cost),
                                      choose_best=min).set_conditions(
                                          on_resumption=True,
                                          after_epoch=True,
                                          every_n_batches=c['mon_freq_valid'])
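        # TrackTheBest writes a notification into the log whenever the validation
        # cost improves; the checkpoint and the early-stopping extension below
        # both react to that notification.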

    # don't save the entire main loop, to avoid pickling everything
    if c['fast_checkpoint']:
        cp_path = state_path
        load = (LoadNoUnpickling(cp_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

    else:
        cp_path = main_loop_path
        load = (Load(cp_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }
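    # The fast branch saves only the log, the iteration state and `saved_parameters`
    # (so excluded pretrained embeddings stay out of the tar); the default branch
    # additionally pickles the whole main loop.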

    checkpoint = Checkpoint(cp_path,
                            before_training=not fast_start,
                            every_n_batches=c['save_freq_batches'],
                            after_training=not fast_start,
                            **cp_args)

    if c['checkpoint_every_n_batches'] > 0 or c[
            'checkpoint_every_n_epochs'] > 0:
        intermediate_cp = IntermediateCheckpoint(
            cp_path,
            every_n_epochs=c['checkpoint_every_n_epochs'],
            every_n_batches=c['checkpoint_every_n_batches'],
            after_training=False,
            **cp_args)

    if validate:
        checkpoint = checkpoint.add_condition(
            ['after_batch', 'after_epoch'],
            OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ])
    if validate:
        extensions.extend([validation, track_the_best])

    extensions.append(checkpoint)
    if c['checkpoint_every_n_batches'] > 0 or c[
            'checkpoint_every_n_epochs'] > 0:
        extensions.append(intermediate_cp)
    extensions.extend(
        [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])])

    if validate and c['n_valid_early'] > 0:
        extensions.append(
            FinishIfNoImprovementAfter(track_the_best.notification_name,
                                       iterations=c['n_valid_early'] *
                                       c['mon_freq_valid'],
                                       every_n_batches=c['mon_freq_valid']))
    extensions.append(FinishAfter(after_n_epochs=c['n_epochs']))

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
示例#28
0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    #mode = 'TRAIN_DATA'
    #mode = 'TRAIN_NO_OVERLAP'
    #if len(sys.argv) > 1:
    #    mode = sys.argv[1]
    #    if not mode in ['TRAIN', 'TRAIN-ALL']:
    #        print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
    #        sys.exit(1)

    mode = 'k_time_data1'.upper()

    print "Running training in the {} setting".format(mode)

    position_num = 10
    select_model = "PSCM"
    if select_model == "PSCM":
        click_model_index = 4  # PSCM
    elif select_model == "UBM":
        click_model_index = 1
    else:
        raise ValueError("MODEL SELECT ERROR!")
    data_dir = mode

    add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy'))
    q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
    a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
    y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy'))
    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy'))
    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    #multi dim


    #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0]
    #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0]
    #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0]

    #y_train = y_train_tmp
    #y_dev = y_dev_tmp
    #y_test = y_test_tmp

    max_query_id = numpy.max([numpy.max(add_train[:, 0]), numpy.max(add_test[:, 0]), numpy.max(add_dev[:, 0])])
    max_url_id = numpy.max([numpy.max(add_train[:, 1:]), numpy.max(add_test[:, 1:]), numpy.max(add_dev[:, 1:])])

    print 'max_query_id', max_query_id
    print 'max_url_id', max_url_id

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[2]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    #ndim = 5
    #print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    #dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #print "Gaussian"
    #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    #x_q_overlap = T.lmatrix('q_overlap')
    #x_a = T.lmatrix('a')
    x_a_all = T.ltensor3('a_all')
    #x_a_overlap = T.lmatrix('a_overlap')
    #y = T.ivector('y')
    y = T.imatrix('y')
    add_info = T.dmatrix('add_info')
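    # `x_a_all` stacks the token-id matrices of all position_num answers into one
    # 3-D tensor; the (batch, position, tokens) axis order is inferred from the
    # x_a_all[:, i, :] slicing below. `add_info` holds the rows of *.additions.npy
    # (query id in column 0, url ids in the remaining columns, per the
    # max_query_id / max_url_id computation above).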

    #######
    n_outs = 2

    n_epochs = 15
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
    lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
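    # The lookup table maps word ids to the pre-trained word2vec vectors
    # (presumably kept fixed, per the *FastStatic name) and pads each side by
    # filter_width - 1 rows, matching the widened input_shape below.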

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()
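    # Each filter width gets its own conv -> tanh -> k-max-pooling branch; the
    # branches run side by side under ParallelLayer and FlattenLayer turns the
    # pooled maps into the question representation vector.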

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    #nnet_q.set_input((x_q, x_q_overlap))
    nnet_q.set_input([x_q])
    ######


    ###### ANSWER ######
    nnet_a_list = []
    #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    for i in xrange(position_num):
        #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
        #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

        # num_input_channels = len(lookup_table.layers)
        #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        conv_layers = []
        for filter_width in a_filter_widths:
            filter_shape = (nkernels, num_input_channels, filter_width, ndim)
            conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape)
            non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation)
            pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
            conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[conv, non_linearity, pooling])
            conv_layers.append(conv2dNonLinearMaxPool)

        join_layer = nn_layers.ParallelLayer(layers=conv_layers)
        flatten_layer = nn_layers.FlattenLayer()

        nnet_a = nn_layers.FeedForwardNet(layers=[
            lookup_table,
            join_layer,
            flatten_layer,
        ])
        #nnet_a.set_input((x_a, x_a_overlap))
        nnet_a.set_input([x_a_all[:, i, :]])
        nnet_a_list.append(nnet_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    print "q_logistic_n_in, ", q_logistic_n_in
    print "a_logistic_n_in, ", a_logistic_n_in

    #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_out_list = [nnet_q.output]
    for i in xrange(position_num):
        pairwise_out_list.append(nnet_a_list[i].output)
    pairwise_layer.set_input(pairwise_out_list)
    #pairwise_layer.set_input((nnet_q.output, nnet_a.output))
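    # The pairwise layer receives the question vector plus one vector per answer
    # position; presumably it adds question/answer similarity features on top
    # (see nn_layers.PositionOnlySimPairwiseNoFeatsLayer for the exact definition).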

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num
    #n_in = 1 * position_num + position_num * (position_num - 1) / 2
    n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (position_num - 1) / 2
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    #classifier.set_input(hidden_layer.output)

    classifier = nn_layers.FeatureClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num,click_model_index=click_model_index)
    #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num)
    #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num)
    #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs)
    classifier.set_input([hidden_layer.output, add_info])
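    # The classifier combines the hidden representation with the query/url ids in
    # add_info; which click-model parameterisation is used (PSCM vs UBM) is chosen
    # through click_model_index and implemented inside nn_layers.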

    #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
    #                                      name="Training nnet")
    train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q] + nnet_a_list + [pairwise_layer, hidden_layer, classifier], name="Training nnet")
    test_nnet = train_nnet
    #######

    #print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(select_model,mode, ndim, batch_size, max_norm,learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

    #total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    #print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred

    #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2]
    predictions_prob = test_nnet.layers[-1].p_y_given_x

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    #batch_x_a = T.lmatrix('batch_x_a')
    batch_x_a_all = T.ltensor3('batch_x_a_all')
    #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    #batch_y = T.ivector('batch_y')
    batch_y = T.imatrix('batch_y')
    batch_add_info = T.dmatrix('batch_add_info')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm,
                                               word_vec_name='W_emb')
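    # AdaDelta updates over all trainable params; `word_vec_name` presumably lets
    # sgd_trainer treat the embedding matrix W_emb specially (e.g. when applying
    # the max-norm constraint); see sgd_trainer for the exact behaviour.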

    inputs_pred = [batch_x_q,
                   batch_x_a_all,
                   batch_add_info,
                   #batch_x_q_overlap,
                   #batch_x_a_overlap,
                   # batch_x,
    ]

    givens_pred = {x_q: batch_x_q,
                   x_a_all: batch_x_a_all,
                   add_info: batch_add_info,
                   #x_q_overlap: batch_x_q_overlap,
                   #x_a_overlap: batch_x_a_overlap,
                   # x: batch_x
    }

    inputs_train = [batch_x_q,
                    batch_x_a_all,
                    #batch_x_q_overlap,
                    #batch_x_a_overlap,
                    # batch_x,
                    batch_add_info,
                    batch_y,
    ]

    givens_train = {x_q: batch_x_q,
                    x_a_all: batch_x_a_all,
                    #x_q_overlap: batch_x_q_overlap,
                    #x_a_overlap: batch_x_a_overlap,
                    # x: batch_x,
                    add_info: batch_add_info,
                    y: batch_y}

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_add_info) for
                              batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_add_info) for
                              batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train, add_train, y_train],batch_size=batch_size, randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[q_dev, a_dev, add_dev, y_dev], batch_size=batch_size, randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[q_test, a_test, add_test, y_test], batch_size=batch_size, randomize=False)

    labels = sorted(numpy.unique(y_test[:, -1]))
    print 'labels', labels

    def perplexity_score(labels, preds):
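        # Standard click-model perplexity: for every rank i,
        #   perp_i = 2 ** ( -(1/N_i) * sum_j log2 p_j(c_ij) ),
        # where p_j(c_ij) is the probability the model assigns to the observed
        # click (c=1) or skip (c=0) at rank i of session j. The overall score
        # averages perp_i over ranks; the ClickSkip variants condition on the
        # observed event.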
        positionPerplexity = [0.0] * position_num
        positionPerplexityClickSkip = [[0.0, 0.0] for i in xrange(position_num)]
        counts = [0] * position_num
        countsClickSkip = [[0, 0] for i in xrange(position_num)]
        for label, pred in zip(labels, preds):
            for i in range(0, len(label)):
                click = 1 if label[i] else 0
                tmp_pred = max(min(pred[i], 0.99999), 0.00001)
                logProb = math.log(tmp_pred, 2)
                if click == 0:
                    logProb = math.log(1 - tmp_pred, 2)
                positionPerplexity[i] += logProb
                positionPerplexityClickSkip[i][click] += logProb
                counts[i] += 1
                countsClickSkip[i][click] += 1
        positionPerplexity = [2 ** (-x / count if count else x) for (x, count) in zip(positionPerplexity, counts)]
        positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \
                for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)]
        perplexity = sum(positionPerplexity) / len(positionPerplexity)
        ret_str = "---------\n"
        ret_str += "Perplexity\t" + str(perplexity) + "\n"
        ret_str += "positionPerplexity"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexity[i])
        ret_str += "\n"

        ret_str += "positionPerplexitySkip"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[0][i])
        ret_str += "\n"

        ret_str += "positionPerplexityClick"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[1][i])
        ret_str += "\n------------\n"
        #print ret_str
        return perplexity, ret_str

    def map_score(qids, labels, preds):
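        # Mean Average Precision: candidates of each qid are sorted by predicted
        # score, precision is accumulated at every rank holding a relevant
        # (label > 0) item, and the per-query average precisions are averaged.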
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])
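        # Resets the last embedding row (assumed to be the padding/dummy word) to
        # zero; it is called after every gradient update in the training loop so
        # that padding never contributes to the convolutions.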


    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    best_dev_perp = numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, add, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator)
                #print "shape:"
                #print str(y_dev.shape)
                #print str(y_pred_dev.shape)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev[:, -1], y_pred_dev[:, -1]) * 100
                dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev)
                if dev_acc > best_dev_acc:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100
                    print('epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'.format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc

                if dev_perp < best_dev_perp:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100
                    test_perplexity, test_perplexity_str = perplexity_score(y_test, y_pred)
                    print('epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'.format(epoch, i, dev_acc, test_acc, best_dev_acc, dev_perp, best_dev_perp))
                    print str(test_perplexity_str)
                    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
                    best_inner = y_inner
                    no_best_dev_update = 0
                    best_dev_perp = dev_perp
        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        numpy.savetxt(os.path.join(nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'.format(epoch, i, best_dev_perp)), best_inner)
        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100
    test_perp, test_perp_str = perplexity_score(y_test, y_pred_test)
    print "FINAL ACCURACY"
    print str(test_acc)
    print "FINAL PERPLEXITY"
    print str(test_perp_str)
    fname = os.path.join(nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(epoch, i, best_dev_acc))
    numpy.savetxt(os.path.join(nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.format(epoch, i, best_dev_acc)), y_pred_test)
    numpy.savetxt(os.path.join(nnet_outdir, 'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'.format(epoch, i, best_dev_acc)), best_inner)
    cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)