def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = 'train'
    if len(sys.argv) > 1:
        mode = sys.argv[1]
        if not mode in ['TRAIN', 'TRAIN-ALL']:
            print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
            sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    # 加载数据集词向量
    if mode in ['TRAIN-ALL']:
        q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy'))
    else:
        q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    q_overlap_dev = numpy.load(
        os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    a_overlap_dev = numpy.load(
        os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    q_overlap_test = numpy.load(
        os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    a_overlap_test = numpy.load(
        os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x里放的是overlap feat
    x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    # print 'a_overlap_train',a_overlap_train.shape

    # print 'x_train',x_train.shape
    # print 'x_dev',x_dev.shape
    # print 'x_test',x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)
    numpy.random.RandomState()
    # 指定种子值(指定种子值是为了使同样的条件下每次产生的随机数一样,避免程序调试时由随机数不同而引起的问题)
    numpy_rng = numpy.random.RandomState(123)
    # question中最长的长度
    q_max_sent_size = q_train.shape[1]
    # answer中最长的长度
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    # numpy.max在不指定维度信息时,返回数组中的最大的一个值
    #QQQQQQ
    dummy_word_id = numpy.max(a_overlap_train)
    print "dummy_word_id:", dummy_word_id
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    # 从标准正态分布中生成维度为(a,b)的随机数组
    # 这一行看起来像是对未登录词的初始化
    # QQQQQ
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))

    # [-1]引用的是矩阵的最后一行
    # vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    # x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    # ndim = vocab_emb.shape[1]
    print "1st conv layer dim:", ndim

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    # feature map数目
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    # 首先获得词向量信息
    # QQQQ为什么要有这两层?似乎已经获得了词的词向量表示:可能是用于为是每个具体的句子获得词向量表示
    # QQQQQ pad具体实现
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) -
                                                   1)
    #QQQQQ这一层的用途?可能也是来获得具体的句子对的overlap向量
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    # 因为是文本数据所以是单通道
    num_input_channels = 1
    # QQQQQq_max_sent_size + 2 * (max(q_filter_widths) - 1) 这一项的含义:因为在lookup中都加了两倍的对应长度的pad
    # QQQQ以及最后一项为什么是ndim
    # Minibatch of feature map stacks, of shape(batch  size, stack size, nb row, nb  col) see the optional parameter image_shape
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    print "convlution layer input shape:", input_shape

    conv_layers = []
    # 对各个filter构造卷积层
    # QQQQ各层的w矩阵初始化方案有所不同?初始化可能有哪些方案以及各种方案的性能
    for filter_width in q_filter_widths:
        # 每一层卷积的构造
        #Set of filters used in CNN layer of shape (nb filters, stack size, nb row, nb col)
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        # 此处采用的是2D卷积
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_q.set_input((x_q, x_q_overlap))
    # nnet_q.set_input(x_q)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) -
                                                   1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
    # num_input_channels = len(lookup_table.layers)
    input_shape = (batch_size, num_input_channels,
                   a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    # QQQQ为啥这里有个flattenlayer
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    # QQQQ此处x_a_overlap的用处是?
    nnet_a.set_input((x_a, x_a_overlap))
    # nnet_a.set_input(x_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    # QQQQQ这里又是干嘛的
    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_q.set_input(nnet_q.output)
    # dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                   a_in=a_logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                         a_in=a_logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # 此处应该是进行句子匹配层
    #   pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    pairwise_layer = nn_layers.PairwiseNoFeatsLayer(
        q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
        a_in=a_logistic_n_in)
    pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # 此处n_in的取值要根据上一层匹配层的方案进行不同的计算
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    # dropout2
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer,dropout_q,dropout_a, classifier],
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={}_batch={}_max_norm={}_learning_rate={}_{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    # 将python对象序列化保存到本地的文件。
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    # 损失函数交叉熵
    cost = train_nnet.layers[-1].training_cost(y)
    ########################################
    # # QQQQQ这种方式好奇怪???看起来像cost的另外一种求法
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)
    ########################################################

    # 经过softmax后的最大值对应的类别
    predictions = test_nnet.layers[-1].y_pred
    # 经过softmax后的最大值
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    # ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg
    #
    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # 训练优化方案
    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    # batch_x是否注释,代表是否用overlap_feat特征用于训练
    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        y: batch_y
    }

    # 训练函数定义
    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)
    # 选择答案
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)
    # 每个选项的概率
    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_batch(batch_iterator):
        # numpy.hstack:Stack arrays in sequence horizontally (column wise).This is equivalent to concatenation along the second axis, except for 1-D arrays where it concatenates along the first axis
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x,_ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a,  batch_x) for
        #                       batch_x_q, batch_x_a,  batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap,
                         batch_x_a_overlap) for batch_x_q, batch_x_a,
            batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap,batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x,_ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]


# 三个迭代器

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, y_test],
        batch_size=batch_size,
        randomize=False)

    ####
    #   train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train, q_overlap_train,
    #                                                                                   a_overlap_train,x_train,y_train],
    #                                                                       batch_size=batch_size, randomize=True)
    #   dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                     [q_dev, a_dev, q_overlap_dev, a_overlap_dev,x_dev,y_dev],
    #                                                                     batch_size=batch_size, randomize=False)
    #   test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                      [q_test, a_test, q_overlap_test, a_overlap_test,x_test,
    #                                                                       y_test], batch_size=batch_size, randomize=False)
    #####
    #   train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train,  x_train, y_train],
    #                                                                       batch_size=batch_size, randomize=True)
    #   dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                     [q_dev, a_dev, x_dev,
    #                                                                      y_dev],
    #                                                                     batch_size=batch_size, randomize=False)
    #   test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                      [q_test, a_test, x_test,
    #                                                                       y_test], batch_size=batch_size, randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap,
                y) in enumerate(tqdm(train_set_iterator), 1):
            # for i, (x_q, x_a,  x, y) in enumerate(tqdm(train_set_iterator), 1):
            # train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x,y)
            # train_fn(x_q, x_a,  x, y)
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                # Compute Area Under the Receiver Operating Characteristic Curve(ROC AUC) from prediction scores.
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = numpy.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = numpy.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
# Example #2
# 0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = sys.argv[1]
    """
  if len(sys.argv) > 1:
    mode = sys.argv[1]
    if not mode in ['TRAIN', 'TRAIN-ALL']:
      print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
      sys.exit(1)
  """
    print "Running training in the {} setting".format(mode)

    data_dir = mode

    if mode in ['TRAIN-ALL']:
        q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy'))
    else:
        q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
        q_overlap_train = numpy.load(os.path.join(data_dir, 'train.q_sim.npy'))
        a_overlap_train = numpy.load(os.path.join(data_dir, 'train.a_sim.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_sim.npy'))
    a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_sim.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_sim.npy'))
    a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_sim.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_q.set_input((x_q, x_q_overlap))
    ######

    ###### ANSWER ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    # num_input_channels = len(lookup_table.layers)
    input_shape = (batch_size, num_input_channels,
                   a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_a.set_input((x_a, x_a_overlap))
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_q.set_input(nnet_q.output)
    # dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                   a_in=logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                         a_in=logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    pairwise_layer = nn_layers.PairwiseNoFeatsLayer(
        q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
        a_in=a_logistic_n_in)
    pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')

    nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    nnet_outdir = os.path.join(data_dir, nnet_outdir)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap,
                         batch_x_a_overlap) for batch_x_q, batch_x_a,
            batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap,
                y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = numpy.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = numpy.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
示例#3
0
# Symbolic inputs: index matrices for the question and answer (words and
# overlap indicators) and the integer label vector.
x_q, x_q_overlap = T.lmatrix('q'), T.lmatrix('q_overlap')
x_a, x_a_overlap = T.lmatrix('a'), T.lmatrix('a_overlap')
y = T.ivector('y')

########################## CONVOLUTION LAYER ########################

# Row width seen by the convolution: word-embedding columns plus
# overlap-embedding columns (the original author's note gives 50 + 5 = 55).
ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
activation = T.tanh

### CNN FOR QUESTION ###

# Padding passed to both lookup tables and added on each side of the
# sentence length in the input shape below.
q_pad = max(q_filter_widths) - 1

lookup_table_words = nn_layers.LookupTableFastStatic(
    W=vocab_emb,
    pad=q_pad,
)
lookup_table_overlap = nn_layers.LookupTableFast(
    W=vocab_emb_overlap,
    pad=q_pad,
)
lookup_table = nn_layers.ParallelLookupTable(
    layers=[lookup_table_words, lookup_table_overlap],
)

num_input_channels = 1
input_shape = (
    batch_size,
    num_input_channels,
    q_max_size + 2 * q_pad,
    ndim,
)

# One convolution -> non-linearity -> k-max-pooling stack per filter width.
conv_layers = []
for filter_width in q_filter_widths:
    filter_shape = (nfilters, num_input_channels, filter_width, ndim)
    conv = nn_layers.Conv2dLayer(
        rng=numpy_rng,
        filter_shape=filter_shape,
        input_shape=input_shape,
    )
    # One bias per filter.
    non_linearity = nn_layers.NonLinearityLayer(
        b_size=filter_shape[0],
        activation=activation,
    )
    pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling],
    )
    conv_layers.append(conv2dNonLinearMaxPool)
示例#4
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', choices=['abcnn1', 'abcnn2'])
    parser.add_argument('--similarity', choices=['euclidean', 'cosine'])
    parser.add_argument('--no-features',
                        action='store_true',
                        help='do not use external features')
    parser.add_argument('--l2svm',
                        action='store_true',
                        help='use L2-SVM as the classifier')
    parser.add_argument('--dropout', choices=['gaussian', 'mc'])
    parser.add_argument('--dropout-rate',
                        type=float,
                        help='dropout rate (default: %(default)s)')
    parser.add_argument('--nkernels',
                        type=int,
                        help='number of kernels (default: %(default)s)')
    parser.add_argument('--early-stop',
                        metavar='N',
                        type=int,
                        help='stop if seeing no improvements in N epochs')
    parser.add_argument('-e',
                        choices=['GoogleNews', 'aquaint+wiki'],
                        help='word embeddings file to use')
    parser.add_argument('mode')
    parser.set_defaults(early_stop=3,
                        e='GoogleNews',
                        dropout_rate=0.5,
                        nkernels=100)
    args = parser.parse_args()

    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = args.mode
    if mode not in ['TRAIN', 'TRAIN-ALL', 'WIKIQA-TRAIN'] + [
            'WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)
    ]:
        print "ERROR! mode '{}' is invalid".format(mode)
        sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    def load_numpy_data(data_dir, prefix):
        filetypes = [
            'questions', 'answers', 'q_overlap_indices', 'a_overlap_indices',
            'labels', 'qids', 'aids'
        ]
        filenames = [
            '{}.{}.npy'.format(prefix, filetype) for filetype in filetypes
        ]
        return [
            numpy.load(os.path.join(data_dir, filename))
            for filename in filenames
        ]

    if mode in ['TRAIN-ALL', 'TRAIN']:
        prefix = mode.lower()
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, prefix)
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'dev')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'test')

        x_train = numpy.load(
            os.path.join(data_dir, '{}.overlap_feats.npy'.format(prefix)))
        x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
        x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    elif mode in ['WIKIQA-TRAIN']:
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WikiQA-train')
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WikiQA-dev-filtered')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WikiQA-test-filtered')

        x_train = numpy.load(
            os.path.join(data_dir, 'WikiQA-train.overlap_feats.npy'))
        x_dev = numpy.load(
            os.path.join(data_dir, 'WikiQA-dev-filtered.overlap_feats.npy'))
        x_test = numpy.load(
            os.path.join(data_dir, 'WikiQA-test-filtered.overlap_feats.npy'))

    elif mode in ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)]:
        fn = ['WEBAP-FOLD{}-TRAIN'.format(i)
              for i in (1, 2, 3, 4, 5)].index(mode) + 1

        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-train'.format(fn))
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-dev'.format(fn))
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WebAP-fold{}-test'.format(fn))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True)
    print "Scaling features"
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)
    x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    print 'x_train', x_train.shape
    print 'x_dev', x_dev.shape
    print 'x_test', x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    if args.e in ['GoogleNews']:
        fname = os.path.join(data_dir,
                             'emb_GoogleNews-vectors-negative300.bin.npy')
    elif args.e in ['aquaint+wiki']:
        fname = os.path.join(data_dir,
                             'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')
    else:
        print 'No such embedding file: {}'.format(args.e)
        sys.exit(1)

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = args.dropout_rate
    nkernels = args.nkernels
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    # Lookup layers
    lookup_table_q = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(q_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(q_filter_widths) - 1)
    ])
    lookup_table_q.set_input((x_q, x_q_overlap))

    lookup_table_a = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(a_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(a_filter_widths) - 1)
    ])
    lookup_table_a.set_input((x_a, x_a_overlap))

    # NOTE: these seemingly mismatched shapes are actually correct
    if args.a in ['abcnn1']:
        attention = AttentionTransformLayer(
            similarity=args.similarity,
            rng=numpy_rng,
            W_q_shape=(a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim),
            W_a_shape=(q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim))
        num_input_channels = 2
    elif args.a in ['abcnn2']:
        attention = AttentionWeightingLayer(similarity=args.similarity)
        num_input_channels = 1
    else:
        attention = None
        num_input_channels = 1

    if attention is not None:
        attention.set_input((lookup_table_q.output, lookup_table_a.output))
        input0, input1 = attention.output
    else:
        input0, input1 = lookup_table_q.output, lookup_table_a.output

    input_shape_q = (batch_size, num_input_channels,
                     q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    input_shape_a = (batch_size, num_input_channels,
                     a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    ###### QUESTION ######

    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 *
    #                (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_q)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_q.set_input(input0)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # num_input_channels = len(lookup_table.layers)
    # input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 *
    #                (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_a)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_a.set_input(input1)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    if args.dropout:
        if args.dropout == 'gaussian':
            dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
            dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
        elif args.dropout == 'mc':
            dropout_q = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
            dropout_a = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
        dropout_q.set_input(nnet_q.output)
        dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                   a_in=logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                         a_in=logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
    # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
    #                                                   a_in=a_logistic_n_in,
    #                                                   n_in=feats_ndim)
    # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    # pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
    #                                                 a_in=a_logistic_n_in)
    # pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    if args.no_features:
        pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
                                                        a_in=a_logistic_n_in)
        n_in = q_logistic_n_in + a_logistic_n_in + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    else:
        pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
                                                          a_in=a_logistic_n_in,
                                                          n_in=feats_ndim)
        n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output, x))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output, x))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    # n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    if args.l2svm:
        classifier = nn_layers.L2SVM(n_in=n_in, n_out=n_outs)
    else:
        classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    all_layers = []
    if args.a:
        all_layers.append(attention)
    all_layers.extend([nnet_q, nnet_a])
    if args.dropout:
        all_layers.extend([dropout_q, dropout_a])
    all_layers.extend([pairwise_layer, hidden_layer, classifier])

    train_nnet = nn_layers.FeedForwardNet(
        layers=all_layers,
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        """Run ``pred_fn`` over every mini-batch and return the joined output.

        Each item yielded by ``batch_iterator`` is unpacked into six arrays
        (question ids, answer ids, question overlap ids, answer overlap ids,
        extra features, labels); the labels are ignored here.

        The concatenated result is cut to ``batch_iterator.n_samples`` entries
        (presumably because the constant-batch-size iterator pads the final
        batch -- TODO confirm against sgd_trainer).
        """
        per_batch_outputs = []
        for q_ids, a_ids, q_ov, a_ov, feats, _ in batch_iterator:
            per_batch_outputs.append(pred_fn(q_ids, a_ids, q_ov, a_ov, feats))
        stacked = numpy.hstack(per_batch_outputs)
        return stacked[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        """Run ``pred_prob_fn`` over every mini-batch and return the joined output.

        Each item yielded by ``batch_iterator`` is unpacked into six arrays
        (question ids, answer ids, question overlap ids, answer overlap ids,
        extra features, labels); the labels are ignored here.

        The concatenated result is cut to ``batch_iterator.n_samples`` entries
        (presumably because the constant-batch-size iterator pads the final
        batch -- TODO confirm against sgd_trainer).
        """
        per_batch_probs = []
        for q_ids, a_ids, q_ov, a_ov, feats, _ in batch_iterator:
            per_batch_probs.append(
                pred_prob_fn(q_ids, a_ids, q_ov, a_ov, feats))
        stacked = numpy.hstack(per_batch_probs)
        return stacked[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        """Compute Mean Average Precision (MAP) over per-query rankings.

        Args:
            qids: iterable of query ids, one per candidate.
            labels: iterable of relevance labels aligned with ``qids``; a
                candidate counts as relevant when its label is > 0.
            preds: iterable of predicted scores aligned with ``qids``;
                candidates with higher scores are ranked first.

        Returns:
            The mean over all distinct query ids of the average precision of
            that query's ranking. A query with no relevant candidate
            contributes 0.0. Returns 0.0 when no candidates are given.
        """
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        # Nothing to rank: avoid dividing by zero below.
        if not qid2cand:
            return 0.0

        average_precs = []
        # .values() is available on both Python 2 and Python 3 dicts.
        for candidates in qid2cand.values():
            precision_sum = 0.0
            n_relevant = 0
            # Tuples sort by score first; equal scores fall back to the label,
            # which is the same ordering the ranking has always used here.
            ranked = sorted(candidates, reverse=True)
            for rank, (_, label) in enumerate(ranked, 1):
                if label > 0:
                    n_relevant += 1
                    precision_sum += float(n_relevant) / rank
            # Exact division instead of an epsilon-padded denominator, so a
            # perfect ranking scores exactly 1.0.
            if n_relevant:
                average_precs.append(precision_sum / n_relevant)
            else:
                average_precs.append(0.0)

        return sum(average_precs) / len(average_precs)

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap, x,
                y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= args.early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = aids_test
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = aids_test
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
    print 'results saved to directory {}'.format(nnet_outdir)
# Example #5
# 0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    #mode = 'TRAIN_DATA'
    #mode = 'TRAIN_NO_OVERLAP'
    #if len(sys.argv) > 1:
    #    mode = sys.argv[1]
    #    if not mode in ['TRAIN', 'TRAIN-ALL']:
    #        print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
    #        sys.exit(1)

    mode = 'k_time_data1'.upper()

    print "Running training in the {} setting".format(mode)

    position_num = 10
    select_model = "PSCM"
    if select_model == "PSCM":
        click_model_index = 4  #PSCM
    elif select_model == "UBM":
        click_model_index = 1
    else:
        raise "MODEL SELECT ERROR!"
    data_dir = mode

    add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy'))
    q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
    a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
    y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy'))
    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy'))
    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    #multi dim

    #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0]
    #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0]
    #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0]

    #y_train = y_train_tmp
    #y_dev = y_dev_tmp
    #y_test = y_test_tmp

    max_query_id = numpy.max([
        numpy.max(add_train[:, 0]),
        numpy.max(add_test[:, 0]),
        numpy.max(add_dev[:, 0])
    ])
    max_url_id = numpy.max([
        numpy.max(add_train[:, 1:]),
        numpy.max(add_test[:, 1:]),
        numpy.max(add_dev[:, 1:])
    ])

    print 'max_query_id', max_query_id
    print 'max_url_id', max_url_id

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[2]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    #ndim = 5
    #print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    #dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #print "Gaussian"
    #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    #x_q_overlap = T.lmatrix('q_overlap')
    #x_a = T.lmatrix('a')
    x_a_all = T.ltensor3('a_all')
    #x_a_overlap = T.lmatrix('a_overlap')
    #y = T.ivector('y')
    y = T.imatrix('y')
    add_info = T.dmatrix('add_info')

    #######
    n_outs = 2

    n_epochs = 15
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
    lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    #nnet_q.set_input((x_q, x_q_overlap))
    nnet_q.set_input([x_q])
    ######

    ###### ANSWER ######
    nnet_a_list = []
    #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    for i in xrange(position_num):
        #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
        #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

        # num_input_channels = len(lookup_table.layers)
        #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        input_shape = (batch_size, num_input_channels,
                       a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        conv_layers = []
        for filter_width in a_filter_widths:
            filter_shape = (nkernels, num_input_channels, filter_width, ndim)
            conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                         filter_shape=filter_shape,
                                         input_shape=input_shape)
            non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                        activation=activation)
            pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
            conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
                layers=[conv, non_linearity, pooling])
            conv_layers.append(conv2dNonLinearMaxPool)

        join_layer = nn_layers.ParallelLayer(layers=conv_layers)
        flatten_layer = nn_layers.FlattenLayer()

        nnet_a = nn_layers.FeedForwardNet(layers=[
            lookup_table,
            join_layer,
            flatten_layer,
        ])
        #nnet_a.set_input((x_a, x_a_overlap))
        nnet_a.set_input([x_a_all[:, i, :]])
        nnet_a_list.append(nnet_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    print "q_logistic_n_in, ", q_logistic_n_in
    print "a_logistic_n_in, ", a_logistic_n_in

    #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(
        q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num)
    pairwise_out_list = [nnet_q.output]
    for i in xrange(position_num):
        pairwise_out_list.append(nnet_a_list[i].output)
    pairwise_layer.set_input(pairwise_out_list)
    #pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num
    #n_in = 1 * position_num + position_num * (position_num - 1) / 2
    n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (
        position_num - 1) / 2
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    #classifier.set_input(hidden_layer.output)

    classifier = nn_layers.FeatureClickModelLayer(
        n_in=n_in,
        n_out=n_outs,
        max_q_id=max_query_id,
        max_u_id=max_url_id,
        dim=position_num,
        click_model_index=click_model_index)
    #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num)
    #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num)
    #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs)
    classifier.set_input([hidden_layer.output, add_info])

    #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
    #                                      name="Training nnet")
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q] + nnet_a_list +
        [pairwise_layer, hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    #print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        select_model, mode, ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    #total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    #print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred

    #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2]
    predictions_prob = test_nnet.layers[-1].p_y_given_x

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    #batch_x_a = T.lmatrix('batch_x_a')
    batch_x_a_all = T.ltensor3('batch_x_a_all')
    #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    #batch_y = T.ivector('batch_y')
    batch_y = T.imatrix('batch_y')
    batch_add_info = T.dmatrix('batch_add_info')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a_all,
        batch_add_info,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        add_info: batch_add_info,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a_all,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
        batch_add_info,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        add_info: batch_add_info,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        """Run ``pred_fn`` over every mini-batch and stack the results.

        Each item of ``batch_iterator`` is unpacked into four parts; the first
        three are passed to ``pred_fn`` and the fourth is ignored.

        Returns a pair ``(real_preds, inner_outputs)``: the last
        ``position_num`` columns of the stacked output and the full stacked
        output, both truncated to ``batch_iterator.n_samples`` rows.
        """
        per_batch = []
        for q_batch, a_batch, info_batch, _ in batch_iterator:
            per_batch.append(pred_fn(q_batch, a_batch, info_batch))
        stacked = numpy.vstack(per_batch)

        # Only the first n_samples rows are kept.
        n_valid = batch_iterator.n_samples
        trailing = stacked[:, -1 * position_num:]
        return trailing[:n_valid], stacked[:n_valid]

    def predict_prob_batch(batch_iterator):
        """Run ``pred_prob_fn`` over every mini-batch and stack the results.

        Each item of ``batch_iterator`` is unpacked into four parts; the first
        three are passed to ``pred_prob_fn`` and the fourth is ignored.

        Returns a pair ``(real_preds, inner_outputs)``: the last
        ``position_num`` columns of the stacked output and the full stacked
        output, both truncated to ``batch_iterator.n_samples`` rows.
        """
        outputs = [
            pred_prob_fn(*batch[:3]) for batch in
            ((q, a, info) for q, a, info, _ in batch_iterator)
        ]
        all_rows = numpy.vstack(outputs)

        keep = batch_iterator.n_samples
        last_columns = all_rows[:, -1 * position_num:]
        return last_columns[:keep], all_rows[:keep]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_train, a_train, add_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, add_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, add_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test[:, -1]))
    print 'labels', labels

    def perplexity_score(labels, preds):
        """Compute per-position click perplexity and a printable report.

        ``labels`` and ``preds`` are iterated in lockstep, row by row; entry
        ``i`` of a row is attributed to position ``i``.  A truthy label counts
        as a click.  Each prediction is clamped to [0.00001, 0.99999] before
        its base-2 log-likelihood is accumulated.

        Returns ``(perplexity, report)`` where ``perplexity`` is the mean of
        the per-position perplexities and ``report`` is a tab-separated,
        multi-line string that also lists the skip-only and click-only
        perplexities for every position.
        """
        log_total = [0.0] * position_num
        log_total_by_click = [[0.0, 0.0] for _ in range(position_num)]
        n_seen = [0] * position_num
        n_seen_by_click = [[0, 0] for _ in range(position_num)]

        for label_row, pred_row in zip(labels, preds):
            for pos in range(len(label_row)):
                clicked = 1 if label_row[pos] else 0
                # Clamp so that neither log below can receive 0.
                prob = max(min(pred_row[pos], 0.99999), 0.00001)
                if clicked == 0:
                    log_prob = math.log(1 - prob, 2)
                else:
                    log_prob = math.log(prob, 2)
                log_total[pos] += log_prob
                log_total_by_click[pos][clicked] += log_prob
                n_seen[pos] += 1
                n_seen_by_click[pos][clicked] += 1

        # A position with no observations keeps its (zero) log total as the
        # exponent, so its perplexity evaluates to 1.0.
        per_position = []
        for total, n in zip(log_total, n_seen):
            exponent = -total / n if n else total
            per_position.append(2**exponent)

        # Index 0 holds the skip perplexities, index 1 the click perplexities.
        per_click = []
        for clicked in (0, 1):
            row = []
            for totals, counts in zip(log_total_by_click, n_seen_by_click):
                divisor = counts[clicked] if counts[clicked] else 1
                row.append(2**(-totals[clicked] / divisor))
            per_click.append(row)

        perplexity = sum(per_position) / len(per_position)

        def tabbed(values):
            return "".join("\t" + str(value) for value in values)

        pieces = [
            "---------\n",
            "Perplexity\t" + str(perplexity) + "\n",
            "positionPerplexity" + tabbed(per_position) + "\n",
            "positionPerplexitySkip" + tabbed(per_click[0]) + "\n",
            "positionPerplexityClick" + tabbed(per_click[1]),
            "\n------------\n",
        ]
        return perplexity, "".join(pieces)

    def map_score(qids, labels, preds):
        """Return the mean average precision of ``preds`` grouped by query id.

        ``qids``, ``labels`` and ``preds`` are parallel sequences.  Candidates
        sharing a query id are ranked by descending prediction score; a
        candidate is relevant when its label is greater than zero.

        A query without any relevant candidate contributes an average
        precision of 0.  Returns 0.0 when there are no queries at all.
        """
        candidates_by_query = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            candidates_by_query[qid].append((pred, label))

        if not candidates_by_query:
            return 0.0

        average_precs = []
        for candidates in candidates_by_query.values():
            # Rank on the score alone.  Sorting the whole (score, label) tuple
            # would break ties by label and place relevant items first, which
            # lets the gold labels leak into the ranking.  The sort is stable,
            # so tied candidates keep their input order.
            ranked = sorted(candidates,
                            key=lambda cand: cand[0],
                            reverse=True)
            hits = 0
            precision_sum = 0.0
            for rank, (_, label) in enumerate(ranked, 1):
                if label > 0:
                    hits += 1
                    precision_sum += float(hits) / rank
            # Exact division; the zero-hit case is handled explicitly instead
            # of adding an epsilon to every denominator.
            average_precs.append(precision_sum / hits if hits else 0.0)

        return sum(average_precs) / len(average_precs)

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    best_dev_perp = numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, add, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator)
                #print "shape:"
                #print str(y_dev.shape)
                #print str(y_pred_dev.shape)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev[:, -1],
                                                y_pred_dev[:, -1]) * 100
                dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev)
                if dev_acc > best_dev_acc:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc

                if dev_perp < best_dev_perp:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    test_perplexity, test_perplexity_str = perplexity_score(
                        y_test, y_pred)
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc,
                                dev_perp, best_dev_perp))
                    print str(test_perplexity_str)
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    best_inner = y_inner
                    no_best_dev_update = 0
                    best_dev_perp = dev_perp
        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        numpy.savetxt(
            os.path.join(
                nnet_outdir,
                'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'
                .format(epoch, i, best_dev_perp)), best_inner)
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100
    test_perp, test_perp_str = perplexity_score(y_test, y_pred_test)
    print "FINAL ACCURACY"
    print str(test_acc)
    print "FINAL PERPLEXITY"
    print str(test_perp_str)
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred_test)
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'
            .format(epoch, i, best_dev_acc)), best_inner)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
def main():
    ##########
    # LAYERS #
    #########
    HOME_DIR = "semeval_parsed"
    timestamp = str(long(time.time() * 1000))
    input_fname = '200M'
    embedding = 'custom'

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_winner.p', 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    fname_wordembeddings = os.path.join(
        data_dir, 'emb_smiley_tweets_embedding_topic.npy')
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb_overlap = numpy.load(fname_wordembeddings)
    ndim = vocab_emb_overlap.shape[1]

    ndim = 5
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_id = alphabet.fid
    vocab_emb_overlap = (numpy_rng.randn(dummy_word_id + 1, ndim) *
                         0.25).astype(numpy.float32)

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    topics = T.imatrix('topics')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_topics = T.imatrix('batch_top')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    lookup_table_topic = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                   pad=filter_width - 1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_topic])

    filter_shape = parameter_map['FilterShape' + str(filter_width)]
    filter_shape = (filter_shape[0], filter_shape[1], filter_shape[2],
                    filter_shape[3] + ndim)

    input_shape = (input_shape[0], input_shape[1], input_shape[2],
                   input_shape[3] + ndim)

    conv_layers = []

    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
    W_bound = numpy.sqrt(1. / fan_in)
    W_data = numpy.asarray(numpy_rng.uniform(low=-W_bound,
                                             high=W_bound,
                                             size=(filter_shape[0],
                                                   filter_shape[1],
                                                   filter_shape[2], ndim)),
                           dtype=theano.config.floatX)

    W_map = parameter_map['Conv2dLayerW' + str(filter_width)].get_value()

    print W_map.shape
    print W_data.shape
    W_data = numpy.concatenate((W_map, W_data), axis=3)

    conv = nn_layers.Conv2dLayer(W=theano.shared(W_data,
                                                 name="W_conv1d",
                                                 borrow=True),
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    input_shape2 = parameter_map['input_shape2' + str(filter_width)]
    filter_shape2 = parameter_map['FilterShape2' + str(filter_width)]

    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2' +
                                                 str(filter_width)],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)

    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2' + str(filter_width)],
        b_size=filter_shape2[0],
        activation=activation)

    shape2 = parameter_map['PoolingShape2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling, con2, non_linearity2, pooling2])

    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 2
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_topics, batch_y]
    givens_train = {tweets: batch_tweets, topics: batch_topics, y: batch_y}

    inputs_pred = [batch_tweets, batch_topics]
    givens_pred = {tweets: batch_tweets, topics: batch_topics}

    nnet_tweets.set_input((tweets, topics))
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_topics)
            for (batch_x_q, batch_topics) in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    #######################

    # Supervised Learining#
    ######################
    batch_size = 1000

    training_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tids.npy'))
    training_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tweets.npy'))
    training_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.sentiments.npy'))
    training_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.topics.npy'))

    dev_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tids.npy'))
    dev_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tweets.npy'))
    dev_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.sentiments.npy'))
    dev_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.topics.npy'))

    devtest_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tids.npy'))
    devtest_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tweets.npy'))
    devtest_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.sentiments.npy'))
    devtest_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.topics.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tweets.npy'))
    test_2016_topics = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.topics.npy'))

    training_full_tweets = numpy.concatenate(
        (training_2016_tweets, dev_2016_tweets), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_2016_sentiments, dev_2016_sentiments), axis=0)
    training_full_topics = numpy.concatenate(
        (training_2016_topics, dev_2016_topics), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [training_full_tweets, training_full_topics, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)

    devtest2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtest_2016_tweets, devtest_2016_topics],
        batch_size=batch_size,
        randomize=False)

    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets, test_2016_topics],
        batch_size=batch_size,
        randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 100
    early_stop = 20
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, topic,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, topic, y_label)

            if i % check_freq == 0 or i == num_train_batches:
                y_pred_devtest_2016 = predict_batch(devtest2016_iterator)
                dev_acc_2016_devtest = semeval_f1_taskB(
                    devtest_2016_sentiments, y_pred_devtest_2016)

                if dev_acc_2016_devtest > best_dev_acc:
                    print(
                        'devtest 2016 epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc_2016_devtest, best_dev_acc))

                    best_dev_acc = dev_acc_2016_devtest
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                    #cPickle.dump(parameter_map, open(data_dir+'/parameters_{}.p'.format('supervised_posneg'), 'wb'))
                    y_pred_test_2016 = predict_batch(test2016_iterator)
                    numpy.save(data_dir + '/predictions_test_2016',
                               y_pred_test_2016)
                    numpy.save(data_dir + '/predictions_devtest2016',
                               y_pred_devtest_2016)

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    #######################
    # Get Sentence Vectors#
    ######################

    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets, batch_topics]
    givents_senvec = {tweets: batch_tweets, topics: batch_topics}

    output = nnet_tweets.layers[-2].output

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(dev_2016_tids, dev_2016_topics, dev_2016_tweets,
             'task-BD-dev-2016'),
            (training_2016_tids, training_2016_topics, training_2016_tweets,
             'task-BD-train-2016'),
            (devtest_2016_tids, devtest_2016_topics, devtest_2016_tweets,
             'task-BD-devtest-2016'),
            (test_2016_tids, test_2016_topics, test_2016_tweets,
             'SemEval2016-task4-test.subtask-BD')]
    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir, 'sentence_vecs_topic/{}.txt'.format(name)),
            'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    ##############################
    # Get Predictions Probabilites#
    #############################

    batch_size = input_shape[0]

    output = nnet_tweets.layers[-1].p_y_given_x

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir,
                         'prob_predictions_topic/{}.txt'.format(name)), 'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break