def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = 'train'
    if len(sys.argv) > 1:
        mode = sys.argv[1]
        if not mode in ['TRAIN', 'TRAIN-ALL']:
            print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
            sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    # 加载数据集词向量
    if mode in ['TRAIN-ALL']:
        q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy'))
    else:
        q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    q_overlap_dev = numpy.load(
        os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    a_overlap_dev = numpy.load(
        os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    q_overlap_test = numpy.load(
        os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    a_overlap_test = numpy.load(
        os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x里放的是overlap feat
    x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    # print 'a_overlap_train',a_overlap_train.shape

    # print 'x_train',x_train.shape
    # print 'x_dev',x_dev.shape
    # print 'x_test',x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)
    numpy.random.RandomState()
    # 指定种子值(指定种子值是为了使同样的条件下每次产生的随机数一样,避免程序调试时由随机数不同而引起的问题)
    numpy_rng = numpy.random.RandomState(123)
    # question中最长的长度
    q_max_sent_size = q_train.shape[1]
    # answer中最长的长度
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    # numpy.max在不指定维度信息时,返回数组中的最大的一个值
    #QQQQQQ
    dummy_word_id = numpy.max(a_overlap_train)
    print "dummy_word_id:", dummy_word_id
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    # 从标准正态分布中生成维度为(a,b)的随机数组
    # 这一行看起来像是对未登录词的初始化
    # QQQQQ
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))

    # [-1]引用的是矩阵的最后一行
    # vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    # x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    # ndim = vocab_emb.shape[1]
    print "1st conv layer dim:", ndim

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    # feature map数目
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    # 首先获得词向量信息
    # QQQQ为什么要有这两层?似乎已经获得了词的词向量表示:可能是用于为是每个具体的句子获得词向量表示
    # QQQQQ pad具体实现
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) -
                                                   1)
    #QQQQQ这一层的用途?可能也是来获得具体的句子对的overlap向量
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    # 因为是文本数据所以是单通道
    num_input_channels = 1
    # QQQQQq_max_sent_size + 2 * (max(q_filter_widths) - 1) 这一项的含义:因为在lookup中都加了两倍的对应长度的pad
    # QQQQ以及最后一项为什么是ndim
    # Minibatch of feature map stacks, of shape(batch  size, stack size, nb row, nb  col) see the optional parameter image_shape
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    print "convlution layer input shape:", input_shape

    conv_layers = []
    # 对各个filter构造卷积层
    # QQQQ各层的w矩阵初始化方案有所不同?初始化可能有哪些方案以及各种方案的性能
    for filter_width in q_filter_widths:
        # 每一层卷积的构造
        #Set of filters used in CNN layer of shape (nb filters, stack size, nb row, nb col)
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        # 此处采用的是2D卷积
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_q.set_input((x_q, x_q_overlap))
    # nnet_q.set_input(x_q)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) -
                                                   1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
    # num_input_channels = len(lookup_table.layers)
    input_shape = (batch_size, num_input_channels,
                   a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    # QQQQ为啥这里有个flattenlayer
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    # QQQQ此处x_a_overlap的用处是?
    nnet_a.set_input((x_a, x_a_overlap))
    # nnet_a.set_input(x_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    # QQQQQ这里又是干嘛的
    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_q.set_input(nnet_q.output)
    # dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                   a_in=a_logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                         a_in=a_logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # 此处应该是进行句子匹配层
    #   pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    pairwise_layer = nn_layers.PairwiseNoFeatsLayer(
        q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
        a_in=a_logistic_n_in)
    pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # 此处n_in的取值要根据上一层匹配层的方案进行不同的计算
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    #   n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    # dropout2
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer,dropout_q,dropout_a, classifier],
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={}_batch={}_max_norm={}_learning_rate={}_{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    # 将python对象序列化保存到本地的文件。
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    # 损失函数交叉熵
    cost = train_nnet.layers[-1].training_cost(y)
    ########################################
    # # QQQQQ这种方式好奇怪???看起来像cost的另外一种求法
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)
    ########################################################

    # 经过softmax后的最大值对应的类别
    predictions = test_nnet.layers[-1].y_pred
    # 经过softmax后的最大值
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    # ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg
    #
    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # 训练优化方案
    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    # batch_x是否注释,代表是否用overlap_feat特征用于训练
    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        y: batch_y
    }

    # 训练函数定义
    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)
    # 选择答案
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)
    # 每个选项的概率
    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_batch(batch_iterator):
        # numpy.hstack:Stack arrays in sequence horizontally (column wise).This is equivalent to concatenation along the second axis, except for 1-D arrays where it concatenates along the first axis
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x,_ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a,  batch_x) for
        #                       batch_x_q, batch_x_a,  batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap,
                         batch_x_a_overlap) for batch_x_q, batch_x_a,
            batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap,batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x,_ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]


# 三个迭代器

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, y_test],
        batch_size=batch_size,
        randomize=False)

    ####
    #   train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train, q_overlap_train,
    #                                                                                   a_overlap_train,x_train,y_train],
    #                                                                       batch_size=batch_size, randomize=True)
    #   dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                     [q_dev, a_dev, q_overlap_dev, a_overlap_dev,x_dev,y_dev],
    #                                                                     batch_size=batch_size, randomize=False)
    #   test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                      [q_test, a_test, q_overlap_test, a_overlap_test,x_test,
    #                                                                       y_test], batch_size=batch_size, randomize=False)
    #####
    #   train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train,  x_train, y_train],
    #                                                                       batch_size=batch_size, randomize=True)
    #   dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                     [q_dev, a_dev, x_dev,
    #                                                                      y_dev],
    #                                                                     batch_size=batch_size, randomize=False)
    #   test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #                                                                      [q_test, a_test, x_test,
    #                                                                       y_test], batch_size=batch_size, randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap,
                y) in enumerate(tqdm(train_set_iterator), 1):
            # for i, (x_q, x_a,  x, y) in enumerate(tqdm(train_set_iterator), 1):
            # train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x,y)
            # train_fn(x_q, x_a,  x, y)
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                # Compute Area Under the Receiver Operating Characteristic Curve(ROC AUC) from prediction scores.
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = numpy.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = numpy.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
示例#2
0
# Question encoder: parallel lookup tables -> one conv stack per filter width
# -> flatten.  This fragment relies on names defined elsewhere (vocab_emb,
# vocab_emb_overlap, q_filter_widths, q_max_size, nfilters, ...).

# The same pad is passed to both lookup tables and appears twice in the
# input height below (presumably once per sentence end - TODO confirm).
q_pad = max(q_filter_widths) - 1
lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=q_pad)
lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                 pad=q_pad)
lookup_table = nn_layers.ParallelLookupTable(
    layers=[lookup_table_words, lookup_table_overlap])

num_input_channels = 1
# (batch size, stack size, nb rows, nb cols)
input_shape = (
    batch_size,
    num_input_channels,
    q_max_size + 2 * q_pad,
    ndim,
)

# One conv -> nonlinearity -> pooling stack per filter width.
conv_layers = []
for filter_width in q_filter_widths:
    # (nb filters, stack size, nb rows, nb cols)
    filter_shape = (nfilters, num_input_channels, filter_width, ndim)
    conv = nn_layers.Conv2dLayer(
        rng=numpy_rng,
        filter_shape=filter_shape,
        input_shape=input_shape,
    )
    non_linearity = nn_layers.NonLinearityLayer(
        b_size=filter_shape[0],
        activation=activation,
    )
    pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])
    conv_layers.append(conv2dNonLinearMaxPool)

join_layer = nn_layers.ParallelLayer(layers=conv_layers)
flatten_layer = nn_layers.FlattenLayer()

# Full question network, fed with the word and overlap index inputs.
nnet_q = nn_layers.FeedForwardNet(
    layers=[lookup_table, join_layer, flatten_layer])
nnet_q.set_input((x_q, x_q_overlap))


示例#3
0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = sys.argv[1]
    """
  if len(sys.argv) > 1:
    mode = sys.argv[1]
    if not mode in ['TRAIN', 'TRAIN-ALL']:
      print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
      sys.exit(1)
  """
    print "Running training in the {} setting".format(mode)

    data_dir = mode

    if mode in ['TRAIN-ALL']:
        q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy'))
    else:
        q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
        q_overlap_train = numpy.load(os.path.join(data_dir, 'train.q_sim.npy'))
        a_overlap_train = numpy.load(os.path.join(data_dir, 'train.a_sim.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_sim.npy'))
    a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_sim.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_sim.npy'))
    a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_sim.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_q.set_input((x_q, x_q_overlap))
    ######

    ###### ANSWER ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) -
                                                     1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    # num_input_channels = len(lookup_table.layers)
    input_shape = (batch_size, num_input_channels,
                   a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_a.set_input((x_a, x_a_overlap))
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_q.set_input(nnet_q.output)
    # dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                   a_in=logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                         a_in=logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    pairwise_layer = nn_layers.PairwiseNoFeatsLayer(
        q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
        # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
        a_in=a_logistic_n_in)
    pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')

    nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    nnet_outdir = os.path.join(data_dir, nnet_outdir)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap,
                         batch_x_a_overlap) for batch_x_q, batch_x_a,
            batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap,
                y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = numpy.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = numpy.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
示例#4
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', choices=['abcnn1', 'abcnn2'])
    parser.add_argument('--similarity', choices=['euclidean', 'cosine'])
    parser.add_argument('--no-features',
                        action='store_true',
                        help='do not use external features')
    parser.add_argument('--l2svm',
                        action='store_true',
                        help='use L2-SVM as the classifier')
    parser.add_argument('--dropout', choices=['gaussian', 'mc'])
    parser.add_argument('--dropout-rate',
                        type=float,
                        help='dropout rate (default: %(default)s)')
    parser.add_argument('--nkernels',
                        type=int,
                        help='number of kernels (default: %(default)s)')
    parser.add_argument('--early-stop',
                        metavar='N',
                        type=int,
                        help='stop if seeing no improvements in N epochs')
    parser.add_argument('-e',
                        choices=['GoogleNews', 'aquaint+wiki'],
                        help='word embeddings file to use')
    parser.add_argument('mode')
    parser.set_defaults(early_stop=3,
                        e='GoogleNews',
                        dropout_rate=0.5,
                        nkernels=100)
    args = parser.parse_args()

    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = args.mode
    if mode not in ['TRAIN', 'TRAIN-ALL', 'WIKIQA-TRAIN'] + [
            'WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)
    ]:
        print "ERROR! mode '{}' is invalid".format(mode)
        sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    def load_numpy_data(data_dir, prefix):
        filetypes = [
            'questions', 'answers', 'q_overlap_indices', 'a_overlap_indices',
            'labels', 'qids', 'aids'
        ]
        filenames = [
            '{}.{}.npy'.format(prefix, filetype) for filetype in filetypes
        ]
        return [
            numpy.load(os.path.join(data_dir, filename))
            for filename in filenames
        ]

    if mode in ['TRAIN-ALL', 'TRAIN']:
        prefix = mode.lower()
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, prefix)
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'dev')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'test')

        x_train = numpy.load(
            os.path.join(data_dir, '{}.overlap_feats.npy'.format(prefix)))
        x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
        x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    elif mode in ['WIKIQA-TRAIN']:
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WikiQA-train')
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WikiQA-dev-filtered')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WikiQA-test-filtered')

        x_train = numpy.load(
            os.path.join(data_dir, 'WikiQA-train.overlap_feats.npy'))
        x_dev = numpy.load(
            os.path.join(data_dir, 'WikiQA-dev-filtered.overlap_feats.npy'))
        x_test = numpy.load(
            os.path.join(data_dir, 'WikiQA-test-filtered.overlap_feats.npy'))

    elif mode in ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)]:
        fn = ['WEBAP-FOLD{}-TRAIN'.format(i)
              for i in (1, 2, 3, 4, 5)].index(mode) + 1

        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-train'.format(fn))
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-dev'.format(fn))
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WebAP-fold{}-test'.format(fn))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True)
    print "Scaling features"
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)
    x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    print 'x_train', x_train.shape
    print 'x_dev', x_dev.shape
    print 'x_test', x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    if args.e in ['GoogleNews']:
        fname = os.path.join(data_dir,
                             'emb_GoogleNews-vectors-negative300.bin.npy')
    elif args.e in ['aquaint+wiki']:
        fname = os.path.join(data_dir,
                             'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')
    else:
        print 'No such embedding file: {}'.format(args.e)
        sys.exit(1)

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = args.dropout_rate
    nkernels = args.nkernels
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    # Lookup layers
    lookup_table_q = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(q_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(q_filter_widths) - 1)
    ])
    lookup_table_q.set_input((x_q, x_q_overlap))

    lookup_table_a = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(a_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(a_filter_widths) - 1)
    ])
    lookup_table_a.set_input((x_a, x_a_overlap))

    # NOTE: these seemingly mismatched shapes are actually correct
    if args.a in ['abcnn1']:
        attention = AttentionTransformLayer(
            similarity=args.similarity,
            rng=numpy_rng,
            W_q_shape=(a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim),
            W_a_shape=(q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim))
        num_input_channels = 2
    elif args.a in ['abcnn2']:
        attention = AttentionWeightingLayer(similarity=args.similarity)
        num_input_channels = 1
    else:
        attention = None
        num_input_channels = 1

    if attention is not None:
        attention.set_input((lookup_table_q.output, lookup_table_a.output))
        input0, input1 = attention.output
    else:
        input0, input1 = lookup_table_q.output, lookup_table_a.output

    input_shape_q = (batch_size, num_input_channels,
                     q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    input_shape_a = (batch_size, num_input_channels,
                     a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    ###### QUESTION ######

    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 *
    #                (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_q)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_q.set_input(input0)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # num_input_channels = len(lookup_table.layers)
    # input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 *
    #                (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_a)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_a.set_input(input1)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    if args.dropout:
        if args.dropout == 'gaussian':
            dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
            dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
        elif args.dropout == 'mc':
            dropout_q = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
            dropout_a = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
        dropout_q.set_input(nnet_q.output)
        dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                   a_in=logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                         a_in=logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
    # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
    #                                                   a_in=a_logistic_n_in,
    #                                                   n_in=feats_ndim)
    # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    # pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
    #                                                 a_in=a_logistic_n_in)
    # pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    if args.no_features:
        pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
                                                        a_in=a_logistic_n_in)
        n_in = q_logistic_n_in + a_logistic_n_in + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    else:
        pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
                                                          a_in=a_logistic_n_in,
                                                          n_in=feats_ndim)
        n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output, x))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output, x))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    # n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    if args.l2svm:
        classifier = nn_layers.L2SVM(n_in=n_in, n_out=n_outs)
    else:
        classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    all_layers = []
    if args.a:
        all_layers.append(attention)
    all_layers.extend([nnet_q, nnet_a])
    if args.dropout:
        all_layers.extend([dropout_q, dropout_a])
    all_layers.extend([pairwise_layer, hidden_layer, classifier])

    train_nnet = nn_layers.FeedForwardNet(
        layers=all_layers,
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        """Run ``pred_fn`` over every batch and return the joined outputs.

        Each item yielded by ``batch_iterator`` is unpacked into six fields:
        the first five are forwarded to ``pred_fn`` in order, the sixth is
        ignored.  The per-batch results are concatenated with
        ``numpy.hstack`` and cut down to ``batch_iterator.n_samples`` entries.
        """
        batch_outputs = []
        for q, a, q_overlap, a_overlap, feats, _ in batch_iterator:
            batch_outputs.append(pred_fn(q, a, q_overlap, a_overlap, feats))
        stacked = numpy.hstack(batch_outputs)
        # Presumably the constant-batch-size iterator pads its last batch, so
        # anything past n_samples is dropped -- confirm in sgd_trainer.
        return stacked[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        """Run ``pred_prob_fn`` over every batch and return the joined outputs.

        Each item yielded by ``batch_iterator`` is unpacked into six fields:
        the first five are forwarded to ``pred_prob_fn`` in order, the sixth
        is ignored.  The per-batch results are concatenated with
        ``numpy.hstack`` and cut down to ``batch_iterator.n_samples`` entries.
        """
        batch_outputs = []
        for q, a, q_overlap, a_overlap, feats, _ in batch_iterator:
            batch_outputs.append(
                pred_prob_fn(q, a, q_overlap, a_overlap, feats))
        stacked = numpy.hstack(batch_outputs)
        # Presumably the constant-batch-size iterator pads its last batch, so
        # anything past n_samples is dropped -- confirm in sgd_trainer.
        return stacked[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        """Compute Mean Average Precision over candidates grouped by question.

        Args:
            qids: iterable of question ids, one per candidate.
            labels: iterable of relevance labels, aligned with ``qids``;
                a candidate counts as relevant when its label is ``> 0``.
            preds: iterable of scores, aligned with ``qids``; candidates are
                ranked by descending score within each question.

        Returns:
            The mean over questions of the average precision, as a float.
            A question without any relevant candidate contributes 0.0.
            Returns 0.0 when no candidates are given at all.
        """
        # Group (score, label) pairs per question, preserving input order.
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        if not qid2cand:
            return 0.0

        average_precs = []
        # Only the candidate lists are needed; .values() also works on both
        # Python 2 and Python 3, unlike .iteritems().
        for candidates in qid2cand.values():
            # Rank by score ONLY.  Sorting the raw (score, label) tuples would
            # break score ties by label and put relevant candidates first,
            # which leaks the gold labels into the ranking.  The sort is
            # stable, so tied candidates keep their input order.
            ranked = sorted(candidates, key=lambda cand: cand[0], reverse=True)
            precision_sum = 0.0
            num_relevant = 0
            for rank, (_, label) in enumerate(ranked, 1):
                if label > 0:
                    num_relevant += 1
                    precision_sum += float(num_relevant) / rank
            if num_relevant:
                average_precs.append(precision_sum / num_relevant)
            else:
                average_precs.append(0.0)
        return sum(average_precs) / len(average_precs)

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap, x,
                y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= args.early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = aids_test
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = aids_test
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
    print 'results saved to directory {}'.format(nnet_outdir)
def main():
    ##########
    # LAYERS #
    #########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    test_type = ''
    if len(sys.argv) > 1:
        test_type = sys.argv[1]

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['qLogisticIn']
    k_max = parameter_map['kmax']

    def relu(x):
        """Rectifier: multiply ``x`` by the mask ``x > 0``, zeroing the rest."""
        positive_mask = x > 0
        return x * positive_mask

    activation = relu

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    filter_shape = parameter_map['FilterShape' + str(filter_width)]

    conv_layers = []

    conv = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW' +
                                                 str(filter_width)],
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation)

    pooling = nn_layers.KMaxPoolLayer(k_max=k_max)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])

    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}

    inputs_pred = [
        batch_tweets,
    ]
    givens_pred = {
        tweets: batch_tweets,
    }

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        """Run ``pred_fn`` over every batch and return the joined outputs.

        Only the first field of each item yielded by ``batch_iterator`` is
        passed to ``pred_fn``.  The per-batch results are concatenated with
        ``numpy.hstack`` and cut down to ``batch_iterator.n_samples`` entries.
        """
        batch_outputs = []
        for batch in batch_iterator:
            first_field = batch[0]
            batch_outputs.append(pred_fn(first_field))
        stacked = numpy.hstack(batch_outputs)
        # Presumably the constant-batch-size iterator pads its last batch, so
        # anything past n_samples is dropped -- confirm in sgd_trainer.
        return stacked[:batch_iterator.n_samples]

    #######
    #Names#
    #######
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []

    #######################
    # Supervised Learning #
    ######################
    batch_size = 1000

    training2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(
        os.path.join(data_dir,
                     'task-B-test2014-twittersarcasm.sentiments.npy'))

    training_full_tweets = numpy.concatenate(
        (training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate(
        (training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)

    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)

    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)

    devtestA2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtestA_2016_tweets],
        batch_size=batch_size,
        randomize=False)

    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets],
        batch_size=batch_size,
        randomize=False)

    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)

    test2013_itarator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)

    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)

    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets],
        batch_size=batch_size,
        randomize=False)

    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets],
        batch_size=batch_size,
        randomize=False)

    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)

    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets],
        batch_size=batch_size,
        randomize=False)

    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets],
        batch_size=batch_size,
        randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 50
    early_stop = 50
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)

            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                #y_pred_train_2013 = predict_batch(train2013_iterator)
                #y_pred_train_2016 = predict_batch(train2016_iterator)
                #y_pred_dev2016 = predict_batch(dev2016_iterator)
                #y_pred_dev2013 = predict_batch(dev_2013_iterator)
                y_pred_test_2016 = predict_batch(test2016_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_itarator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(
                    test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(
                    test_2014_sarcasm_iterator)
                #y_pred_devtest_2016 = predict_batch(devtestA2016_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments,
                                                y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments,
                                                y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(
                    test_2014_livejournal_sentiments,
                    y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(
                    test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments,
                                                y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments,
                                                    y_pred_test_sms_2013)
                dev_acc_2016 = semeval_f1_taskA(test_2016_sentiments,
                                                y_pred_test_2016)

                ep_pred[test_2016n].append(dev_acc_2016)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)

                if dev_acc_2016 > best_dev_acc:
                    best_dev_acc = dev_acc_2016
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                    print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2016))
                    print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2015))
                    print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2014))
                    print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2013))
                    print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2014_lj))
                    print(
                        '2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                        format(epoch, i, dev_acc_2014_srcs))
                    print(
                        '2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                        format(epoch, i, dev_acc_2013_sms))
                    #print('devtest2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2016_devtest))

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    cPickle.dump(
        ep_pred,
        open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb'))

    return
    #######################
    # Get Sentence Vectors#
    ######################

    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets]
    givents_senvec = {
        tweets: batch_tweets,
    }

    output = nnet_tweets.layers[-2].output

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
            (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
            (training2013_tids, training2013_tweets, 'task-BD-train-2013'),
            (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
            (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
            (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
            (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
            (test_2016_tids, test_2016_tweets,
             'SemEval2016-task4-test.subtask-A'),
            (test_2014_sarcasm_tids, test_2014_sarcasm_tweets,
             'test_2014_sarcasm'),
            (test_2014_livejournal_tids, test_2014_livejournal_tweets,
             'task-B-test2014-livejournal'),
            (test_2013_tids, test_2013_tweets, 'task-BD-train-2013'),
            (dev_2013_tids, dev_2013_tweets, 'task-BD-dev-2013')]

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    ##############################
    # Get Predictions Probabilites#
    #############################

    batch_size = input_shape[0]

    output = nnet_tweets.layers[-1].p_y_given_x

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir, 'prob_predictions/{}.txt'.format(name)),
            'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
def main():
    ZEROUT_DUMMY_WORD = True

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data', type=unicode,
                            help='E.g.: pre, pre_hf, etc.')
    arg_parser.add_argument('--filter', '-f', type=unicode, default='3,4,5',
                            help='E.g.: 3 or 3,4,5')
    arg_parser.add_argument('--n_kernels', type=int, default=100,
                            help='# of kernels (filters)')
    arg_parser.add_argument('--n_epochs', type=int, default=25)
    arg_parser.add_argument('--batch_size', type=int, default=50)
    arg_parser.add_argument('--dropout_rate', type=float, default=0.5)
    arg_parser.add_argument('--vocab_embedding_type', type=unicode,
                            default='both', help='both/static/nonstatic')
    arg_parser.add_argument('--vocab_embedding_size', type=int,
                            default='50', help='50/200/500/800')
    arg_parser.add_argument('--L2_embedding', type=float, default=0.)
    arg_parser.add_argument('--L2_conv', type=float, default=0.)
    arg_parser.add_argument('--L2_linear', type=float, default=0.)
    arg_parser.add_argument('--activation', type=unicode, default='tanh')
    arg_parser.add_argument('--input_type', type=unicode, default='index',
                            help='E.g.: index (use a lookup layer), embed.')
    arg_parser.set_defaults(save_nn_features=False)
    arg_parser.add_argument('--save_features', dest='save_nn_features', 
                            action='store_true', 
                            help='Save outputs from second last layer')
                            
    args = arg_parser.parse_args()
    data_name = args.data
    filter_widths = [int(a) for a in args.filter.split(',')]
    n_epochs = args.n_epochs
    batch_size = args.batch_size
    dropout_rate = args.dropout_rate
    vocab_embedding_type = args.vocab_embedding_type
    vocab_embedding_size = args.vocab_embedding_size
    L2_embedding = args.L2_embedding
    L2_conv = args.L2_conv
    L2_linear = args.L2_linear
    nkernels = args.n_kernels
    activation_str = args.activation
    input_type = args.input_type    
    save_nn_features = args.save_nn_features
    
    ## Load data
    data_dir = '../../data/{}'.format(data_name)
    embedding_dir = '../../data/word2vec'
    if input_type == 'index':
        x_train = np.load(os.path.join(data_dir, 'train_input.npy'))
        x_dev = np.load(os.path.join(data_dir, 'valid_input.npy'))
        x_test = np.load(os.path.join(data_dir, 'test_input.npy'))
    elif input_type == 'embed':
        x_train = np.load(os.path.join(
            data_dir, 'train_embed_{}.npy'.format(vocab_embedding_size)))
        x_dev = np.load(os.path.join(
            data_dir, 'valid_embed_{}.npy'.format(vocab_embedding_size)))
        x_test = np.load(os.path.join(
            data_dir, 'test_embed_{}.npy'.format(vocab_embedding_size)))
    y_train = np.load(os.path.join(data_dir, 'train_label.npy'))
    y_dev = np.load(os.path.join(data_dir, 'valid_label.npy'))
    y_test = np.load(os.path.join(data_dir, 'test_label.npy'))
    y_candidates = np.unique(np.concatenate((y_train, y_dev, y_test)))
    n_y_class = len(y_candidates)
    # for multi class label, from (0, 1, 3, 7, ..) to (0, 1, 2, 3, ...)    
    y_train = np.array([np.where(y_candidates==yy)[0][0] for yy in y_train], 
                        dtype='int32')
    y_dev = np.array([np.where(y_candidates==yy)[0][0] for yy in y_dev], 
                      dtype='int32')
    y_test = np.array([np.where(y_candidates==yy)[0][0] for yy in y_test], 
                       dtype='int32')
    if n_y_class > 2:
        y_train_foreval = np.zeros([len(y_train), n_y_class])
        y_train_foreval[np.arange(len(y_train)), y_train] = 1
        y_dev_foreval = np.zeros([len(y_dev), n_y_class])
        y_dev_foreval[np.arange(len(y_dev)), y_dev] = 1
        y_test_foreval = np.zeros([len(y_test), n_y_class])
        y_test_foreval[np.arange(len(y_test)), y_test] = 1
    else:
        y_train_foreval = np.array(y_train > 0, dtype=int)
        y_dev_foreval = np.array(y_dev > 0, dtype=int)
        y_test_foreval = np.array(y_test > 0, dtype=int)
    
    print 'y_train', np.unique(y_train, return_counts=True),
    print 'y_dev', np.unique(y_dev, return_counts=True)
    print 'y_test', np.unique(y_test, return_counts=True)
    print 'x_train', x_train.shape, x_train.dtype, theano.config.floatX
    print 'x_dev', x_dev.shape
    print 'x_test', x_test.shape

    np_rng = np.random.RandomState()
    x_max_sent_size = x_train.shape[1]
    if input_type == 'index':
        ## Load word2vec embeddings
        fname = os.path.join(embedding_dir, 
                             'word2vec_{}.npy'.format(vocab_embedding_size))
        print "Loading word embeddings from", fname
        vocab_emb = np.asarray(np.load(fname), dtype=theano.config.floatX)
        ndim = vocab_emb.shape[1]
        dummy_word_idx = vocab_emb.shape[0] - 1
        print "Word embedding matrix size:", vocab_emb.shape, type(vocab_emb), vocab_emb.dtype
        print 'dummy word index:', dummy_word_idx
    elif input_type == 'embed':
        ndim = x_train.shape[2]
        assert ndim == vocab_embedding_size, \
            'n_dim {} should be the same as emb_size {}'.format(ndim, vocab_embedding_size)

    if input_type == 'index':
        x = T.lmatrix('x')
    else:
        x = T.dtensor3('x_emb')
    y = T.ivector('y')

    ## Settings
    n_out = n_y_class if n_y_class > 2 else 1
    max_norm = 0
    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'dropout_rate', dropout_rate
    print 'max_norm', max_norm
    print 'n_out', n_out
    print 'filter_widths', filter_widths
    
    reg_str = 'L2emb{}L2conv{}L2linear{}'.format(args.L2_embedding,
                                                 args.L2_conv, args.L2_linear)
    
    setting_str = 'filter={filter};n_f={n_f};activation={activation};' \
                  'emb_size={emb_size};emb_type={emb_type};reg={reg};' \
                  ''.format(filter=args.filter, n_f=args.n_kernels,
                            activation=args.activation, 
                            emb_size=args.vocab_embedding_size, 
                            emb_type=args.vocab_embedding_type,
                            reg=reg_str)
    ts_str = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir_pattern = ('../../output/{data}/{setting};time={time}')
    nnet_outdir = nnet_outdir_pattern.format(data=data_name, 
                                             setting=setting_str, time=ts_str)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)

    ## Conv layer.
    activation = T.tanh if activation_str == 'tanh' else T.nnet.relu
    k_max = 1
    num_input_channels = 1
    # not all of the following 3 layers are used: 
    if input_type == 'index':
        lookup_table_static = nn_layers.LookupTableFastStatic(
            W=vocab_emb, pad=max(filter_widths)-1)
        lookup_table_nonstatic = nn_layers.LookupTableFast(
            W=vocab_emb, pad=max(filter_widths)-1, borrow=False)
    elif input_type == 'embed':
        lookup_table_static = nn_layers.EmbeddingInput(
        pad=max(filter_widths)-1)
    # This is the input shape to the conv layer, not the first layer.
    input_shape = (batch_size, num_input_channels,
                   x_max_sent_size + 2*(max(filter_widths)-1), ndim)
    tconv_layers = []
    for i_width, filter_width in enumerate(filter_widths):
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(
            rng=np_rng, filter_shape=filter_shape, input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(
            b_size=filter_shape[0], activation=activation)
        conv_static = nn_layers.FeedForwardNet(layers=[conv, non_linearity])
        if vocab_embedding_type == 'nonstatic':
            conv_nonstatic = nn_layers.FeedForwardNet(layers=[conv, 
                                                              non_linearity])
        else:
            conv_nonstatic = nn_layers.CopiedLayer(conv_static)
        if i_width == 0:
            tc_static = nn_layers.FeedForwardNet(
                layers=[lookup_table_static, conv_static])
            if input_type  == 'index':
                tc_nonstatic =  nn_layers.FeedForwardNet(
                    layers=[lookup_table_nonstatic, conv_nonstatic])
        else:
            tc_static = nn_layers.FeedForwardNet(
                layers=[nn_layers.CopiedLayer(lookup_table_static), 
                        conv_static])
            if input_type  == 'index':
                tc_nonstatic =  nn_layers.FeedForwardNet(
                    layers=[nn_layers.CopiedLayer(lookup_table_nonstatic), 
                            conv_nonstatic])
        if vocab_embedding_type == 'both':
            tc_multichannel = nn_layers.SumMergeLayer(
                layers=[tc_static, tc_nonstatic])
        elif vocab_embedding_type == 'static':
            tc_multichannel = tc_static
        elif vocab_embedding_type == 'nonstatic':
            tc_multichannel = tc_nonstatic
        pooling = nn_layers.KMaxPoolLayer(k_max=k_max)
        tconv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[tc_multichannel, pooling])
        tconv_layers.append(tconv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=tconv_layers)
    flatten_layer = nn_layers.FlattenLayer()
    nnet = nn_layers.FeedForwardNet(
        layers=[join_layer,
                flatten_layer,
                ])
    nnet.set_input(x)

    logistic_n_in = nkernels * len(filter_widths) * k_max
    dropout = nn_layers.DropoutLayer(rng=np_rng, p=dropout_rate)
    dropout.set_input(nnet.output)
    if n_out > 2:
        classifier = nn_layers.LogisticRegression(n_in=logistic_n_in, 
                                                  n_out=n_out)
    else:
        classifier = nn_layers.BinaryLogisticRegression(n_in=logistic_n_in)
    classifier.set_input(dropout.output)

    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet, dropout, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    print 'train_nnet:\n{}'.format(train_nnet)

    params = train_nnet.params

    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet], 
                 open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    with open(os.path.join(nnet_outdir, 'model_str.txt'), 'w') as f:
        f.write(str(train_nnet))
    total_params = sum([np.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:]
    second_last_features = test_nnet.layers[-3].output
    
    ## Add L_2 regularization
    print "Regularizing nnet weights: ",
    for w in train_nnet.weights:
        if w.name.startswith('W_emb'):
            L2_reg_w = L2_embedding
        elif w.name.startswith('W_conv1d'):
            L2_reg_w = L2_conv
        elif w.name.startswith('W_softmax'):
            L2_reg_w = L2_linear
        elif w.name == 'W':
            L2_reg_w = 0.
        print '{}:{}, '.format(w.name, L2_reg_w),
        cost += T.sum(w**2) * L2_reg_w
    print ''

    if input_type == 'index':
        batch_x = T.lmatrix('batch_x')
    elif input_type == 'embed':
        batch_x = T.dtensor3('batch_x_emb')
    batch_y = T.ivector('batch_y')

    updates = sgd_trainer.get_adadelta_updates(cost, params, 
                                               rho=0.95, eps=1e-6, 
                                               max_norm=max_norm, 
                                               word_vec_name='W_emb')
    inputs_pred = [batch_x,]
    givens_pred = {x: batch_x,}
    inputs_train = [batch_x,
                    batch_y,]
    givens_train = {x: batch_x,
                    y: batch_y,}

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)
    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)
    features_fn = theano.function(inputs=inputs_pred,
                                  outputs=second_last_features,
                                  givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = np.concatenate(
            [pred_fn(batch_data[0]) for batch_data in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = np.concatenate(
            [pred_prob_fn(batch_data[0]) for batch_data in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def get_features_batch(batch_iterator):
        features = np.concatenate(
            [features_fn(batch_data[0]) for batch_data in batch_iterator])
        return features[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        np_rng, [x_train, y_train], batch_size=batch_size, randomize=True)
    train_set_iterator_eval = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        np_rng, [x_train, y_train], batch_size=batch_size, randomize=False)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        np_rng, [x_dev, y_dev], batch_size=batch_size, randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        np_rng, [x_test, y_test], batch_size=batch_size, randomize=False)

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], 
            updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]
            )

    best_dev_auc = -np.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    best_params = [np.copy(p.get_value(borrow=True)) for p in params]
    for i, p in enumerate(best_params):
        print i, p.shape,
        print best_params[i].sum()
    while epoch < n_epochs:
        timer = time.time()
        for i, (x, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x, y)
    
            # Make sure the null word embedding always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()
        
            if i % 10 == 0 or i == num_train_batches:
              y_pred_dev = predict_prob_batch(dev_set_iterator)
              print y_dev_foreval.shape, y_pred_dev.shape
              dev_auc = metrics.roc_auc_score(y_dev_foreval, y_pred_dev) * 100
              if dev_auc > best_dev_auc:
                y_pred = predict_prob_batch(test_set_iterator)
                test_auc = metrics.roc_auc_score(y_test_foreval, y_pred) * 100
                print ('epoch: {} batch: {} dev auc: {:.4f}; '
                       'best_dev_auc: {:.4f}; test_auc: {:.4f}'
                       .format(epoch, i, dev_auc, best_dev_auc, test_auc))
                best_dev_auc = dev_auc
                best_params_pre = best_params
                best_params = [
                    np.copy(p.get_value(borrow=True)) for p in params]
                no_best_dev_update = 0
                for i, p in enumerate(best_params):
                    print i,p.shape,'\t\t\t',
                    print np.array_equal(best_params[i],best_params_pre[i]),
                    print '\t\t\t',
                    print best_params[i].sum()
                print
        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set",
            print no_best_dev_update
            break
        print ('epoch {} took {:.4f} seconds'
               .format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)


    y_pred_train = predict_batch(train_set_iterator_eval)
    y_pred_prob_train = predict_prob_batch(train_set_iterator_eval)
    y_pred_dev = predict_batch(dev_set_iterator)
    y_pred_prob_dev = predict_prob_batch(dev_set_iterator)
    y_pred_test = predict_batch(test_set_iterator)
    y_pred_prob_test = predict_prob_batch(test_set_iterator)
    
    print 'Train:'
    print 'acc is:', metrics.accuracy_score(y_train, y_pred_train)
    print 'auc is:', metrics.roc_auc_score(y_train_foreval, y_pred_prob_train)
    print 'prc is:', metrics.average_precision_score(y_train_foreval, y_pred_prob_train)
    print 'maxf1 is:', maxf1(y_train_foreval, y_pred_prob_train)
    print 'prec @ 10/20/30:', topKPrecision(y_train_foreval, y_pred_prob_train)
    
    print 'Valid:'
    print 'acc is:', metrics.accuracy_score(y_dev, y_pred_dev),
    print 'auc is:', metrics.roc_auc_score(y_dev_foreval, y_pred_prob_dev)
    print 'prc is:', metrics.average_precision_score(y_dev_foreval, y_pred_prob_dev)
    print 'maxf1 is:', maxf1(y_dev_foreval, y_pred_prob_dev)
    print 'prec @ 10/20/30:', topKPrecision(y_dev_foreval, y_pred_prob_dev)
    
    print 'Test:'
    test_acc = metrics.accuracy_score(y_test, y_pred_test)
    test_auc = metrics.roc_auc_score(y_test_foreval, y_pred_prob_test)
    test_prc = metrics.average_precision_score(y_test_foreval, y_pred_prob_test)
    test_maxf1 = maxf1(y_test_foreval, y_pred_prob_test)
    test_prec = topKPrecision(y_test_foreval, y_pred_prob_test)
    
    print 'acc is:', test_acc
    print 'auc is:', test_auc
    print 'prc is:', test_prc
    print 'maxf1 is:', test_maxf1
    print 'prec @ 10/20/30:', test_prec
    
    with open('../../output/summary.txt', 'a') as f:
        f.write(data_name + '\t' + setting_str + '\t')
        f.write('\t'.join([str(a) for a in \
            [test_acc, test_auc, test_prc, test_maxf1, test_prec]]))
        f.write('\n')
    
    fname = os.path.join(nnet_outdir,
                         ('best_dev_params.epoch={:02d};batch={:05d};'
                          'dev_auc={:.2f}.dat'
                          .format(epoch, i, best_dev_auc)))
    cPickle.dump(best_params, open(fname, 'wb'), 
                 protocol=cPickle.HIGHEST_PROTOCOL)
    pred_txt_name_suffix = ('.epoch={:02d};batch={:05d};'
                            'dev_auc={:.2f}.predictions.txt'
                            .format(epoch, i, best_dev_auc))
    np.savetxt(os.path.join(nnet_outdir, 'train' + pred_txt_name_suffix),
               y_pred_prob_train)
    np.savetxt(os.path.join(nnet_outdir, 'valid' + pred_txt_name_suffix),
               y_pred_prob_dev)
    np.savetxt(os.path.join(nnet_outdir, 'test' + pred_txt_name_suffix),
               y_pred_prob_test)
               
    if save_nn_features:
        y_features_train = get_features_batch(train_set_iterator_eval)
        y_features_dev = get_features_batch(dev_set_iterator)
        y_features_test = get_features_batch(test_set_iterator)
        np.save(os.path.join(nnet_outdir, 'cnn_features_train.npy'), y_features_train)
        np.save(os.path.join(nnet_outdir, 'cnn_features_dev.npy'), y_features_dev)
        np.save(os.path.join(nnet_outdir, 'cnn_features_test.npy'), y_features_test)

    N = len(y_pred_test)
    df_submission = pd.DataFrame(
        index=np.arange(N), 
        columns=['docno', 'label','pred'] + \
                ['p' + str(i+1) for i in xrange(n_out)])
    df_submission['docno'] = np.arange(N)
    df_submission['label'] = y_test
    df_submission['pred'] = y_pred_test
    if n_out > 1:
        for i in xrange(n_out):
            df_submission['p' + str(i+1)] = y_pred_prob_test[:, i]
    else:
        df_submission['p1'] = y_pred_prob_test

    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), 
                         header=True, index=True, sep=' ')
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission1.txt'), 
                         header=False, index=False, sep=' ')
    print nnet_outdir
    print vocab_emb.shape

    print 'epoch', epoch
# Example #7
# 0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    #mode = 'TRAIN_DATA'
    #mode = 'TRAIN_NO_OVERLAP'
    #if len(sys.argv) > 1:
    #    mode = sys.argv[1]
    #    if not mode in ['TRAIN', 'TRAIN-ALL']:
    #        print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
    #        sys.exit(1)

    mode = 'k_time_data1'.upper()

    print "Running training in the {} setting".format(mode)

    position_num = 10
    select_model = "PSCM"
    if select_model == "PSCM":
        click_model_index = 4  #PSCM
    elif select_model == "UBM":
        click_model_index = 1
    else:
        raise "MODEL SELECT ERROR!"
    data_dir = mode

    add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy'))
    q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
    a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
    y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy'))
    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy'))
    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    #multi dim

    #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0]
    #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0]
    #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0]

    #y_train = y_train_tmp
    #y_dev = y_dev_tmp
    #y_test = y_test_tmp

    max_query_id = numpy.max([
        numpy.max(add_train[:, 0]),
        numpy.max(add_test[:, 0]),
        numpy.max(add_dev[:, 0])
    ])
    max_url_id = numpy.max([
        numpy.max(add_train[:, 1:]),
        numpy.max(add_test[:, 1:]),
        numpy.max(add_dev[:, 1:])
    ])

    print 'max_query_id', max_query_id
    print 'max_url_id', max_url_id

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[2]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    #ndim = 5
    #print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    #dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #print "Gaussian"
    #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    #x_q_overlap = T.lmatrix('q_overlap')
    #x_a = T.lmatrix('a')
    x_a_all = T.ltensor3('a_all')
    #x_a_overlap = T.lmatrix('a_overlap')
    #y = T.ivector('y')
    y = T.imatrix('y')
    add_info = T.dmatrix('add_info')

    #######
    n_outs = 2

    n_epochs = 15
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
    lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    #nnet_q.set_input((x_q, x_q_overlap))
    nnet_q.set_input([x_q])
    ######

    ###### ANSWER ######
    nnet_a_list = []
    #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    for i in xrange(position_num):
        #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
        #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

        # num_input_channels = len(lookup_table.layers)
        #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        input_shape = (batch_size, num_input_channels,
                       a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        conv_layers = []
        for filter_width in a_filter_widths:
            filter_shape = (nkernels, num_input_channels, filter_width, ndim)
            conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                         filter_shape=filter_shape,
                                         input_shape=input_shape)
            non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                        activation=activation)
            pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
            conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
                layers=[conv, non_linearity, pooling])
            conv_layers.append(conv2dNonLinearMaxPool)

        join_layer = nn_layers.ParallelLayer(layers=conv_layers)
        flatten_layer = nn_layers.FlattenLayer()

        nnet_a = nn_layers.FeedForwardNet(layers=[
            lookup_table,
            join_layer,
            flatten_layer,
        ])
        #nnet_a.set_input((x_a, x_a_overlap))
        nnet_a.set_input([x_a_all[:, i, :]])
        nnet_a_list.append(nnet_a)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    print "q_logistic_n_in, ", q_logistic_n_in
    print "a_logistic_n_in, ", a_logistic_n_in

    #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(
        q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num)
    pairwise_out_list = [nnet_q.output]
    for i in xrange(position_num):
        pairwise_out_list.append(nnet_a_list[i].output)
    pairwise_layer.set_input(pairwise_out_list)
    #pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num
    #n_in = 1 * position_num + position_num * (position_num - 1) / 2
    n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (
        position_num - 1) / 2
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    #classifier.set_input(hidden_layer.output)

    classifier = nn_layers.FeatureClickModelLayer(
        n_in=n_in,
        n_out=n_outs,
        max_q_id=max_query_id,
        max_u_id=max_url_id,
        dim=position_num,
        click_model_index=click_model_index)
    #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num)
    #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num)
    #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs)
    classifier.set_input([hidden_layer.output, add_info])

    #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
    #                                      name="Training nnet")
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q] + nnet_a_list +
        [pairwise_layer, hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    #print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        select_model, mode, ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    #total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    #print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred

    #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2]
    predictions_prob = test_nnet.layers[-1].p_y_given_x

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    #batch_x_a = T.lmatrix('batch_x_a')
    batch_x_a_all = T.ltensor3('batch_x_a_all')
    #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    #batch_y = T.ivector('batch_y')
    batch_y = T.imatrix('batch_y')
    batch_add_info = T.dmatrix('batch_add_info')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a_all,
        batch_add_info,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        add_info: batch_add_info,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a_all,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
        batch_add_info,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        add_info: batch_add_info,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.
                          n_samples], inner_outputs[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.
                          n_samples], inner_outputs[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_train, a_train, add_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, add_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, add_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test[:, -1]))
    print 'labels', labels

    def perplexity_score(labels, preds):
        positionPerplexity = [0.0] * position_num
        positionPerplexityClickSkip = [[0.0, 0.0]
                                       for i in xrange(position_num)]
        counts = [0] * position_num
        countsClickSkip = [[0, 0] for i in xrange(position_num)]
        for label, pred in zip(labels, preds):
            for i in range(0, len(label)):
                click = 1 if label[i] else 0
                tmp_pred = max(min(pred[i], 0.99999), 0.00001)
                logProb = math.log(tmp_pred, 2)
                if click == 0:
                    logProb = math.log(1 - tmp_pred, 2)
                positionPerplexity[i] += logProb
                positionPerplexityClickSkip[i][click] += logProb
                counts[i] += 1
                countsClickSkip[i][click] += 1
        positionPerplexity = [
            2**(-x / count if count else x)
            for (x, count) in zip(positionPerplexity, counts)
        ]
        positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \
                for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)]
        perplexity = sum(positionPerplexity) / len(positionPerplexity)
        ret_str = "---------\n"
        ret_str += "Perplexity\t" + str(perplexity) + "\n"
        ret_str += "positionPerplexity"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexity[i])
        ret_str += "\n"

        ret_str += "positionPerplexitySkip"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[0][i])
        ret_str += "\n"

        ret_str += "positionPerplexityClick"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[1][i])
        ret_str += "\n------------\n"
        #print ret_str
        return perplexity, ret_str

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    best_dev_perp = numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, add, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator)
                #print "shape:"
                #print str(y_dev.shape)
                #print str(y_pred_dev.shape)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev[:, -1],
                                                y_pred_dev[:, -1]) * 100
                dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev)
                if dev_acc > best_dev_acc:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc

                if dev_perp < best_dev_perp:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    test_perplexity, test_perplexity_str = perplexity_score(
                        y_test, y_pred)
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc,
                                dev_perp, best_dev_perp))
                    print str(test_perplexity_str)
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    best_inner = y_inner
                    no_best_dev_update = 0
                    best_dev_perp = dev_perp
        if no_best_dev_update >= 3:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

        numpy.savetxt(
            os.path.join(
                nnet_outdir,
                'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'
                .format(epoch, i, best_dev_perp)), best_inner)
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100
    test_perp, test_perp_str = perplexity_score(y_test, y_pred_test)
    print "FINAL ACCURACY"
    print str(test_acc)
    print "FINAL PERPLEXITY"
    print str(test_perp_str)
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred_test)
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'
            .format(epoch, i, best_dev_acc)), best_inner)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
def main():
    data_dir = "parsed_tweets"
    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = 140

    # Load word2vec embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_final.npy'
    fname_wordembeddings = os.path.join(data_dir, embedding_fname)

    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb = numpy.load(fname_wordembeddings)
    print type(vocab_emb[0][0])
    print "Word embedding matrix size:", vocab_emb.shape

    #Load hasthag embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_topn.npy'
    fname_htembeddings = os.path.join(data_dir, embedding_fname)
    print "Loading word embeddings from", fname_htembeddings
    vocab_emb_ht = numpy.load(fname_htembeddings)
    print type(vocab_emb_ht[0][0])
    print "Word embedding matrix size:", vocab_emb_ht.shape

    print 'Load Test Set'
    dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.tweets.npy')
    y_dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.hashtags.npy')

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y_train')

    #######
    n_outs = 100
    batch_size = 1000
    max_norm = 0

    print 'batch_size', batch_size
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    def relu(x):
        return x * (x > 0)

    activation = relu
    nkernels1 = 1000
    k_max = 1
    num_input_channels = 1
    filter_width1 = 4
    n_in = nkernels1 * k_max

    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (filter_width1 - 1), ndim)

    ##########
    # LAYERS #
    #########
    parameter_map = {}
    parameter_map['nKernels1'] = nkernels1
    parameter_map['num_input_channels'] = num_input_channels
    parameter_map['ndim'] = ndim
    parameter_map['inputShape'] = input_shape
    parameter_map['activation'] = 'relu'
    parameter_map['n_in'] = n_in
    parameter_map['kmax'] = k_max

    parameter_map['filterWidth'] = filter_width1

    lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,
                                                         pad=filter_width1 - 1)

    parameter_map['LookupTableFastStaticW'] = lookup_table_words.W

    filter_shape = (nkernels1, num_input_channels, filter_width1, ndim)

    parameter_map['FilterShape' + str(filter_width1)] = filter_shape

    conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    parameter_map['Conv2dLayerW' + str(filter_width1)] = conv.W

    non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                activation=activation)

    parameter_map['NonLinearityLayerB' + str(filter_width1)] = non_linearity.b

    pooling = nn_layers.KMaxPoolLayer(k_max=k_max)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])

    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    parameter_map['LinearLayerW'] = hidden_layer.W
    parameter_map['LinearLayerB'] = hidden_layer.b

    classifier = nn_layers.Training(numpy_rng, W=None, shape=(102, nkernels1))
    #classifier = nn_layers.LogisticRegression(n_in=n_in,n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, conv2dNonLinearMaxPool, flatten_layer,
        hidden_layer, classifier
    ])

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    ################
    # TRAIN  MODEL #
    ###############

    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    params = nnet_tweets.params
    print params

    mrg_rng = MRG_RandomStreams()
    i = mrg_rng.uniform(size=(batch_size, vocab_emb_ht.shape[0]),
                        low=0.0,
                        high=1.0,
                        dtype=theano.config.floatX).argsort(axis=1)

    cost = nnet_tweets.layers[-1].training_cost(y, i)
    predictions = nnet_tweets.layers[-1].y_pred
    predictions_prob = nnet_tweets.layers[-1].f

    #cost = nnet_tweets.layers[-1].training_cost(y)
    #predictions = nnet_tweets.layers[-1].y_pred
    #predictions_prob = nnet_tweets.layers[-1].p_y_given_x[:, -1]

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}

    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='None')

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_prob_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 25
    early_stop = 3
    best_dev_acc = -numpy.inf
    no_best_dev_update = 0
    timer_train = time.time()
    done = False
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    while epoch < n_epochs and not done:
        max_chunks = numpy.inf
        curr_chunks = 0
        timer = time.time()
        fname_tweet = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.tweets.npy'),
            'rb')
        fname_sentiments = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.hashtags.npy'),
            'rb')
        while curr_chunks < max_chunks:
            train_set, y_train_set, chunks = get_next_chunk(fname_tweet,
                                                            fname_sentiments,
                                                            n_chunks=2)
            curr_chunks += chunks
            if train_set is None:
                break

            print "Length trains_set:", len(train_set)
            print "Length dev_set:", len(dev_set)
            print "Length y_trains_set:", len(y_train_set)
            print "Length y_dev_set:", len(y_dev_set)

            train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [train_set, y_train_set],
                batch_size=batch_size,
                randomize=True)

            dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [dev_set], batch_size=batch_size, randomize=False)

            for i, (tweet,
                    y_label) in enumerate(tqdm(train_set_iterator, ascii=True),
                                          1):
                train_fn(tweet, y_label)

            # Make sure the null word in the word embeddings always remains zero
            zerout_dummy_word()

            y_pred_dev = predict_prob_batch(dev_set_iterator)
            dev_acc = precision_at_k(y_dev_set, y_pred_dev, k=1) * 100
            #dev_acc = metrics.accuracy_score(y_dev_set,y_pred_dev)

            if dev_acc > best_dev_acc:
                print(
                    'epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                    .format(epoch, curr_chunks, dev_acc, best_dev_acc))
                best_dev_acc = dev_acc
                no_best_dev_update = 0
            else:
                print(
                    'epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                    .format(epoch, curr_chunks, dev_acc, best_dev_acc))
            cPickle.dump(
                parameter_map,
                open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        cPickle.dump(
            parameter_map,
            open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))

        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break
        no_best_dev_update += 1
        epoch += 1
        fname_tweet.close()
        fname_sentiments.close()

    cPickle.dump(parameter_map,
                 open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))