def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = 'train'
    if len(sys.argv) > 1:
        mode = sys.argv[1]
        if not mode in ['TRAIN', 'TRAIN-ALL']:
            print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
            sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    # Load the vectorized datasets (word-index matrices).
    if mode in ['TRAIN-ALL']:
        q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train-all.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy'))
    else:
        q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
        a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
        q_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.q_overlap_indices.npy'))
        a_overlap_train = numpy.load(
            os.path.join(data_dir, 'train.a_overlap_indices.npy'))
        y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_* hold the hand-crafted word-overlap features.
    x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)
    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape
    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape
    # print 'a_overlap_train', a_overlap_train.shape
    # print 'x_train', x_train.shape
    # print 'x_dev', x_dev.shape
    # print 'x_test', x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy.random.RandomState()
    # Fix the seed so that the same conditions always produce the same random
    # numbers; this avoids debugging problems caused by run-to-run randomness.
    numpy_rng = numpy.random.RandomState(123)
    # Maximum question length.
    q_max_sent_size = q_train.shape[1]
    # Maximum answer length.
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    # numpy.max without an axis argument returns the single largest value in the array.
    dummy_word_id = numpy.max(a_overlap_train)
    print "dummy_word_id:", dummy_word_id
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    # Draw a (dummy_word_id + 1, ndim) matrix from a standard normal distribution.
    # This looks like the initialization for the unknown/dummy overlap entries.
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    # [-1] refers to the last row of the matrix.
    # vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')
    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    # x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2
    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    # ndim = vocab_emb.shape[1]
    print "1st conv layer dim:", ndim

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    # Number of feature maps.
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    # First map word indices to their embeddings.
    # Question: why are these two lookup layers needed when the embedding
    # matrices are already loaded?  Presumably to turn each concrete sentence
    # (a row of indices) into its matrix of word vectors.
    # Question: how exactly is the padding implemented?
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) - 1)
    # Question: what is this layer for?  Probably to fetch the overlap-indicator
    # vectors for each concrete sentence pair.
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])
    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])

    # Text input, so a single channel.
    num_input_channels = 1
    # The row count is q_max_sent_size + 2 * (max(q_filter_widths) - 1) because
    # the lookup tables pad both ends of every sentence; the column count is
    # ndim, the width of the concatenated word + overlap embeddings.
    # Minibatch of feature map stacks, of shape (batch size, stack size,
    # nb rows, nb cols); see the optional parameter image_shape.
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    print "convolution layer input shape:", input_shape

    conv_layers = []
    # Build one convolutional branch per filter width.
    # Question: do the layers use different W initialization schemes, and how
    # do the possible schemes compare?
    for filter_width in q_filter_widths:
        # Construction of a single convolutional branch.
        # Set of filters used in the CNN layer, of shape
        # (nb filters, stack size, nb rows, nb cols).
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        # 2D convolution is used here.
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()
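    # --- Editor's illustration (not part of the original script) -------------
    # Shape bookkeeping for the question branch, assuming the defaults above
    # (q_filter_widths=[5], nkernels=100, q_k_max=1): the lookup tables pad
    # each sentence with max(q_filter_widths) - 1 dummy tokens on both sides,
    # and k-max pooling keeps q_k_max values per feature map, so the flattened
    # question representation has nkernels * len(q_filter_widths) * q_k_max
    # dimensions (100 here).  The throwaway values below just spell that out.
    _padded_q_rows = q_max_sent_size + 2 * (max(q_filter_widths) - 1)
    _q_sentence_vec_dim = nkernels * len(q_filter_widths) * q_k_max
    # --------------------------------------------------------------------------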
    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    nnet_q.set_input((x_q, x_q_overlap))
    # nnet_q.set_input(x_q)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths)-1)
    lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,
                                                   pad=max(q_filter_widths) - 1)
    lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                     pad=max(q_filter_widths) - 1)
    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_overlap])
    # lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

    # num_input_channels = len(lookup_table.layers)
    input_shape = (batch_size, num_input_channels,
                   a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    # Question: why is there a flatten layer here?
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    # Question: what is x_a_overlap used for here?
    nnet_a.set_input((x_a, x_a_overlap))
    # nnet_a.set_input(x_a)
    #######

    # print 'nnet_q.output', nnet_q.output.ndim

    # Question: what are these for?  (They are the dimensionalities of the
    # flattened question and answer representations.)
    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
    # dropout_q.set_input(nnet_q.output)
    # dropout_a.set_input(nnet_a.output)

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)
    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                            a_in=a_logistic_n_in,
    #                                                            n_in=feats_nout,
    #                                                            n_out=n_outs)
    # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=q_logistic_n_in,
    #                                                                 a_in=a_logistic_n_in,
    #                                                                 n_in=feats_nout,
    #                                                                 n_out=n_outs,
    #                                                                 W=classifier.W,
    #                                                                 W_feats=classifier.W_feats,
    #                                                                 b=classifier.b)
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                      name="Test nnet")
    #########

    # This should be the sentence-matching (pairwise) layer.
    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
    # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
    # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
    pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
                                                    a_in=a_logistic_n_in)
    pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in has to be computed differently depending on which pairwise layer
    # variant is used above.
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    # dropout2
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, dropout_q, dropout_a, classifier],
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={}_batch={}_max_norm={}_learning_rate={}_{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    # Serialize (pickle) the Python network objects to a local file.
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    # Training loss: cross-entropy.
    cost = train_nnet.layers[-1].training_cost(y)

    ########################################
    # Question: this looks odd -- it appears to be an alternative,
    # class-weighted way of computing the cost.
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)
    ########################################################

    # Class with the highest softmax probability.
    predictions = test_nnet.layers[-1].y_pred
    # Softmax probability of the last (positive) class.
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]
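    # --- Editor's sketch (illustrative, not part of the original) ------------
    # p_y_given_x is the softmax output of the final LogisticRegression layer,
    # so predictions_prob keeps column -1: with n_outs = 2 this is the
    # probability of the "relevant answer" class, later used as the ranking
    # score.  A tiny pure-Python sketch of that score for one candidate, with
    # made-up logits:
    def _example_positive_class_prob(logits=(0.2, 1.3)):
        import math
        exps = [math.exp(v) for v in logits]
        return exps[-1] / sum(exps)  # softmax probability of the last class
    # --------------------------------------------------------------------------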
    # ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #     L2_reg = 0.
    #     if w.name.startswith('W_emb'):
    #         L2_reg = L2_word_emb
    #     elif w.name.startswith('W_conv1d'):
    #         L2_reg = L2_conv1d
    #     elif w.name.startswith('W_softmax'):
    #         L2_reg = L2_softmax
    #     elif w.name == 'W':
    #         L2_reg = L2_softmax
    #     print w.name, L2_reg
    #     cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # Optimization scheme used for training.
    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    # Commenting batch_x in or out toggles whether the external overlap_feat
    # features are used for training.
    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        # batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        y: batch_y
    }

    # Training function.
    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)
    # Predicted class per candidate answer.
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)
    # Probability score for each candidate answer.
    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_batch(batch_iterator):
        # numpy.hstack: stack arrays in sequence horizontally (column-wise).
        # This is equivalent to concatenation along the second axis, except for
        # 1-D arrays, where it concatenates along the first axis.
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap)
            for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _
            in batch_iterator
        ])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator])
        # preds = numpy.hstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x) for
        #                       batch_x_q, batch_x_a, batch_x, _ in batch_iterator])
        return preds[:batch_iterator.n_samples]

    # The three mini-batch iterators.
    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, y_train],
        batch_size=batch_size, randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev],
        batch_size=batch_size, randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_test, a_test, q_overlap_test, a_overlap_test, y_test],
        batch_size=batch_size, randomize=False)

    ####
    # train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #     [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train],
    #     batch_size=batch_size, randomize=True)
    # dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #     [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev],
    #     batch_size=batch_size, randomize=False)
    # test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #     [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test],
    #     batch_size=batch_size, randomize=False)
    #####
    # train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train, x_train, y_train],
    #     batch_size=batch_size, randomize=True)
    # dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #     [q_dev, a_dev, x_dev, y_dev],
    #     batch_size=batch_size, randomize=False)
    # test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,
    #     [q_test, a_test, x_test, y_test],
    #     batch_size=batch_size, randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap, y) in enumerate(
                tqdm(train_set_iterator), 1):
            # for i, (x_q, x_a, x, y) in enumerate(tqdm(train_set_iterator), 1):
            # train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y)
            # train_fn(x_q, x_a, x, y)
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                # Compute the Area Under the Receiver Operating Characteristic
                # Curve (ROC AUC) from the prediction scores.
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= 3:
            print "Quitting after {} epochs with no update of the best score on the dev set".format(
                no_best_dev_update)
            break

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.format(
                epoch, i, best_dev_acc)), y_pred)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = numpy.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False, index=False, sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = numpy.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False, index=False, sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
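# --- Editor's example (illustrative, not part of the original script) --------
# map_score() above computes Mean Average Precision: per question, candidates
# are sorted by descending model score and precision is accumulated at every
# rank that holds a relevant answer (the 1e-6 term only guards against
# questions with no relevant candidate).  The helper below reproduces that
# arithmetic on a hand-made example so the metric is easy to verify; it is not
# used by training.
def _example_average_precision():
    # One question with four candidates: (model score, gold label).
    candidates = [(0.9, 1), (0.8, 0), (0.4, 1), (0.1, 0)]
    hits, average_prec = 0, 0.0
    for rank, (score, label) in enumerate(sorted(candidates, reverse=True), 1):
        if label > 0:
            hits += 1
            average_prec += float(hits) / rank  # precision at this rank
    # Relevant items sit at ranks 1 and 3, so AP = (1/1 + 2/3) / 2 = 0.8333...
    return average_prec / hits
# ------------------------------------------------------------------------------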
# Standalone excerpt: construction of the question sub-network, following the
# same pattern used inside main() above.
lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,
                                                     pad=max(q_filter_widths) - 1)
lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                 pad=max(q_filter_widths) - 1)
lookup_table = nn_layers.ParallelLookupTable(
    layers=[lookup_table_words, lookup_table_overlap])

num_input_channels = 1
input_shape = (batch_size, num_input_channels,
               q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

conv_layers = []
# Each conv branch consists of a 2D convolution, a non-linearity and a
# k-max pooling layer.
for filter_width in q_filter_widths:
    filter_shape = (nkernels, num_input_channels, filter_width, ndim)
    conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)
    non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                activation=activation)
    pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])
    conv_layers.append(conv2dNonLinearMaxPool)

join_layer = nn_layers.ParallelLayer(layers=conv_layers)
flatten_layer = nn_layers.FlattenLayer()

nnet_q = nn_layers.FeedForwardNet(layers=[
    lookup_table,
    join_layer,
    flatten_layer,
])
nnet_q.set_input((x_q, x_q_overlap))
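# --- Editor's sketch (illustrative; padding convention is an assumption) -----
# The question sub-network consumes two int64 matrices per mini-batch, matching
# the T.lmatrix declarations: word indices into vocab_emb and overlap indices
# into vocab_emb_overlap, each of shape (batch_size, q_max_sent_size).  The
# helper below builds dummy inputs of the right dtype and shape, assuming short
# sentences are padded with the last (dummy) row of each embedding matrix, the
# same row that ZEROUT_DUMMY_WORD keeps at zero.
def _dummy_question_batch(batch_size, sent_len, vocab_emb, vocab_emb_overlap):
    import numpy
    dummy_word = vocab_emb.shape[0] - 1
    dummy_overlap = vocab_emb_overlap.shape[0] - 1
    x_q = numpy.full((batch_size, sent_len), dummy_word, dtype='int64')
    x_q_overlap = numpy.full((batch_size, sent_len), dummy_overlap, dtype='int64')
    return x_q, x_q_overlap
# ------------------------------------------------------------------------------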
def main(): # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' mode = sys.argv[1] """ if len(sys.argv) > 1: mode = sys.argv[1] if not mode in ['TRAIN', 'TRAIN-ALL']: print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']" sys.exit(1) """ print "Running training in the {} setting".format(mode) data_dir = mode if mode in ['TRAIN-ALL']: q_train = numpy.load(os.path.join(data_dir, 'train-all.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train-all.answers.npy')) q_overlap_train = numpy.load( os.path.join(data_dir, 'train-all.q_overlap_indices.npy')) a_overlap_train = numpy.load( os.path.join(data_dir, 'train-all.a_overlap_indices.npy')) y_train = numpy.load(os.path.join(data_dir, 'train-all.labels.npy')) else: q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy')) q_overlap_train = numpy.load(os.path.join(data_dir, 'train.q_sim.npy')) a_overlap_train = numpy.load(os.path.join(data_dir, 'train.a_sim.npy')) y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy')) q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy')) a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy')) q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_sim.npy')) a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_sim.npy')) y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy')) qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy')) q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy')) a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy')) q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_sim.npy')) a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_sim.npy')) y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy')) qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy')) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) # feats_ndim = x_train.shape[1] # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # print "Scaling overlap features" # x_train = scaler.fit_transform(x_train) # x_dev = scaler.transform(x_dev) # x_test = scaler.transform(x_test) print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[1] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) ndim = 5 print "Generating random vocabulary for word overlap indicator features with dim:", ndim dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) print "Gaussian" 
vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) vocab_emb_overlap[-1] = 0 # Load word2vec embeddings fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy') print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') x_q_overlap = T.lmatrix('q_overlap') x_a = T.lmatrix('a') x_a_overlap = T.lmatrix('a_overlap') y = T.ivector('y') ####### n_outs = 2 n_epochs = 25 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = 0.5 nkernels = 100 q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] ###### QUESTION ###### lookup_table_words = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(q_filter_widths) - 1) lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) lookup_table = nn_layers.ParallelLookupTable( layers=[lookup_table_words, lookup_table_overlap]) num_input_channels = 1 input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) nnet_q.set_input((x_q, x_q_overlap)) ###### ###### ANSWER ###### lookup_table_words = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(q_filter_widths) - 1) lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) lookup_table = nn_layers.ParallelLookupTable( layers=[lookup_table_words, lookup_table_overlap]) # num_input_channels = len(lookup_table.layers) input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) nnet_a.set_input((x_a, x_a_overlap)) ####### # print 'nnet_q.output', 
nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max # dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng) # dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng) # dropout_q.set_input(nnet_q.output) # dropout_a.set_input(nnet_a.output) # feats_nout = 10 # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation) # x_hidden_layer.set_input(x) # feats_nout = feats_ndim ### Dropout # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs) # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output)) # classifier.set_input((dropout_q.output, dropout_a.output, x)) # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier], # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier], # name="Training nnet") # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs, # W=classifier.W, # W_feats=classifier.W_feats, # b=classifier.b) # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output)) # test_classifier.set_input((nnet_q.output, nnet_a.output, x)) # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier], # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier], # name="Test nnet") ######### # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in, pairwise_layer = nn_layers.PairwiseNoFeatsLayer( q_in=q_logistic_n_in, # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in, # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in) pairwise_layer.set_input((nnet_q.output, nnet_a.output)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 n_in = q_logistic_n_in + a_logistic_n_in + 1 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) classifier.set_input(hidden_layer.output) train_nnet = nn_layers.FeedForwardNet( layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format( ndim, batch_size, max_norm, learning_rate, ts) nnet_outdir = os.path.join(data_dir, nnet_outdir) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) total_params = sum([numpy.prod(param.shape.eval()) for param in params]) print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, 
return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1] ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. # if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg # batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') batch_x_a = T.lmatrix('batch_x_a') batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') batch_y = T.ivector('batch_y') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, # batch_x, ] givens_pred = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, # x: batch_x } inputs_train = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, # batch_x, batch_y, ] givens_train = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, # x: batch_x, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train) pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred) pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred) def predict_batch(batch_iterator): preds = numpy.hstack([ pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): preds = numpy.hstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, q_overlap_train, a_overlap_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test)) print 'labels', labels def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, 
reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # print weights_dev best_dev_acc = -numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, x_q_overlap, x_a_overlap, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, x_q_overlap, x_a_overlap, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev = predict_prob_batch(dev_set_iterator) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100 if dev_acc > best_dev_acc: y_pred = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred) * 100 print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] no_best_dev_update = 0 if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", no_best_dev_update break print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred_test) * 100 fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. format(epoch, i, best_dev_acc)), y_pred) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) print "Running trec_eval script..." N = len(y_pred_test) df_submission = pd.DataFrame( index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id']) df_submission['qid'] = qids_test df_submission['iter'] = 0 df_submission['docno'] = numpy.arange(N) df_submission['rank'] = 0 df_submission['sim'] = y_pred_test df_submission['run_id'] = 'nnet' df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ') df_gold = pd.DataFrame(index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rel']) df_gold['qid'] = qids_test df_gold['iter'] = 0 df_gold['docno'] = numpy.arange(N) df_gold['rel'] = y_test df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ') subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
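# --- Editor's note (illustrative, derived from the DataFrame code above) -----
# run_eval.sh is fed the two whitespace-separated files written above: each
# line of submission.txt carries "<qid> 0 <docno> 0 <sim> nnet" and each line
# of gold.txt carries "<qid> 0 <docno> <rel>", i.e. the usual TREC run/qrels
# layout.  A minimal sketch of one row of each, with made-up values:
def _example_trec_eval_rows(qid=1, docno=0, sim=0.87, rel=1):
    submission_row = '{} 0 {} 0 {} nnet'.format(qid, docno, sim)
    gold_row = '{} 0 {} {}'.format(qid, docno, rel)
    return submission_row, gold_row
# ------------------------------------------------------------------------------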
def main(): parser = argparse.ArgumentParser() parser.add_argument('-a', choices=['abcnn1', 'abcnn2']) parser.add_argument('--similarity', choices=['euclidean', 'cosine']) parser.add_argument('--no-features', action='store_true', help='do not use external features') parser.add_argument('--l2svm', action='store_true', help='use L2-SVM as the classifier') parser.add_argument('--dropout', choices=['gaussian', 'mc']) parser.add_argument('--dropout-rate', type=float, help='dropout rate (default: %(default)s)') parser.add_argument('--nkernels', type=int, help='number of kernels (default: %(default)s)') parser.add_argument('--early-stop', metavar='N', type=int, help='stop if seeing no improvements in N epochs') parser.add_argument('-e', choices=['GoogleNews', 'aquaint+wiki'], help='word embeddings file to use') parser.add_argument('mode') parser.set_defaults(early_stop=3, e='GoogleNews', dropout_rate=0.5, nkernels=100) args = parser.parse_args() # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' mode = args.mode if mode not in ['TRAIN', 'TRAIN-ALL', 'WIKIQA-TRAIN'] + [ 'WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5) ]: print "ERROR! mode '{}' is invalid".format(mode) sys.exit(1) print "Running training in the {} setting".format(mode) data_dir = mode def load_numpy_data(data_dir, prefix): filetypes = [ 'questions', 'answers', 'q_overlap_indices', 'a_overlap_indices', 'labels', 'qids', 'aids' ] filenames = [ '{}.{}.npy'.format(prefix, filetype) for filetype in filetypes ] return [ numpy.load(os.path.join(data_dir, filename)) for filename in filenames ] if mode in ['TRAIN-ALL', 'TRAIN']: prefix = mode.lower() q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data( data_dir, prefix) q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data( data_dir, 'dev') q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data( data_dir, 'test') x_train = numpy.load( os.path.join(data_dir, '{}.overlap_feats.npy'.format(prefix))) x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) elif mode in ['WIKIQA-TRAIN']: q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data( data_dir, 'WikiQA-train') q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data( data_dir, 'WikiQA-dev-filtered') q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data( data_dir, 'WikiQA-test-filtered') x_train = numpy.load( os.path.join(data_dir, 'WikiQA-train.overlap_feats.npy')) x_dev = numpy.load( os.path.join(data_dir, 'WikiQA-dev-filtered.overlap_feats.npy')) x_test = numpy.load( os.path.join(data_dir, 'WikiQA-test-filtered.overlap_feats.npy')) elif mode in ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)]: fn = ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)].index(mode) + 1 q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data( data_dir, 'WebAP-fold{}-train'.format(fn)) q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data( data_dir, 'WebAP-fold{}-dev'.format(fn)) q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data( data_dir, 'WebAP-fold{}-test'.format(fn)) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = 
numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) feats_ndim = x_train.shape[1] from sklearn.preprocessing import StandardScaler scaler = StandardScaler(copy=True) print "Scaling features" x_train = scaler.fit_transform(x_train) x_dev = scaler.transform(x_dev) x_test = scaler.transform(x_test) print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape print 'x_train', x_train.shape print 'x_dev', x_dev.shape print 'x_test', x_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[1] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) ndim = 5 print "Generating random vocabulary for word overlap indicator features with dim:", ndim dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) print "Gaussian" vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) vocab_emb_overlap[-1] = 0 # Load word2vec embeddings if args.e in ['GoogleNews']: fname = os.path.join(data_dir, 'emb_GoogleNews-vectors-negative300.bin.npy') elif args.e in ['aquaint+wiki']: fname = os.path.join(data_dir, 'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy') else: print 'No such embedding file: {}'.format(args.e) sys.exit(1) print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') x_q_overlap = T.lmatrix('q_overlap') x_a = T.lmatrix('a') x_a_overlap = T.lmatrix('a_overlap') y = T.ivector('y') ####### n_outs = 2 n_epochs = 25 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. 
ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = args.dropout_rate nkernels = args.nkernels q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] # Lookup layers lookup_table_q = nn_layers.ParallelLookupTable(layers=[ nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1), nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) ]) lookup_table_q.set_input((x_q, x_q_overlap)) lookup_table_a = nn_layers.ParallelLookupTable(layers=[ nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(a_filter_widths) - 1), nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(a_filter_widths) - 1) ]) lookup_table_a.set_input((x_a, x_a_overlap)) # NOTE: these seemingly mismatched shapes are actually correct if args.a in ['abcnn1']: attention = AttentionTransformLayer( similarity=args.similarity, rng=numpy_rng, W_q_shape=(a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim), W_a_shape=(q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)) num_input_channels = 2 elif args.a in ['abcnn2']: attention = AttentionWeightingLayer(similarity=args.similarity) num_input_channels = 1 else: attention = None num_input_channels = 1 if attention is not None: attention.set_input((lookup_table_q.output, lookup_table_a.output)) input0, input1 = attention.output else: input0, input1 = lookup_table_q.output, lookup_table_a.output input_shape_q = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) input_shape_a = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) ###### QUESTION ###### # lookup_table_words = nn_layers.LookupTableFastStatic( # W=vocab_emb, pad=max(q_filter_widths) - 1) # lookup_table_overlap = nn_layers.LookupTableFast( # W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) # lookup_table = nn_layers.ParallelLookupTable( # layers=[lookup_table_words, lookup_table_overlap]) # input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * # (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape_q) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer]) nnet_q.set_input(input0) ###### ###### ANSWER ###### # lookup_table_words = nn_layers.LookupTableFastStatic( # W=vocab_emb, pad=max(q_filter_widths) - 1) # lookup_table_overlap = nn_layers.LookupTableFast( # W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) # lookup_table = nn_layers.ParallelLookupTable( # layers=[lookup_table_words, lookup_table_overlap]) # num_input_channels = len(lookup_table.layers) # input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * # (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape_a) non_linearity = 
nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer]) nnet_a.set_input(input1) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max if args.dropout: if args.dropout == 'gaussian': dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng) dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng) elif args.dropout == 'mc': dropout_q = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate) dropout_a = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate) dropout_q.set_input(nnet_q.output) dropout_a.set_input(nnet_a.output) # feats_nout = 10 # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation) # x_hidden_layer.set_input(x) # feats_nout = feats_ndim ### Dropout # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs) # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output)) # classifier.set_input((dropout_q.output, dropout_a.output, x)) # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier], # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier], # name="Training nnet") # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs, # W=classifier.W, # W_feats=classifier.W_feats, # b=classifier.b) # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output)) # test_classifier.set_input((nnet_q.output, nnet_a.output, x)) # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier], # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier], # name="Test nnet") ######### # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in, # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in, # a_in=a_logistic_n_in, # n_in=feats_ndim) # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in, # pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in, # a_in=a_logistic_n_in) # pairwise_layer.set_input((nnet_q.output, nnet_a.output)) if args.no_features: pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in) n_in = q_logistic_n_in + a_logistic_n_in + 1 if args.dropout: pairwise_layer.set_input((dropout_q.output, dropout_a.output)) else: pairwise_layer.set_input((nnet_q.output, nnet_a.output)) else: pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in, n_in=feats_ndim) n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 if args.dropout: pairwise_layer.set_input((dropout_q.output, dropout_a.output, x)) else: pairwise_layer.set_input((nnet_q.output, nnet_a.output, x)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 
1 # n_in = q_logistic_n_in + a_logistic_n_in + 1 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) if args.l2svm: classifier = nn_layers.L2SVM(n_in=n_in, n_out=n_outs) else: classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) classifier.set_input(hidden_layer.output) all_layers = [] if args.a: all_layers.append(attention) all_layers.extend([nnet_q, nnet_a]) if args.dropout: all_layers.extend([dropout_q, dropout_a]) all_layers.extend([pairwise_layer, hidden_layer, classifier]) train_nnet = nn_layers.FeedForwardNet( layers=all_layers, # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format( ndim, batch_size, max_norm, learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) total_params = sum([numpy.prod(param.shape.eval()) for param in params]) print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1] ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. 
# if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') batch_x_a = T.lmatrix('batch_x_a') batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') batch_y = T.ivector('batch_y') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, ] givens_pred = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, x: batch_x } inputs_train = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, batch_y, ] givens_train = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, x: batch_x, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): preds = numpy.hstack([ pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): preds = numpy.hstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test)) print 'labels', labels def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # 
weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # print weights_dev best_dev_acc = -numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, x_q_overlap, x_a_overlap, x, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev = predict_prob_batch(dev_set_iterator) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100 if dev_acc > best_dev_acc: y_pred = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred) * 100 print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] no_best_dev_update = 0 if no_best_dev_update >= args.early_stop: print "Quitting after of no update of the best score on dev set", no_best_dev_update break print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred_test) * 100 fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. format(epoch, i, best_dev_acc)), y_pred) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) print "Running trec_eval script..." N = len(y_pred_test) df_submission = pd.DataFrame( index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id']) df_submission['qid'] = qids_test df_submission['iter'] = 0 df_submission['docno'] = aids_test df_submission['rank'] = 0 df_submission['sim'] = y_pred_test df_submission['run_id'] = 'nnet' df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ') df_gold = pd.DataFrame(index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rel']) df_gold['qid'] = qids_test df_gold['iter'] = 0 df_gold['docno'] = aids_test df_gold['rel'] = y_test df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ') subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True) print 'results saved to directory {}'.format(nnet_outdir)
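# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the MAP metric as
# computed by map_score() above. For each query the candidates are ranked by
# predicted score, precision is accumulated at every rank that holds a
# relevant answer (label > 0), and the sum is divided by the number of
# relevant answers; MAP is the mean of this value over all queries. The helper
# below is a standalone per-query version, for reference only.
def average_precision_sketch(scores, labels):
    ranked = sorted(zip(scores, labels), reverse=True)
    hits, ap = 0, 0.0
    for rank, (_, label) in enumerate(ranked, 1):
        if label > 0:
            hits += 1
            ap += float(hits) / rank
    return ap / hits if hits else 0.0
# Example: scores=[0.9, 0.2, 0.7], labels=[1, 0, 1] ranks both relevant
# answers first, so AP = (1/1 + 2/2) / 2 = 1.0.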
def main(): ########## # LAYERS # ######### HOME_DIR = "semeval_parsed" input_fname = '200M' test_type = '' if len(sys.argv) > 1: test_type = sys.argv[1] data_dir = HOME_DIR + '_' + input_fname numpy_rng = numpy.random.RandomState(123) print "Load Parameters" parameter_map = cPickle.load( open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb')) input_shape = parameter_map['inputShape'] filter_width = parameter_map['filterWidth'] n_in = parameter_map['qLogisticIn'] k_max = parameter_map['kmax'] def relu(x): return x * (x > 0) activation = relu tweets = T.imatrix('tweets_train') y = T.lvector('y') batch_tweets = T.imatrix('batch_x_q') batch_y = T.lvector('batch_y') lookup_table_words = nn_layers.LookupTableFast( W=parameter_map['LookupTableFastStaticW'].get_value(), pad=filter_width - 1) filter_shape = parameter_map['FilterShape' + str(filter_width)] conv_layers = [] conv = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW' + str(filter_width)], rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer( b=parameter_map['NonLinearityLayerB' + str(filter_width)], b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'], b=parameter_map['LinearLayerB'], rng=numpy_rng, n_in=n_in, n_out=n_in, activation=activation) n_outs = 3 classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) nnet_tweets = nn_layers.FeedForwardNet(layers=[ lookup_table_words, join_layer, flatten_layer, hidden_layer, classifier ]) inputs_train = [batch_tweets, batch_y] givens_train = {tweets: batch_tweets, y: batch_y} inputs_pred = [ batch_tweets, ] givens_pred = { tweets: batch_tweets, } nnet_tweets.set_input(tweets) print nnet_tweets params = nnet_tweets.params cost = nnet_tweets.layers[-1].training_cost(y) predictions = nnet_tweets.layers[-1].y_pred updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=0, word_vec_name='None') train_fn = theano.function( inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, ) pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred) def predict_batch(batch_iterator): preds = numpy.hstack( [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator]) return preds[:batch_iterator.n_samples] ####### #Names# ####### test_2016n = 'Test 2016' test_2015n = 'Test 2015' test_2014n = 'Test 2014' test_2013n = 'Test 2013' test_2014ljn = 'Test 2014 LiveJournal' test_2014srcn = 'Test 2014 Sarcasm' test_2013_smsn = 'Test 2013 SMS' ep_pred = {} ep_pred[test_2016n] = [] ep_pred[test_2015n] = [] ep_pred[test_2014n] = [] ep_pred[test_2013n] = [] ep_pred[test_2014ljn] = [] ep_pred[test_2014srcn] = [] ep_pred[test_2013_smsn] = [] ####################### # Supervised Learining# ###################### batch_size = 1000 training2013_tids = numpy.load( os.path.join(data_dir, 'task-B-train.20140221.tids.npy')) training2013_tweets = numpy.load( os.path.join(data_dir, 'task-B-train.20140221.tweets.npy')) training2013_sentiments = numpy.load( os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy')) dev_2013_tids = numpy.load( os.path.join(data_dir, 'task-B-dev.20140225.tids.npy')) dev_2013_tweets = numpy.load( 
os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy')) dev_2013_sentiments = numpy.load( os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy')) trainingA_2016_tids = numpy.load( os.path.join(data_dir, 'task-A-train-2016.tids.npy')) trainingA_2016_tweets = numpy.load( os.path.join(data_dir, 'task-A-train-2016.tweets.npy')) trainingA_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-A-train-2016.sentiments.npy')) devA_2016_tids = numpy.load( os.path.join(data_dir, 'task-A-dev-2016.tids.npy')) devA_2016_tweets = numpy.load( os.path.join(data_dir, 'task-A-dev-2016.tweets.npy')) devA_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy')) devtestA_2016_tids = numpy.load( os.path.join(data_dir, 'task-A-devtest-2016.tids.npy')) devtestA_2016_tweets = numpy.load( os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy')) devtestA_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy')) test_2016_tids = numpy.load( os.path.join(data_dir, 'task-A-test2016.tids.npy')) test_2016_tweets = numpy.load( os.path.join(data_dir, 'task-A-test2016.tweets.npy')) test_2016_sentiments = numpy.load( os.path.join(data_dir, 'task-A-test2016.sentiments.npy')) test_2013_tids = numpy.load( os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy')) test_2013_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy')) test_2013_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy')) test_2014_tids = numpy.load( os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy')) test_2014_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy')) test_2014_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy')) test_2015_tids = numpy.load( os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy')) test_2015_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy')) test_2015_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy')) test_2013_sms_tids = numpy.load( os.path.join(data_dir, 'task-B-test2013-sms.tids.npy')) test_2013_sms_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy')) test_2013_sms_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy')) test_2014_livejournal_tids = numpy.load( os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy')) test_2014_livejournal_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy')) test_2014_livejournal_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy')) test_2014_sarcasm_tids = numpy.load( os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy')) test_2014_sarcasm_tweets = numpy.load( os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy')) test_2014_sarcasm_sentiments = numpy.load( os.path.join(data_dir, 'task-B-test2014-twittersarcasm.sentiments.npy')) training_full_tweets = numpy.concatenate( (training2013_tweets, dev_2013_tweets), axis=0) training_full_tweets = numpy.concatenate( (training_full_tweets, trainingA_2016_tweets), axis=0) training_full_tweets = numpy.concatenate( (training_full_tweets, devA_2016_tweets), axis=0) training_full_tweets = numpy.concatenate( (training_full_tweets, devtestA_2016_tweets), axis=0) training_full_sentiments = numpy.concatenate( (training2013_sentiments, dev_2013_sentiments), axis=0) training_full_sentiments = numpy.concatenate( 
(training_full_sentiments, trainingA_2016_sentiments), axis=0) training_full_sentiments = numpy.concatenate( (training_full_sentiments, devA_2016_sentiments), axis=0) training_full_sentiments = numpy.concatenate( (training_full_sentiments, devtestA_2016_sentiments), axis=0) train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [training_full_tweets, training_full_sentiments], batch_size=batch_size, randomize=True) test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False) dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False) devtestA2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [devtestA_2016_tweets], batch_size=batch_size, randomize=False) train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [trainingA_2016_tweets], batch_size=batch_size, randomize=False) test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False) test2013_itarator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False) test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False) test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2014_sarcasm_tweets], batch_size=batch_size, randomize=False) train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [training2013_tweets], batch_size=batch_size, randomize=False) dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False) test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2013_sms_tweets], batch_size=batch_size, randomize=False) test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [test_2014_livejournal_tweets], batch_size=batch_size, randomize=False) W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) epoch = 0 n_epochs = 50 early_stop = 50 check_freq = 4 timer_train = time.time() no_best_dev_update = 0 best_dev_acc = -numpy.inf num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1): train_fn(tweet, y_label) if i % check_freq == 0 or i == num_train_batches: y_pred_dev_2015 = predict_batch(test_2015_iterator) #y_pred_train_2013 = predict_batch(train2013_iterator) #y_pred_train_2016 = predict_batch(train2016_iterator) #y_pred_dev2016 = predict_batch(dev2016_iterator) #y_pred_dev2013 = predict_batch(dev_2013_iterator) y_pred_test_2016 = predict_batch(test2016_iterator) y_pred_test_2014 = predict_batch(test_2014_iterator) y_pred_test_2013 = predict_batch(test2013_itarator) y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator) y_pred_test_livejournal_2014 = predict_batch( test_2014_livejournal_iterator) y_pred_test_sarcasm_2014 = predict_batch( test_2014_sarcasm_iterator) #y_pred_devtest_2016 = predict_batch(devtestA2016_iterator) dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments, y_pred_dev_2015) dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments, y_pred_test_2014) 
dev_acc_2014_lj = semeval_f1_taskA( test_2014_livejournal_sentiments, y_pred_test_livejournal_2014) dev_acc_2014_srcs = semeval_f1_taskA( test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014) dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments, y_pred_test_2013) dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments, y_pred_test_sms_2013) dev_acc_2016 = semeval_f1_taskA(test_2016_sentiments, y_pred_test_2016) ep_pred[test_2016n].append(dev_acc_2016) ep_pred[test_2015n].append(dev_acc_2015) ep_pred[test_2014n].append(dev_acc_2014) ep_pred[test_2013n].append(dev_acc_2013) ep_pred[test_2014ljn].append(dev_acc_2014_lj) ep_pred[test_2014srcn].append(dev_acc_2014_srcs) ep_pred[test_2013_smsn].append(dev_acc_2013_sms) if dev_acc_2016 > best_dev_acc: best_dev_acc = dev_acc_2016 best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] no_best_dev_update = 0 print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2016)) print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2015)) print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2014)) print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2013)) print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2014_lj)) print( '2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2014_srcs)) print( '2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'. format(epoch, i, dev_acc_2013_sms)) #print('devtest2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2016_devtest)) zerout_dummy_word() print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 if no_best_dev_update >= early_stop: print "Quitting after of no update of the best score on dev set", no_best_dev_update break print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) cPickle.dump( ep_pred, open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb')) return ####################### # Get Sentence Vectors# ###################### batch_size = input_shape[0] inputs_senvec = [batch_tweets] givents_senvec = { tweets: batch_tweets, } output = nnet_tweets.layers[-2].output output_fn = function(inputs=inputs_senvec, outputs=output, givens=givents_senvec) sets = [(test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'), (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'), (training2013_tids, training2013_tweets, 'task-BD-train-2013'), (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'), (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'), (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'), (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'), (test_2016_tids, test_2016_tweets, 'SemEval2016-task4-test.subtask-A'), (test_2014_sarcasm_tids, test_2014_sarcasm_tweets, 'test_2014_sarcasm'), (test_2014_livejournal_tids, test_2014_livejournal_tweets, 'task-B-test2014-livejournal'), (test_2013_tids, test_2013_tweets, 'task-BD-train-2013'), (dev_2013_tids, dev_2013_tweets, 'task-BD-dev-2013')] for (fids, fset, name) in sets: test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [fset], batch_size=batch_size, randomize=False) counter = 0 fname = open( os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+') for i, tweet in enumerate(tqdm(test_set_iterator), 1): o 
= output_fn(tweet[0]) for vec in o: fname.write(fids[counter]) for el in numpy.nditer(vec): fname.write(" %f" % el) fname.write("\n") counter += 1 if counter == test_set_iterator.n_samples: break ############################## # Get Prediction Probabilities# ############################# batch_size = input_shape[0] output = nnet_tweets.layers[-1].p_y_given_x output_fn = function(inputs=inputs_senvec, outputs=output, givens=givents_senvec) for (fids, fset, name) in sets: test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [fset], batch_size=batch_size, randomize=False) counter = 0 fname = open( os.path.join(data_dir, 'prob_predictions/{}.txt'.format(name)), 'w+') for i, tweet in enumerate(tqdm(test_set_iterator), 1): o = output_fn(tweet[0]) for vec in o: for el in numpy.nditer(vec): fname.write(" %f" % el) fname.write("\n") counter += 1 if counter == test_set_iterator.n_samples: break
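# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the scores above come
# from semeval_f1_taskA(), which is defined elsewhere in this repo. Assuming
# it follows the official SemEval Task A convention, the score is the average
# of the F1 of the positive class and the F1 of the negative class (the
# neutral class is left out of the average). A minimal equivalent using
# scikit-learn, with a hypothetical label encoding of
# 0 = negative, 1 = neutral, 2 = positive:
from sklearn.metrics import f1_score

def semeval_f1_sketch(y_true, y_pred, pos_label=2, neg_label=0):
    f1_pos = f1_score(y_true, y_pred, labels=[pos_label], average='macro')
    f1_neg = f1_score(y_true, y_pred, labels=[neg_label], average='macro')
    return 0.5 * (f1_pos + f1_neg)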
def main(): ZEROUT_DUMMY_WORD = True arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--data', type=unicode, help='E.g.: pre, pre_hf, etc.') arg_parser.add_argument('--filter', '-f', type=unicode, default='3,4,5', help='E.g.: 3 or 3,4,5') arg_parser.add_argument('--n_kernels', type=int, default=100, help='# of kernels (filters)') arg_parser.add_argument('--n_epochs', type=int, default=25) arg_parser.add_argument('--batch_size', type=int, default=50) arg_parser.add_argument('--dropout_rate', type=float, default=0.5) arg_parser.add_argument('--vocab_embedding_type', type=unicode, default='both', help='both/static/nonstatic') arg_parser.add_argument('--vocab_embedding_size', type=int, default='50', help='50/200/500/800') arg_parser.add_argument('--L2_embedding', type=float, default=0.) arg_parser.add_argument('--L2_conv', type=float, default=0.) arg_parser.add_argument('--L2_linear', type=float, default=0.) arg_parser.add_argument('--activation', type=unicode, default='tanh') arg_parser.add_argument('--input_type', type=unicode, default='index', help='E.g.: index (use a lookup layer), embed.') arg_parser.set_defaults(save_nn_features=False) arg_parser.add_argument('--save_features', dest='save_nn_features', action='store_true', help='Save outputs from second last layer') args = arg_parser.parse_args() data_name = args.data filter_widths = [int(a) for a in args.filter.split(',')] n_epochs = args.n_epochs batch_size = args.batch_size dropout_rate = args.dropout_rate vocab_embedding_type = args.vocab_embedding_type vocab_embedding_size = args.vocab_embedding_size L2_embedding = args.L2_embedding L2_conv = args.L2_conv L2_linear = args.L2_linear nkernels = args.n_kernels activation_str = args.activation input_type = args.input_type save_nn_features = args.save_nn_features ## Load data data_dir = '../../data/{}'.format(data_name) embedding_dir = '../../data/word2vec' if input_type == 'index': x_train = np.load(os.path.join(data_dir, 'train_input.npy')) x_dev = np.load(os.path.join(data_dir, 'valid_input.npy')) x_test = np.load(os.path.join(data_dir, 'test_input.npy')) elif input_type == 'embed': x_train = np.load(os.path.join( data_dir, 'train_embed_{}.npy'.format(vocab_embedding_size))) x_dev = np.load(os.path.join( data_dir, 'valid_embed_{}.npy'.format(vocab_embedding_size))) x_test = np.load(os.path.join( data_dir, 'test_embed_{}.npy'.format(vocab_embedding_size))) y_train = np.load(os.path.join(data_dir, 'train_label.npy')) y_dev = np.load(os.path.join(data_dir, 'valid_label.npy')) y_test = np.load(os.path.join(data_dir, 'test_label.npy')) y_candidates = np.unique(np.concatenate((y_train, y_dev, y_test))) n_y_class = len(y_candidates) # for multi class label, from (0, 1, 3, 7, ..) to (0, 1, 2, 3, ...) 
y_train = np.array([np.where(y_candidates==yy)[0][0] for yy in y_train], dtype='int32') y_dev = np.array([np.where(y_candidates==yy)[0][0] for yy in y_dev], dtype='int32') y_test = np.array([np.where(y_candidates==yy)[0][0] for yy in y_test], dtype='int32') if n_y_class > 2: y_train_foreval = np.zeros([len(y_train), n_y_class]) y_train_foreval[np.arange(len(y_train)), y_train] = 1 y_dev_foreval = np.zeros([len(y_dev), n_y_class]) y_dev_foreval[np.arange(len(y_dev)), y_dev] = 1 y_test_foreval = np.zeros([len(y_test), n_y_class]) y_test_foreval[np.arange(len(y_test)), y_test] = 1 else: y_train_foreval = np.array(y_train > 0, dtype=int) y_dev_foreval = np.array(y_dev > 0, dtype=int) y_test_foreval = np.array(y_test > 0, dtype=int) print 'y_train', np.unique(y_train, return_counts=True), print 'y_dev', np.unique(y_dev, return_counts=True) print 'y_test', np.unique(y_test, return_counts=True) print 'x_train', x_train.shape, x_train.dtype, theano.config.floatX print 'x_dev', x_dev.shape print 'x_test', x_test.shape np_rng = np.random.RandomState() x_max_sent_size = x_train.shape[1] if input_type == 'index': ## Load word2vec embeddings fname = os.path.join(embedding_dir, 'word2vec_{}.npy'.format(vocab_embedding_size)) print "Loading word embeddings from", fname vocab_emb = np.asarray(np.load(fname), dtype=theano.config.floatX) ndim = vocab_emb.shape[1] dummy_word_idx = vocab_emb.shape[0] - 1 print "Word embedding matrix size:", vocab_emb.shape, type(vocab_emb), vocab_emb.dtype print 'dummy word index:', dummy_word_idx elif input_type == 'embed': ndim = x_train.shape[2] assert ndim == vocab_embedding_size, \ 'n_dim {} should be the same as emb_size {}'.format(ndim, vocab_embedding_size) if input_type == 'index': x = T.lmatrix('x') else: x = T.dtensor3('x_emb') y = T.ivector('y') ## Settings n_out = n_y_class if n_y_class > 2 else 1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'dropout_rate', dropout_rate print 'max_norm', max_norm print 'n_out', n_out print 'filter_widths', filter_widths reg_str = 'L2emb{}L2conv{}L2linear{}'.format(args.L2_embedding, args.L2_conv, args.L2_linear) setting_str = 'filter={filter};n_f={n_f};activation={activation};' \ 'emb_size={emb_size};emb_type={emb_type};reg={reg};' \ ''.format(filter=args.filter, n_f=args.n_kernels, activation=args.activation, emb_size=args.vocab_embedding_size, emb_type=args.vocab_embedding_type, reg=reg_str) ts_str = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir_pattern = ('../../output/{data}/{setting};time={time}') nnet_outdir = nnet_outdir_pattern.format(data=data_name, setting=setting_str, time=ts_str) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) ## Conv layer. activation = T.tanh if activation_str == 'tanh' else T.nnet.relu k_max = 1 num_input_channels = 1 # not all of the following 3 layers are used: if input_type == 'index': lookup_table_static = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(filter_widths)-1) lookup_table_nonstatic = nn_layers.LookupTableFast( W=vocab_emb, pad=max(filter_widths)-1, borrow=False) elif input_type == 'embed': lookup_table_static = nn_layers.EmbeddingInput( pad=max(filter_widths)-1) # This is the input shape to the conv layer, not the first layer. 
input_shape = (batch_size, num_input_channels, x_max_sent_size + 2*(max(filter_widths)-1), ndim) tconv_layers = [] for i_width, filter_width in enumerate(filter_widths): filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer( rng=np_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer( b_size=filter_shape[0], activation=activation) conv_static = nn_layers.FeedForwardNet(layers=[conv, non_linearity]) if vocab_embedding_type == 'nonstatic': conv_nonstatic = nn_layers.FeedForwardNet(layers=[conv, non_linearity]) else: conv_nonstatic = nn_layers.CopiedLayer(conv_static) if i_width == 0: tc_static = nn_layers.FeedForwardNet( layers=[lookup_table_static, conv_static]) if input_type == 'index': tc_nonstatic = nn_layers.FeedForwardNet( layers=[lookup_table_nonstatic, conv_nonstatic]) else: tc_static = nn_layers.FeedForwardNet( layers=[nn_layers.CopiedLayer(lookup_table_static), conv_static]) if input_type == 'index': tc_nonstatic = nn_layers.FeedForwardNet( layers=[nn_layers.CopiedLayer(lookup_table_nonstatic), conv_nonstatic]) if vocab_embedding_type == 'both': tc_multichannel = nn_layers.SumMergeLayer( layers=[tc_static, tc_nonstatic]) elif vocab_embedding_type == 'static': tc_multichannel = tc_static elif vocab_embedding_type == 'nonstatic': tc_multichannel = tc_nonstatic pooling = nn_layers.KMaxPoolLayer(k_max=k_max) tconv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[tc_multichannel, pooling]) tconv_layers.append(tconv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=tconv_layers) flatten_layer = nn_layers.FlattenLayer() nnet = nn_layers.FeedForwardNet( layers=[join_layer, flatten_layer, ]) nnet.set_input(x) logistic_n_in = nkernels * len(filter_widths) * k_max dropout = nn_layers.DropoutLayer(rng=np_rng, p=dropout_rate) dropout.set_input(nnet.output) if n_out > 2: classifier = nn_layers.LogisticRegression(n_in=logistic_n_in, n_out=n_out) else: classifier = nn_layers.BinaryLogisticRegression(n_in=logistic_n_in) classifier.set_input(dropout.output) train_nnet = nn_layers.FeedForwardNet( layers=[nnet, dropout, classifier], name="Training nnet") test_nnet = train_nnet print 'train_nnet:\n{}'.format(train_nnet) params = train_nnet.params nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) with open(os.path.join(nnet_outdir, 'model_str.txt'), 'w') as f: f.write(str(train_nnet)) total_params = sum([np.prod(param.shape.eval()) for param in params]) print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) predictions = test_nnet.layers[-1].y_pred predictions_prob = test_nnet.layers[-1].p_y_given_x[:] second_last_features = test_nnet.layers[-3].output ## Add L_2 regularization print "Regularizing nnet weights: ", for w in train_nnet.weights: if w.name.startswith('W_emb'): L2_reg_w = L2_embedding elif w.name.startswith('W_conv1d'): L2_reg_w = L2_conv elif w.name.startswith('W_softmax'): L2_reg_w = L2_linear elif w.name == 'W': L2_reg_w = 0. 
print '{}:{}, '.format(w.name, L2_reg_w), cost += T.sum(w**2) * L2_reg_w print '' if input_type == 'index': batch_x = T.lmatrix('batch_x') elif input_type == 'embed': batch_x = T.dtensor3('batch_x_emb') batch_y = T.ivector('batch_y') updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [batch_x,] givens_pred = {x: batch_x,} inputs_train = [batch_x, batch_y,] givens_train = {x: batch_x, y: batch_y,} train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train) pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred) pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred) features_fn = theano.function(inputs=inputs_pred, outputs=second_last_features, givens=givens_pred) def predict_batch(batch_iterator): preds = np.concatenate( [pred_fn(batch_data[0]) for batch_data in batch_iterator]) return preds[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): preds = np.concatenate( [pred_prob_fn(batch_data[0]) for batch_data in batch_iterator]) return preds[:batch_iterator.n_samples] def get_features_batch(batch_iterator): features = np.concatenate( [features_fn(batch_data[0]) for batch_data in batch_iterator]) return features[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( np_rng, [x_train, y_train], batch_size=batch_size, randomize=True) train_set_iterator_eval = sgd_trainer.MiniBatchIteratorConstantBatchSize( np_rng, [x_train, y_train], batch_size=batch_size, randomize=False) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( np_rng, [x_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( np_rng, [x_test, y_test], batch_size=batch_size, randomize=False) print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list] ) best_dev_auc = -np.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) best_params = [np.copy(p.get_value(borrow=True)) for p in params] for i, p in enumerate(best_params): print i, p.shape, print best_params[i].sum() while epoch < n_epochs: timer = time.time() for i, (x, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x, y) # Make sure the null word embedding always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev = predict_prob_batch(dev_set_iterator) print y_dev_foreval.shape, y_pred_dev.shape dev_auc = metrics.roc_auc_score(y_dev_foreval, y_pred_dev) * 100 if dev_auc > best_dev_auc: y_pred = predict_prob_batch(test_set_iterator) test_auc = metrics.roc_auc_score(y_test_foreval, y_pred) * 100 print ('epoch: {} batch: {} dev auc: {:.4f}; ' 'best_dev_auc: {:.4f}; test_auc: {:.4f}' .format(epoch, i, dev_auc, best_dev_auc, test_auc)) best_dev_auc = dev_auc best_params_pre = best_params best_params = [ np.copy(p.get_value(borrow=True)) for p in params] no_best_dev_update = 0 for i, p in enumerate(best_params): print i,p.shape,'\t\t\t', print np.array_equal(best_params[i],best_params_pre[i]), print '\t\t\t', print best_params[i].sum() print if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", print no_best_dev_update break print ('epoch 
{} took {:.4f} seconds' .format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_train = predict_batch(train_set_iterator_eval) y_pred_prob_train = predict_prob_batch(train_set_iterator_eval) y_pred_dev = predict_batch(dev_set_iterator) y_pred_prob_dev = predict_prob_batch(dev_set_iterator) y_pred_test = predict_batch(test_set_iterator) y_pred_prob_test = predict_prob_batch(test_set_iterator) print 'Train:' print 'acc is:', metrics.accuracy_score(y_train, y_pred_train) print 'auc is:', metrics.roc_auc_score(y_train_foreval, y_pred_prob_train) print 'prc is:', metrics.average_precision_score(y_train_foreval, y_pred_prob_train) print 'maxf1 is:', maxf1(y_train_foreval, y_pred_prob_train) print 'prec @ 10/20/30:', topKPrecision(y_train_foreval, y_pred_prob_train) print 'Valid:' print 'acc is:', metrics.accuracy_score(y_dev, y_pred_dev), print 'auc is:', metrics.roc_auc_score(y_dev_foreval, y_pred_prob_dev) print 'prc is:', metrics.average_precision_score(y_dev_foreval, y_pred_prob_dev) print 'maxf1 is:', maxf1(y_dev_foreval, y_pred_prob_dev) print 'prec @ 10/20/30:', topKPrecision(y_dev_foreval, y_pred_prob_dev) print 'Test:' test_acc = metrics.accuracy_score(y_test, y_pred_test) test_auc = metrics.roc_auc_score(y_test_foreval, y_pred_prob_test) test_prc = metrics.average_precision_score(y_test_foreval, y_pred_prob_test) test_maxf1 = maxf1(y_test_foreval, y_pred_prob_test) test_prec = topKPrecision(y_test_foreval, y_pred_prob_test) print 'acc is:', test_acc print 'auc is:', test_auc print 'prc is:', test_prc print 'maxf1 is:', test_maxf1 print 'prec @ 10/20/30:', test_prec with open('../../output/summary.txt', 'a') as f: f.write(data_name + '\t' + setting_str + '\t') f.write('\t'.join([str(a) for a in \ [test_acc, test_auc, test_prc, test_maxf1, test_prec]])) f.write('\n') fname = os.path.join(nnet_outdir, ('best_dev_params.epoch={:02d};batch={:05d};' 'dev_auc={:.2f}.dat' .format(epoch, i, best_dev_auc))) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) pred_txt_name_suffix = ('.epoch={:02d};batch={:05d};' 'dev_auc={:.2f}.predictions.txt' .format(epoch, i, best_dev_auc)) np.savetxt(os.path.join(nnet_outdir, 'train' + pred_txt_name_suffix), y_pred_prob_train) np.savetxt(os.path.join(nnet_outdir, 'valid' + pred_txt_name_suffix), y_pred_prob_dev) np.savetxt(os.path.join(nnet_outdir, 'test' + pred_txt_name_suffix), y_pred_prob_test) if save_nn_features: y_features_train = get_features_batch(train_set_iterator_eval) y_features_dev = get_features_batch(dev_set_iterator) y_features_test = get_features_batch(test_set_iterator) np.save(os.path.join(nnet_outdir, 'cnn_features_train.npy'), y_features_train) np.save(os.path.join(nnet_outdir, 'cnn_features_dev.npy'), y_features_dev) np.save(os.path.join(nnet_outdir, 'cnn_features_test.npy'), y_features_test) N = len(y_pred_test) df_submission = pd.DataFrame( index=np.arange(N), columns=['docno', 'label','pred'] + \ ['p' + str(i+1) for i in xrange(n_out)]) df_submission['docno'] = np.arange(N) df_submission['label'] = y_test df_submission['pred'] = y_pred_test if n_out > 1: for i in xrange(n_out): df_submission['p' + str(i+1)] = y_pred_prob_test[:, i] else: df_submission['p1'] = y_pred_prob_test df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=True, index=True, sep=' ') df_submission.to_csv(os.path.join(nnet_outdir, 
'submission1.txt'), header=False, index=False, sep=' ') print nnet_outdir # vocab_emb is only defined for the 'index' input type if input_type == 'index': print vocab_emb.shape print 'epoch', epoch
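# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): KMaxPoolLayer(k_max=k)
# used above keeps, for every feature map, the k largest activations along the
# sentence axis (with k_max=1 this reduces to plain max-over-time pooling).
# A numpy version for a single feature map, preserving the original order of
# the selected activations as in the usual k-max pooling formulation:
import numpy as np

def k_max_pool_sketch(activations, k):
    # activations: 1-D array of one feature map over the sentence positions
    idx = np.sort(np.argsort(activations)[-k:])  # positions of the k largest values, left to right
    return activations[idx]
# Example: k_max_pool_sketch(np.array([1., 5., 3., 2.]), 2) -> array([5., 3.])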
def main(): # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' #mode = 'TRAIN_DATA' #mode = 'TRAIN_NO_OVERLAP' #if len(sys.argv) > 1: # mode = sys.argv[1] # if not mode in ['TRAIN', 'TRAIN-ALL']: # print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']" # sys.exit(1) mode = 'k_time_data1'.upper() print "Running training in the {} setting".format(mode) position_num = 10 select_model = "PSCM" if select_model == "PSCM": click_model_index = 4 #PSCM elif select_model == "UBM": click_model_index = 1 else: raise "MODEL SELECT ERROR!" data_dir = mode add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy')) q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy')) y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy')) add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy')) q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy')) a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy')) #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy')) #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy')) y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy')) qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy')) add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy')) q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy')) a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy')) #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy')) #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy')) y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy')) qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy')) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) # feats_ndim = x_train.shape[1] # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # print "Scaling overlap features" # x_train = scaler.fit_transform(x_train) # x_dev = scaler.transform(x_dev) # x_test = scaler.transform(x_test) #multi dim #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0] #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0] #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0] #y_train = y_train_tmp #y_dev = y_dev_tmp #y_test = y_test_tmp max_query_id = numpy.max([ numpy.max(add_train[:, 0]), numpy.max(add_test[:, 0]), numpy.max(add_dev[:, 0]) ]) max_url_id = numpy.max([ numpy.max(add_train[:, 1:]), numpy.max(add_test[:, 1:]), numpy.max(add_dev[:, 1:]) ]) print 'max_query_id', max_query_id print 'max_url_id', max_url_id print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = 
nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[2] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) #ndim = 5 #print "Generating random vocabulary for word overlap indicator features with dim:", ndim #dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #print "Gaussian" #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #vocab_emb_overlap[-1] = 0 # Load word2vec embeddings fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy') print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') #x_q_overlap = T.lmatrix('q_overlap') #x_a = T.lmatrix('a') x_a_all = T.ltensor3('a_all') #x_a_overlap = T.lmatrix('a_overlap') #y = T.ivector('y') y = T.imatrix('y') add_info = T.dmatrix('add_info') ####### n_outs = 2 n_epochs = 15 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ndim = vocab_emb.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = 0.5 nkernels = 100 q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] ###### QUESTION ###### lookup_table_words = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) num_input_channels = 1 input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_q.set_input((x_q, x_q_overlap)) nnet_q.set_input([x_q]) ###### ###### ANSWER ###### nnet_a_list = [] #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) for i in xrange(position_num): #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) # 
num_input_channels = len(lookup_table.layers) #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_a.set_input((x_a, x_a_overlap)) nnet_a.set_input([x_a_all[:, i, :]]) nnet_a_list.append(nnet_a) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max print "q_logistic_n_in, ", q_logistic_n_in print "a_logistic_n_in, ", a_logistic_n_in #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num) pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer( q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num) pairwise_out_list = [nnet_q.output] for i in xrange(position_num): pairwise_out_list.append(nnet_a_list[i].output) pairwise_layer.set_input(pairwise_out_list) #pairwise_layer.set_input((nnet_q.output, nnet_a.output)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num #n_in = 1 * position_num + position_num * (position_num - 1) / 2 n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * ( position_num - 1) / 2 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) #classifier.set_input(hidden_layer.output) classifier = nn_layers.FeatureClickModelLayer( n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num, click_model_index=click_model_index) #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num) #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num) #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs) classifier.set_input([hidden_layer.output, add_info]) #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], # name="Training nnet") train_nnet = nn_layers.FeedForwardNet( layers=[nnet_q] + nnet_a_list + [pairwise_layer, hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### #print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 
'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format( select_model, mode, ndim, batch_size, max_norm, learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) #total_params = sum([numpy.prod(param.shape.eval()) for param in params]) #print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2] predictions_prob = test_nnet.layers[-1].p_y_given_x ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. # if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg # batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') #batch_x_a = T.lmatrix('batch_x_a') batch_x_a_all = T.ltensor3('batch_x_a_all') #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') #batch_y = T.ivector('batch_y') batch_y = T.imatrix('batch_y') batch_add_info = T.dmatrix('batch_add_info') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a_all, batch_add_info, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, ] givens_pred = { x_q: batch_x_q, x_a_all: batch_x_a_all, add_info: batch_add_info, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x } inputs_train = [ batch_x_q, batch_x_a_all, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, batch_add_info, batch_y, ] givens_train = { x_q: batch_x_q, x_a_all: batch_x_a_all, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x, add_info: batch_add_info, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return 
real_preds[:batch_iterator. n_samples], inner_outputs[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return real_preds[:batch_iterator. n_samples], inner_outputs[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, add_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, add_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, add_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test[:, -1])) print 'labels', labels def perplexity_score(labels, preds): positionPerplexity = [0.0] * position_num positionPerplexityClickSkip = [[0.0, 0.0] for i in xrange(position_num)] counts = [0] * position_num countsClickSkip = [[0, 0] for i in xrange(position_num)] for label, pred in zip(labels, preds): for i in range(0, len(label)): click = 1 if label[i] else 0 tmp_pred = max(min(pred[i], 0.99999), 0.00001) logProb = math.log(tmp_pred, 2) if click == 0: logProb = math.log(1 - tmp_pred, 2) positionPerplexity[i] += logProb positionPerplexityClickSkip[i][click] += logProb counts[i] += 1 countsClickSkip[i][click] += 1 positionPerplexity = [ 2**(-x / count if count else x) for (x, count) in zip(positionPerplexity, counts) ] positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \ for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)] perplexity = sum(positionPerplexity) / len(positionPerplexity) ret_str = "---------\n" ret_str += "Perplexity\t" + str(perplexity) + "\n" ret_str += "positionPerplexity" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexity[i]) ret_str += "\n" ret_str += "positionPerplexitySkip" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[0][i]) ret_str += "\n" ret_str += "positionPerplexityClick" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[1][i]) ret_str += "\n------------\n" #print ret_str return perplexity, ret_str def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # 
print weights_dev best_dev_acc = -numpy.inf best_dev_perp = numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, add, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator) #print "shape:" #print str(y_dev.shape) #print str(y_pred_dev.shape) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev[:, -1], y_pred_dev[:, -1]) * 100 dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev) if dev_acc > best_dev_acc: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc if dev_perp < best_dev_perp: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 test_perplexity, test_perplexity_str = perplexity_score( y_test, y_pred) print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc, dev_perp, best_dev_perp)) print str(test_perplexity_str) best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] best_inner = y_inner no_best_dev_update = 0 best_dev_perp = dev_perp if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", no_best_dev_update break numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy' .format(epoch, i, best_dev_perp)), best_inner) print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100 test_perp, test_perp_str = perplexity_score(y_test, y_pred_test) print "FINAL ACCURACY" print str(test_acc) print "FINAL PERPLEXITY" print str(test_perp_str) fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. format(epoch, i, best_dev_acc)), y_pred_test) numpy.savetxt( os.path.join( nnet_outdir, 'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy' .format(epoch, i, best_dev_acc)), best_inner) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
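# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): perplexity_score()
# above follows the usual click-model evaluation. For each result position,
# perplexity = 2 ** (-(1/N) * sum_j log2 p_j), where p_j is the predicted
# probability of the click event that was actually observed (pred if clicked,
# 1 - pred otherwise) and predictions are clipped away from 0 and 1. A
# minimal single-position version, for reference only:
import math

def position_perplexity_sketch(clicks, preds, eps=1e-5):
    log_sum = 0.0
    for click, pred in zip(clicks, preds):
        p = min(max(pred, eps), 1.0 - eps)
        p_observed = p if click else 1.0 - p
        log_sum += math.log(p_observed, 2)
    return 2 ** (-log_sum / len(clicks))
# Example: clicks=[1, 0], preds=[0.5, 0.5] gives perplexity 2.0 (a random guess).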
def main(): data_dir = "parsed_tweets" numpy_rng = numpy.random.RandomState(123) q_max_sent_size = 140 # Load word2vec embeddings embedding_fname = 'emb_smiley_tweets_embedding_final.npy' fname_wordembeddings = os.path.join(data_dir, embedding_fname) print "Loading word embeddings from", fname_wordembeddings vocab_emb = numpy.load(fname_wordembeddings) print type(vocab_emb[0][0]) print "Word embedding matrix size:", vocab_emb.shape #Load hasthag embeddings embedding_fname = 'emb_smiley_tweets_embedding_topn.npy' fname_htembeddings = os.path.join(data_dir, embedding_fname) print "Loading word embeddings from", fname_htembeddings vocab_emb_ht = numpy.load(fname_htembeddings) print type(vocab_emb_ht[0][0]) print "Word embedding matrix size:", vocab_emb_ht.shape print 'Load Test Set' dev_set = numpy.load( 'parsed_tweets/hashtag_top100_smiley_tweets_test.tweets.npy') y_dev_set = numpy.load( 'parsed_tweets/hashtag_top100_smiley_tweets_test.hashtags.npy') tweets = T.imatrix('tweets_train') y = T.lvector('y_train') ####### n_outs = 100 batch_size = 1000 max_norm = 0 print 'batch_size', batch_size print 'max_norm', max_norm ## 1st conv layer. ndim = vocab_emb.shape[1] ### Nonlinearity type def relu(x): return x * (x > 0) activation = relu nkernels1 = 1000 k_max = 1 num_input_channels = 1 filter_width1 = 4 n_in = nkernels1 * k_max input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (filter_width1 - 1), ndim) ########## # LAYERS # ######### parameter_map = {} parameter_map['nKernels1'] = nkernels1 parameter_map['num_input_channels'] = num_input_channels parameter_map['ndim'] = ndim parameter_map['inputShape'] = input_shape parameter_map['activation'] = 'relu' parameter_map['n_in'] = n_in parameter_map['kmax'] = k_max parameter_map['filterWidth'] = filter_width1 lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=filter_width1 - 1) parameter_map['LookupTableFastStaticW'] = lookup_table_words.W filter_shape = (nkernels1, num_input_channels, filter_width1, ndim) parameter_map['FilterShape' + str(filter_width1)] = filter_shape conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) parameter_map['Conv2dLayerW' + str(filter_width1)] = conv.W non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) parameter_map['NonLinearityLayerB' + str(filter_width1)] = non_linearity.b pooling = nn_layers.KMaxPoolLayer(k_max=k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) flatten_layer = nn_layers.FlattenLayer() hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) parameter_map['LinearLayerW'] = hidden_layer.W parameter_map['LinearLayerB'] = hidden_layer.b classifier = nn_layers.Training(numpy_rng, W=None, shape=(102, nkernels1)) #classifier = nn_layers.LogisticRegression(n_in=n_in,n_out=n_outs) nnet_tweets = nn_layers.FeedForwardNet(layers=[ lookup_table_words, conv2dNonLinearMaxPool, flatten_layer, hidden_layer, classifier ]) nnet_tweets.set_input(tweets) print nnet_tweets ################ # TRAIN MODEL # ############### batch_tweets = T.imatrix('batch_x_q') batch_y = T.lvector('batch_y') params = nnet_tweets.params print params mrg_rng = MRG_RandomStreams() i = mrg_rng.uniform(size=(batch_size, vocab_emb_ht.shape[0]), low=0.0, high=1.0, dtype=theano.config.floatX).argsort(axis=1) cost = nnet_tweets.layers[-1].training_cost(y, i) predictions = nnet_tweets.layers[-1].y_pred predictions_prob = nnet_tweets.layers[-1].f #cost 
    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}

    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='None')

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_prob_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    # Function that re-zeros the last (dummy/padding) row of every trainable
    # embedding matrix
    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function(
        [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    epoch = 0
    n_epochs = 25
    early_stop = 3
    best_dev_acc = -numpy.inf
    no_best_dev_update = 0
    timer_train = time.time()
    done = False
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]

    while epoch < n_epochs and not done:
        max_chunks = numpy.inf
        curr_chunks = 0
        timer = time.time()
        fname_tweet = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.tweets.npy'),
            'rb')
        fname_sentiments = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.hashtags.npy'),
            'rb')

        while curr_chunks < max_chunks:
            # Stream the training data in chunks; the full set does not fit in
            # memory at once.
            train_set, y_train_set, chunks = get_next_chunk(fname_tweet,
                                                            fname_sentiments,
                                                            n_chunks=2)
            curr_chunks += chunks
            if train_set is None:
                break

            print "Length train_set:", len(train_set)
            print "Length dev_set:", len(dev_set)
            print "Length y_train_set:", len(y_train_set)
            print "Length y_dev_set:", len(y_dev_set)

            train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [train_set, y_train_set],
                batch_size=batch_size,
                randomize=True)
            dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [dev_set],
                batch_size=batch_size,
                randomize=False)

            # The loop index shadows the symbolic `i` above, which has already
            # been consumed when building the cost
            for i, (tweet, y_label) in enumerate(
                    tqdm(train_set_iterator, ascii=True), 1):
                train_fn(tweet, y_label)

            # Make sure the null word in the word embeddings always remains zero
            zerout_dummy_word()

            y_pred_dev = predict_prob_batch(dev_set_iterator)
            dev_acc = precision_at_k(y_dev_set, y_pred_dev, k=1) * 100
            #dev_acc = metrics.accuracy_score(y_dev_set, y_pred_dev)

            if dev_acc > best_dev_acc:
                print('epoch: {} chunk: {} dev_acc: {:.4f}; best_dev_acc: {:.4f}'
                      .format(epoch, curr_chunks, dev_acc, best_dev_acc))
                best_dev_acc = dev_acc
                no_best_dev_update = 0
            else:
                print('epoch: {} chunk: {} dev_acc: {:.4f}; best_dev_acc: {:.4f}'
                      .format(epoch, curr_chunks, dev_acc, best_dev_acc))

            # Checkpoint the parameter map after every chunk
            cPickle.dump(
                parameter_map,
                open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        # Checkpoint again at the end of the epoch
        cPickle.dump(
            parameter_map,
            open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))

        if no_best_dev_update >= early_stop:
            print "Quitting after", no_best_dev_update, "evaluations without improvement of the best dev score"
            break

        no_best_dev_update += 1
        epoch += 1
        fname_tweet.close()
        fname_sentiments.close()

    cPickle.dump(parameter_map,
                 open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
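
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original script): precision_at_k(), used for
# dev_acc above, is defined elsewhere in this project and is not shown here.
# The helper below only illustrates the metric as it is called in the training
# loop (k=1 over per-hashtag scores): the fraction of tweets whose gold hashtag
# id is among the k highest-scoring ids. The name _precision_at_k_sketch is
# illustrative and is not the project's actual implementation.
def _precision_at_k_sketch(y_true, y_prob, k=1):
    import numpy  # assumed already imported at module level; repeated to keep the sketch self-contained
    # indices of the k highest-scoring classes for every row
    topk = numpy.argsort(y_prob, axis=1)[:, -k:]
    hits = [label in row for label, row in zip(y_true, topk)]
    return numpy.mean(hits)
# With k=1 this reduces to top-1 accuracy, matching the usage
# precision_at_k(y_dev_set, y_pred_dev, k=1) * 100 above.
# ---------------------------------------------------------------------------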