def errorAnalysisRundown():
    """Run both error-analysis reports over the saved test predictions.

    Loads the label structure from ``../exp/label_struct_bi`` and passes the
    prediction file, the gold file, the raw test text and the first label
    vector (the coarse labels, ``clbl``) to ``findErrors`` and then to
    ``findCorrects``.
    """
    # Only the first vector is consumed; the second is unpacked so the call
    # still requires a two-element result.
    coarse_labels, _fine_labels = process_qc.label_structure(
        '../exp/label_struct_bi')

    predictions_path = '../exp/predictions'
    gold_path = '../exp/goldrs'
    text_path = '../data/QC/Chinese_qc/finaltest'

    findErrors(predictions_path, gold_path, text_path, coarse_labels)
    findCorrects(predictions_path, gold_path, text_path, coarse_labels)
def lbl2index():
    """Build label -> position lookup tables from the label structure file.

    :return: a pair ``(c2idx, f2idx)``. Each dict maps every entry of the
             corresponding vector returned by
             ``label_structure('./label_structure_new')`` to its index in
             that vector (for a repeated entry the last index wins).
    """
    first_vec, second_vec = label_structure('./label_structure_new')
    first_lookup = {label: pos for pos, label in enumerate(first_vec)}
    second_lookup = {label: pos for pos, label in enumerate(second_vec)}
    return first_lookup, second_lookup
def lbl2index():
    """Invert the two label vectors into label -> index dictionaries.

    :return: ``(c2idx, f2idx)``: one dict per vector returned by
             ``label_structure("./label_structure_new")``, mapping each label
             to its position in that vector (later duplicates overwrite
             earlier ones).
    """
    label_vectors = label_structure("./label_structure_new")
    c_labels, f_labels = label_vectors

    def invert(labels):
        # position lookup: labels[k] -> k
        return dict((label, k) for k, label in enumerate(labels))

    return invert(c_labels), invert(f_labels)
def outputErrorInstances():
    """Dump misclassified sentences together with their per-feature-map trigrams.

    Four line-aligned files below ``_pathBase_`` are read: predicted label
    indices, gold label indices, the reconstructed sentences and, for every
    sentence, one integer word position per feature map.  For each sentence
    whose predicted index differs from the gold index, two lines are written
    to ``exp/error_max_pool_info``:

    1. predicted label, gold label and the sentence, TAB separated;
    2. space separated ``<feature map index>:<w0>/<w1>/<w2>`` items, where
       ``w0..w2`` are the three words starting at that feature map's position.

    The number of misclassified sentences is printed.
    """
    pred_path = _pathBase_ + 'exp/bestjointcnnfrs'
    gold_path = _pathBase_ + 'exp/goldfrs'
    sent_path = _pathBase_ + 'exp/reconstructed_sentences'
    pool_path = _pathBase_ + 'exp/feature_map_max'
    out_path = _pathBase_ + 'exp/error_max_pool_info'

    with open(pred_path, 'r') as handle:
        predicted = [int(row) for row in handle]
    with open(gold_path, 'r') as handle:
        gold = [int(row) for row in handle]
    with open(sent_path, 'r') as handle:
        sentences = [row.rstrip() for row in handle]
    with open(pool_path, 'r') as handle:
        pool_positions = [[int(tok) for tok in row.split()] for row in handle]
    assert len(gold) == len(predicted) == len(sentences) == len(pool_positions)

    c_vec, f_vec = label_structure('label_structure_new')

    error_rows = []
    trigram_rows = []
    for guess, truth, sentence, positions in zip(predicted, gold, sentences,
                                                 pool_positions):
        if truth == guess:
            continue
        tokens = sentence.split()
        error_rows.append('\t'.join([f_vec[guess], f_vec[truth], sentence]))
        # Explicit indexing (not slicing) so a position too close to the end
        # of the sentence still raises IndexError.
        trigrams = [
            str(fmap) + ':' + '/'.join(
                (tokens[start], tokens[start + 1], tokens[start + 2]))
            for fmap, start in enumerate(positions)
        ]
        trigram_rows.append(' '.join(trigrams))
    assert len(error_rows) == len(trigram_rows)

    print(len(error_rows))
    with open(out_path, 'w') as writer:
        for error_row, trigram_row in zip(error_rows, trigram_rows):
            writer.write(error_row + '\n' + trigram_row + '\n')
def outputErrorInstances():
    """Write every wrongly predicted sentence and its trigram summary to disk.

    Inputs (all under ``_pathBase_``, one record per line, line-aligned):
    predicted label indices, gold label indices, reconstructed sentences and
    a list of integer word positions (one per feature map) for each sentence.

    For each sentence where prediction and gold index differ, the output file
    ``exp/error_max_pool_info`` receives a TAB separated line
    ``predicted label / gold label / sentence`` followed by a line of space
    separated ``<feature map index>:<w0>/<w1>/<w2>`` items built from the three
    words starting at each stored position.  The error count is printed.
    """
    pred_path = _pathBase_ + "exp/bestjointcnnfrs"
    gold_path = _pathBase_ + "exp/goldfrs"
    sent_path = _pathBase_ + "exp/reconstructed_sentences"
    pool_path = _pathBase_ + "exp/feature_map_max"
    out_path = _pathBase_ + "exp/error_max_pool_info"

    with open(pred_path, "r") as handle:
        predicted = [int(row) for row in handle]
    with open(gold_path, "r") as handle:
        gold = [int(row) for row in handle]
    with open(sent_path, "r") as handle:
        sentences = [row.rstrip() for row in handle]
    with open(pool_path, "r") as handle:
        pool_positions = [[int(tok) for tok in row.split()] for row in handle]
    assert len(gold) == len(predicted) == len(sentences) == len(pool_positions)

    c_vec, f_vec = label_structure("label_structure_new")

    def describe(tokens, positions):
        # One "<index>:<w0>/<w1>/<w2>" item per feature map; indexing is kept
        # explicit so an out-of-range position raises IndexError.
        items = []
        for fmap, start in enumerate(positions):
            window = (tokens[start], tokens[start + 1], tokens[start + 2])
            items.append(str(fmap) + ":" + "/".join(window))
        return " ".join(items)

    error_rows = []
    trigram_rows = []
    records = zip(predicted, gold, sentences, pool_positions)
    for guess, truth, sentence, positions in records:
        if truth != guess:
            header = f_vec[guess] + "\t" + f_vec[truth] + "\t" + sentence
            error_rows.append(header)
            trigram_rows.append(describe(sentence.split(), positions))
    assert len(error_rows) == len(trigram_rows)

    print(len(error_rows))
    with open(out_path, "w") as writer:
        for header, detail in zip(error_rows, trigram_rows):
            writer.write(header + "\n")
            writer.write(detail + "\n")
def train_joint_conv_net(w2vFile,
                         dataFile,
                         labelStructureFile,
                         cfswitch,
                         filter_hs,
                         n_epochs=1000,
                         batch_size=50,
                         feature_maps=100,
                         hasmlphidden=False,
                         usefscore=False):
    """
    function: train and evaluate a sentence level Question Classification CNN
    on ONE label granularity (coarse or fine, selected by ``cfswitch``).
    Despite the function name, a single negative log-likelihood over the
    selected labels is optimised.

    Side effects: writes the gold test labels to ``../exp/goldrs`` and, each
    time the returned score improves, the test predictions to
    ``../exp/predictions``.  Progress is printed every epoch.

    :param w2vFile: the path of the word embedding file (pickle file with
        numpy array value, produced by word2vec.py module)
    :param dataFile: the dataset file produced by process_data.py module.
        It is indexed as ``datasets[split][field]`` with split 0/1/2 =
        train/test/valid and field 0/1/2 = sentences/coarse labels/fine labels.
    :param labelStructureFile: a file that describes label structure of coarse
        and fine grains. It is produced in produce_data.py in
        outputlabelstructure()
    :param cfswitch: 'c' to train on the coarse labels, 'f' for the fine
        labels. Any other value prints a message and exits with status 1.
    :param filter_hs: sliding window sizes, one convolution layer per entry.
        *** warning *** you cannot just change window size here, if you want
        to use a different window for the experiment. YOU NEED TO RE-PRODUCE A
        NEW DATASET IN process_data.py WITH THE CORRESPONDING WINDOW SIZE.
    :param n_epochs: the number of epochs the training needs to run
    :param batch_size: the size of the mini-batch; training examples that do
        not fill a whole batch are not visited in an epoch
    :param feature_maps: how many feature maps each convolution layer has
    :param hasmlphidden: if True the classifier gets one hidden layer of 100
        units between the convolution features and the output layer
    :param usefscore: score with ``eval.fscore`` instead of ``eval.accuracy``
    :return: the best test score among the epochs that also set a new best
        validation score (0.0 if there was none)
    """
    # ---- Loading and preparing data ----
    datasets = load(dataFile)
    clbl_vec, flbl_vec = process_qc.label_structure(labelStructureFile)
    trainDataSetIndex = 0
    testDataSetIndex = 1
    validDataSetIndex = 2
    sentenceIndex = 0
    clblIndex = 1  # coarse label(clbl) index in the dataset structure
    flblIndex = 2  # fine label(flbl) index
    if cfswitch == 'c':
        lblIndex = clblIndex
        label_vec = clbl_vec
    elif cfswitch == 'f':
        lblIndex = flblIndex
        label_vec = flbl_vec
    else:
        print('wrong arg value in: cfswtich!')
        # non-zero status: this is an error exit, not a normal termination
        sys.exit(1)

    label_size = len(label_vec)
    conv_feature_size = feature_maps * len(filter_hs)
    if hasmlphidden:
        layer_size = [conv_feature_size, 100, label_size]
    else:
        layer_size = [conv_feature_size, label_size]

    # train part
    train_y = shared_store(datasets[trainDataSetIndex][lblIndex])
    train_x = shared_store(datasets[trainDataSetIndex][sentenceIndex])
    # test part
    gold_test_y = datasets[testDataSetIndex][lblIndex]
    test_x = shared_store(datasets[testDataSetIndex][sentenceIndex])
    # valid part
    gold_valid_y = datasets[validDataSetIndex][lblIndex]
    valid_x = shared_store(datasets[validDataSetIndex][sentenceIndex])

    w2v = load(w2vFile)
    img_w = w2v.shape[1]  # the dimension of the word embedding
    img_h = len(datasets[trainDataSetIndex][sentenceIndex][0])  # length of each sentence
    filter_w = img_w  # word embedding dimension
    image_shapes = [(batch_size, 1, img_h, img_w * filter_h)
                    for filter_h in filter_hs]
    filter_shapes = [(feature_maps, 1, 1, filter_w * filter_h)
                     for filter_h in filter_hs]
    pool_size = (img_h, 1)
    train_size = len(datasets[trainDataSetIndex][sentenceIndex])

    print('number of sentences in training set: ' + str(train_size))
    print('max sentence length: ' + str(img_h))
    print('train data shape: ' +
          str(datasets[trainDataSetIndex][sentenceIndex].shape))
    print('word embedding dim: ' + str(img_w))

    # ---- Building model in theano language, less comments here.
    # ---- You can refer to Theano web site for more details
    batch_index = T.lvector('hello_batch_index')
    x = T.itensor3('hello_x')
    y = T.ivector('hello_y')
    w2v_shared = theano.shared(value=w2v, name='w2v', borrow=True)
    rng = np.random.RandomState(3435)

    def embedded_input(filter_h):
        # Replace every index in x by its embedding row, lay the embeddings of
        # the last axis side by side, and keep the first filter_h of them.
        return w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1],
             x.shape[2] * img_w))[:, :, :, 0:filter_h * img_w]

    conv_layers = []
    conv_layer_outputs = []
    for i, filter_h in enumerate(filter_hs):
        conv_layer = LeNetConvPoolLayer(rng,
                                        input=embedded_input(filter_h),
                                        filter_shape=filter_shapes[i],
                                        poolsize=pool_size,
                                        image_shape=image_shapes[i],
                                        non_linear="relu")
        conv_layers.append(conv_layer)
        conv_layer_outputs.append(conv_layer.output.flatten(2))
    mlp_input = T.concatenate(conv_layer_outputs, 1)
    classifier = MLPDropout(rng=rng,
                            input=mlp_input,
                            layer_sizes=layer_size,
                            dropout_rate=0.5,
                            activation=Iden)

    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params
    params += classifier.params
    cost = classifier.negative_log_likelihood(y)
    updates = sgd_updates_adadelta(params, cost)

    # explicit floor division: the count must stay an integer
    n_batches = int(train_x.shape.eval()[0]) // batch_size
    train_model = theano.function(
        inputs=[batch_index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[batch_index],
            y: train_y[batch_index],
        },
    )

    # ---- Building evaluation models ----
    def build_predictor(data_x):
        # Each evaluation set gets its own graph so the shape passed to
        # conv_layer_output carries that set's row count. Previously one graph
        # built with the test set's row count was reused for the validation
        # set, so its shape did not describe the validation data.
        n_rows = data_x.shape.eval()[0]
        outputs = [
            conv_layer.conv_layer_output(
                embedded_input(filter_h),
                (n_rows, 1, img_h, img_w * filter_h)).flatten(2)
            for conv_layer, filter_h in zip(conv_layers, filter_hs)
        ]
        prediction = classifier.predict(T.concatenate(outputs, 1))
        return theano.function(inputs=[],
                               outputs=prediction,
                               givens={
                                   x: data_x,
                               })

    test_model = build_predictor(test_x)  # test on test set
    valid_model = build_predictor(valid_x)  # test on valid set

    # ---- Training part ----
    print('training....')
    best_valid_ep = 0
    best_valid_acc = 0.
    best_test_ep = 0
    best_test_acc = 0.
    final_ep = 0
    final_acc = 0.
    score = eval.fscore if usefscore else eval.accuracy

    # create gold value sequences, required by the eval.py
    with open('../exp/goldrs', 'w') as writer:
        for lbl in gold_test_y:
            writer.write(str(lbl) + '\n')

    # training loop
    for epoch in range(1, n_epochs + 1):
        print('************* epoch ' + str(epoch))
        batch_indexes = list(range(train_size))
        rng.shuffle(batch_indexes)
        for bchidx in range(n_batches):
            random_indexes = batch_indexes[bchidx * batch_size:
                                           (bchidx + 1) * batch_size]
            train_model(random_indexes)

        test_y_preds = test_model()
        valid_y_preds = valid_model()
        test_acc = score(gold_test_y, test_y_preds)
        valid_acc = score(gold_valid_y, valid_y_preds)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_ep = epoch
            # final_acc only moves on an epoch that improved validation AND
            # beats the test score recorded so far.
            if final_acc < test_acc:
                final_acc = test_acc
                final_ep = epoch
                with open('../exp/predictions', 'w') as writer:
                    for lblidx in test_y_preds:
                        writer.write(str(lblidx) + '\n')
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_test_ep = epoch

        # output predictions
        print('test accuracy is: ' + str(test_acc))
        print('valid accuracy is: ' + str(valid_acc))
        print('current best valid prediction accuracy is: ' +
              str(best_valid_acc) + ' at epoch ' + str(best_valid_ep))
        # report the epoch at which final_acc was actually recorded; it can be
        # earlier than best_valid_ep
        print('current best final prediction accuracy is: ' + str(final_acc) +
              ' at epoch ' + str(final_ep))
        print('current best test prediction accuracy is: ' +
              str(best_test_acc) + ' at epoch ' + str(best_test_ep))
    return final_acc
def confusionMatrix(switch='c'):
    """Print a confusion matrix and the misclassified test lines.

    Reads the predictions from ``exp/bestcnn<switch>rs``, the gold file
    ``exp/goldrs`` (used only for a length check) and the test file
    ``data/boschtest_new`` below ``_pathBase_``.  The gold index of a test
    line comes from its first whitespace separated token: the part before
    ':' is looked up in the coarse index, the whole token in the fine index.

    Printed, in order: the predicted and gold index of every test line, a
    legend ``<index>: <label>`` (five entries per line), the matrix with one
    row per predicted label and one column per gold label, and finally every
    misclassified test line prefixed with its predicted label.

    :param switch: 'c' evaluates the coarse labels, any other value the fine
        labels (pass 'f'); it also selects the prediction file.  Defaults to
        'c', the value that used to be hard-coded.
    """
    with open(_pathBase_ + 'exp/bestcnn' + switch + 'rs', 'r') as reader:
        rslines = reader.readlines()
    with open(_pathBase_ + 'exp/goldrs', 'r') as reader:
        truelines = reader.readlines()
    with open(_pathBase_ + 'data/boschtest_new', 'r') as reader:
        testlines = reader.readlines()
    assert len(truelines) == len(rslines) == len(testlines)

    c2idx, f2idx = lbl2index()
    predslines = [int(line) for line in rslines]
    c_vec, f_vec = label_structure('./label_structure_new')
    # Only the selected granularity is resolved, so a label missing from the
    # other index cannot abort the run.
    if switch == 'c':
        lbl_vec = c_vec
        goldlines = [c2idx[line.split()[0].split(':')[0]]
                     for line in testlines]
    else:
        lbl_vec = f_vec
        goldlines = [f2idx[line.split()[0]] for line in testlines]

    size = len(lbl_vec)
    cm = [[0] * size for _ in range(size)]
    error_instances = []
    for glbl, plbl, testline in zip(goldlines, predslines, testlines):
        print(plbl)
        print(glbl)
        cm[plbl][glbl] += 1  # row = predicted label, column = gold label
        if plbl != glbl:
            error_instances.append(lbl_vec[plbl] + '\t' + testline)

    # Legend, five entries per line. The old "i % 5 == 0" test flushed after
    # the very first entry, leaving it alone on its line.
    legend = [str(i) + ': ' + label + '\t' for i, label in enumerate(lbl_vec)]
    for start in range(0, len(legend), 5):
        print(''.join(legend[start:start + 5]))

    # Header row: indented by the width of a row label ("[ 0]" / "[10]") so
    # the column numbers sit above the cells.
    print('    [' + ', '.join('%2d' % i for i in range(size)) + ']')
    for i, row in enumerate(cm):
        cells = ', '.join('%2d' % num for num in row)
        print(('[%2d]' % i) + '[' + cells + ']')

    for line in error_instances:
        print(line.strip())
def confusionMatrix(switch="c"):
    """Print a confusion matrix and the misclassified test lines.

    Reads the predictions from ``exp/bestcnn<switch>rs``, the gold file
    ``exp/goldrs`` (used only for a length check) and the test file
    ``data/boschtest_new`` below ``_pathBase_``.  The gold index of a test
    line comes from its first whitespace separated token: the part before
    ':' is looked up in the coarse index, the whole token in the fine index.

    Printed, in order: the predicted and gold index of every test line, a
    legend ``<index>: <label>`` (five entries per line), the matrix with one
    row per predicted label and one column per gold label, and finally every
    misclassified test line prefixed with its predicted label.

    :param switch: "c" evaluates the coarse labels, any other value the fine
        labels (pass "f"); it also selects the prediction file.  Defaults to
        "c", the value that used to be hard-coded.
    """
    with open(_pathBase_ + "exp/bestcnn" + switch + "rs", "r") as reader:
        rslines = reader.readlines()
    with open(_pathBase_ + "exp/goldrs", "r") as reader:
        truelines = reader.readlines()
    with open(_pathBase_ + "data/boschtest_new", "r") as reader:
        testlines = reader.readlines()
    assert len(truelines) == len(rslines) == len(testlines)

    c2idx, f2idx = lbl2index()
    predslines = [int(line) for line in rslines]
    c_vec, f_vec = label_structure("./label_structure_new")
    # Only the selected granularity is resolved, so a label missing from the
    # other index cannot abort the run.
    if switch == "c":
        lbl_vec = c_vec
        goldlines = [c2idx[line.split()[0].split(":")[0]]
                     for line in testlines]
    else:
        lbl_vec = f_vec
        goldlines = [f2idx[line.split()[0]] for line in testlines]

    size = len(lbl_vec)
    cm = [[0] * size for _ in range(size)]
    error_instances = []
    for glbl, plbl, testline in zip(goldlines, predslines, testlines):
        print(plbl)
        print(glbl)
        cm[plbl][glbl] += 1  # row = predicted label, column = gold label
        if plbl != glbl:
            error_instances.append(lbl_vec[plbl] + "\t" + testline)

    # Legend, five entries per line. The old "i % 5 == 0" test flushed after
    # the very first entry, leaving it alone on its line.
    legend = [str(i) + ": " + label + "\t" for i, label in enumerate(lbl_vec)]
    for start in range(0, len(legend), 5):
        print("".join(legend[start:start + 5]))

    # Header row: indented by the width of a row label ("[ 0]" / "[10]") so
    # the column numbers sit above the cells.
    print("    [" + ", ".join("%2d" % i for i in range(size)) + "]")
    for i, row in enumerate(cm):
        cells = ", ".join("%2d" % num for num in row)
        print(("[%2d]" % i) + "[" + cells + "]")

    for line in error_instances:
        print(line.strip())
def train_joint_conv_net(
    w2vFile, dataFile, labelStructureFile, cfswitch, filter_hs,
    n_epochs=1000, batch_size=50, feature_maps=100,
    hasmlphidden=False, usefscore=False
):
    """
    function: train and evaluate a sentence level Question Classification CNN
    on ONE label granularity (coarse or fine, selected by ``cfswitch``).
    Despite the function name, a single negative log-likelihood over the
    selected labels is optimised.

    Side effects: writes the gold test labels to ``../exp/goldrs`` and, each
    time the returned score improves, the test predictions to
    ``../exp/predictions``.  Progress is printed every epoch.

    :param w2vFile: the path of the word embedding file (pickle file with
        numpy array value, produced by word2vec.py module)
    :param dataFile: the dataset file produced by process_data.py module.
        It is indexed as ``datasets[split][field]`` with split 0/1/2 =
        train/test/valid and field 0/1/2 = sentences/coarse labels/fine labels.
    :param labelStructureFile: a file that describes label structure of coarse
        and fine grains. It is produced in produce_data.py in
        outputlabelstructure()
    :param cfswitch: 'c' to train on the coarse labels, 'f' for the fine
        labels. Any other value prints a message and exits with status 1.
    :param filter_hs: sliding window sizes, one convolution layer per entry.
        *** warning *** you cannot just change window size here, if you want
        to use a different window for the experiment. YOU NEED TO RE-PRODUCE A
        NEW DATASET IN process_data.py WITH THE CORRESPONDING WINDOW SIZE.
    :param n_epochs: the number of epochs the training needs to run
    :param batch_size: the size of the mini-batch; training examples that do
        not fill a whole batch are not visited in an epoch
    :param feature_maps: how many feature maps each convolution layer has
    :param hasmlphidden: if True the classifier gets one hidden layer of 100
        units between the convolution features and the output layer
    :param usefscore: score with ``eval.fscore`` instead of ``eval.accuracy``
    :return: the best test score among the epochs that also set a new best
        validation score (0.0 if there was none)
    """
    # ---- Loading and preparing data ----
    datasets = load(dataFile)
    clbl_vec, flbl_vec = process_qc.label_structure(labelStructureFile)
    trainDataSetIndex = 0
    testDataSetIndex = 1
    validDataSetIndex = 2
    sentenceIndex = 0
    clblIndex = 1  # coarse label(clbl) index in the dataset structure
    flblIndex = 2  # fine label(flbl) index
    if cfswitch == 'c':
        lblIndex = clblIndex
        label_vec = clbl_vec
    elif cfswitch == 'f':
        lblIndex = flblIndex
        label_vec = flbl_vec
    else:
        print('wrong arg value in: cfswtich!')
        # non-zero status: this is an error exit, not a normal termination
        sys.exit(1)

    label_size = len(label_vec)
    conv_feature_size = feature_maps * len(filter_hs)
    if hasmlphidden:
        layer_size = [conv_feature_size, 100, label_size]
    else:
        layer_size = [conv_feature_size, label_size]

    # train part
    train_y = shared_store(datasets[trainDataSetIndex][lblIndex])
    train_x = shared_store(datasets[trainDataSetIndex][sentenceIndex])
    # test part
    gold_test_y = datasets[testDataSetIndex][lblIndex]
    test_x = shared_store(datasets[testDataSetIndex][sentenceIndex])
    # valid part
    gold_valid_y = datasets[validDataSetIndex][lblIndex]
    valid_x = shared_store(datasets[validDataSetIndex][sentenceIndex])

    w2v = load(w2vFile)
    img_w = w2v.shape[1]  # the dimension of the word embedding
    img_h = len(datasets[trainDataSetIndex][sentenceIndex][0])  # length of each sentence
    filter_w = img_w  # word embedding dimension
    image_shapes = [
        (batch_size, 1, img_h, img_w * filter_h) for filter_h in filter_hs
    ]
    filter_shapes = [
        (feature_maps, 1, 1, filter_w * filter_h) for filter_h in filter_hs
    ]
    pool_size = (img_h, 1)
    train_size = len(datasets[trainDataSetIndex][sentenceIndex])

    print('number of sentences in training set: ' + str(train_size))
    print('max sentence length: ' + str(img_h))
    print('train data shape: ' + str(datasets[trainDataSetIndex][sentenceIndex].shape))
    print('word embedding dim: ' + str(img_w))

    # ---- Building model in theano language, less comments here.
    # ---- You can refer to Theano web site for more details
    batch_index = T.lvector('hello_batch_index')
    x = T.itensor3('hello_x')
    y = T.ivector('hello_y')
    w2v_shared = theano.shared(value=w2v, name='w2v', borrow=True)
    rng = np.random.RandomState(3435)

    def embedded_input(filter_h):
        # Replace every index in x by its embedding row, lay the embeddings of
        # the last axis side by side, and keep the first filter_h of them.
        return w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1], x.shape[2] * img_w)
        )[:, :, :, 0:filter_h * img_w]

    conv_layers = []
    conv_layer_outputs = []
    for i, filter_h in enumerate(filter_hs):
        conv_layer = LeNetConvPoolLayer(
            rng,
            input=embedded_input(filter_h),
            filter_shape=filter_shapes[i],
            poolsize=pool_size,
            image_shape=image_shapes[i],
            non_linear="relu"
        )
        conv_layers.append(conv_layer)
        conv_layer_outputs.append(conv_layer.output.flatten(2))
    mlp_input = T.concatenate(conv_layer_outputs, 1)
    classifier = MLPDropout(
        rng=rng,
        input=mlp_input,
        layer_sizes=layer_size,
        dropout_rate=0.5,
        activation=Iden
    )

    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params
    params += classifier.params
    cost = classifier.negative_log_likelihood(y)
    updates = sgd_updates_adadelta(params, cost)

    # explicit floor division: the count must stay an integer
    n_batches = int(train_x.shape.eval()[0]) // batch_size
    train_model = theano.function(
        inputs=[batch_index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[batch_index],
            y: train_y[batch_index],
        },
    )

    # ---- Building evaluation models ----
    def build_predictor(data_x):
        # Each evaluation set gets its own graph so the shape passed to
        # conv_layer_output carries that set's row count. Previously one graph
        # built with the test set's row count was reused for the validation
        # set, so its shape did not describe the validation data.
        n_rows = data_x.shape.eval()[0]
        outputs = [
            conv_layer.conv_layer_output(
                embedded_input(filter_h),
                (n_rows, 1, img_h, img_w * filter_h)
            ).flatten(2)
            for conv_layer, filter_h in zip(conv_layers, filter_hs)
        ]
        prediction = classifier.predict(T.concatenate(outputs, 1))
        return theano.function(
            inputs=[],
            outputs=prediction,
            givens={
                x: data_x,
            }
        )

    test_model = build_predictor(test_x)  # test on test set
    valid_model = build_predictor(valid_x)  # test on valid set

    # ---- Training part ----
    print('training....')
    best_valid_ep = 0
    best_valid_acc = 0.
    best_test_ep = 0
    best_test_acc = 0.
    final_ep = 0
    final_acc = 0.
    score = eval.fscore if usefscore else eval.accuracy

    # create gold value sequences, required by the eval.py
    with open('../exp/goldrs', 'w') as writer:
        for lbl in gold_test_y:
            writer.write(str(lbl) + '\n')

    # training loop
    for epoch in range(1, n_epochs + 1):
        print('************* epoch ' + str(epoch))
        batch_indexes = list(range(train_size))
        rng.shuffle(batch_indexes)
        for bchidx in range(n_batches):
            random_indexes = batch_indexes[bchidx * batch_size:(bchidx + 1) * batch_size]
            train_model(random_indexes)

        test_y_preds = test_model()
        valid_y_preds = valid_model()
        test_acc = score(gold_test_y, test_y_preds)
        valid_acc = score(gold_valid_y, valid_y_preds)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_ep = epoch
            # final_acc only moves on an epoch that improved validation AND
            # beats the test score recorded so far.
            if final_acc < test_acc:
                final_acc = test_acc
                final_ep = epoch
                with open('../exp/predictions', 'w') as writer:
                    for lblidx in test_y_preds:
                        writer.write(str(lblidx) + '\n')
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_test_ep = epoch

        # output predictions
        print('test accuracy is: ' + str(test_acc))
        print('valid accuracy is: ' + str(valid_acc))
        print('current best valid prediction accuracy is: ' + str(best_valid_acc) + ' at epoch ' + str(best_valid_ep))
        # report the epoch at which final_acc was actually recorded; it can be
        # earlier than best_valid_ep
        print('current best final prediction accuracy is: ' + str(final_acc) + ' at epoch ' + str(final_ep))
        print('current best test prediction accuracy is: ' + str(best_test_acc) + ' at epoch ' + str(best_test_ep))
    return final_acc