def run(self, params, batcher):
    embed = {'train': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        sorted_data = sorted(zip(self.data[key]['X'], self.data[key]['y']),
                             key=lambda z: (len(z[0]), z[1]))
        self.data[key]['X'], self.data[key]['y'] = map(list, zip(*sorted_data))

        embed[key]['X'] = []
        for ii in range(0, len(self.data[key]['y']), bsize):
            batch = self.data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            embed[key]['X'].append(embeddings)
        embed[key]['X'] = np.vstack(embed[key]['X'])
        embed[key]['y'] = np.array(self.data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'kfold': params.kfold}
    clf = KFoldClassifier(embed['train'], embed['test'], config_classifier)
    devacc, testacc, _ = clf.run()
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'FormalityJa classification\n'.format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ntest': len(embed['test']['X'])}
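# ---------------------------------------------------------------------------
# Note: every run() method in this file treats `batcher` as a black box that
# maps a list of tokenized sentences to a 2-D numpy array with one row per
# sentence. The sketch below is a minimal, hypothetical batcher for
# illustration only -- a mean-of-word-vectors encoder. It assumes
# `params.word_vec` (a dict mapping word -> vector) and `params.wvec_dim`,
# neither of which is defined by the evaluation code in this file.
# ---------------------------------------------------------------------------
import numpy as np

def example_bow_batcher(params, batch):
    # Average the word vectors of each sentence (hypothetical attributes:
    # params.word_vec, params.wvec_dim)
    embeddings = []
    for sent in batch:
        vecs = [params.word_vec[w] for w in sent if w in params.word_vec]
        if vecs:
            embeddings.append(np.mean(vecs, axis=0))
        else:
            # No known words in this sentence: fall back to a zero vector
            embeddings.append(np.zeros(params.wvec_dim))
    # Shape (len(batch), wvec_dim), ready for the np.vstack calls in run()
    return np.vstack(embeddings)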
def run(self, params, batcher):
    train_embeddings, test_embeddings = [], []

    # Sort to reduce padding
    sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
                                 key=lambda z: (len(z[0]), z[1]))
    train_samples = [x for (x, y) in sorted_corpus_train]
    train_labels = [y for (x, y) in sorted_corpus_train]

    sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),
                                key=lambda z: (len(z[0]), z[1]))
    test_samples = [x for (x, y) in sorted_corpus_test]
    test_labels = [y for (x, y) in sorted_corpus_test]

    # Get train embeddings
    for ii in range(0, len(train_labels), params.batch_size):
        batch = train_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        train_embeddings.append(embeddings)
    train_embeddings = np.vstack(train_embeddings)
    logging.info('Computed train embeddings')

    # Get test embeddings
    for ii in range(0, len(test_labels), params.batch_size):
        batch = test_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        test_embeddings.append(embeddings)
    test_embeddings = np.vstack(test_embeddings)
    logging.info('Computed test embeddings')

    config_classifier = {'nclasses': 6, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'kfold': params.kfold}
    clf = KFoldClassifier({'X': train_embeddings,
                           'y': np.array(train_labels)},
                          {'X': test_embeddings,
                           'y': np.array(test_labels)},
                          config_classifier)
    classifier, devacc, testacc, _ = clf.run()
    logging.debug('\nDev acc : {0} Test acc : {1} '
                  'for TREC\n'.format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.train['X']), 'ntest': len(self.test['X']),
            'classifier': classifier,
            'X_train': train_embeddings,
            # 'Y': np.array(train_labels + test_labels)
            'X_test': test_embeddings,
            'text': train_samples}
def single_run(self, params, batcher, train_X, train_y, test_X, test_y,
               field):
    # batcher is the algorithm
    train_embeddings, test_embeddings = [], []

    # Sort to reduce padding
    sorted_corpus_train = sorted(zip(train_X, train_y),
                                 key=lambda z: (len(z[0]), z[1]))
    train_samples = [x for (x, y) in sorted_corpus_train]
    train_labels = [y for (x, y) in sorted_corpus_train]

    sorted_corpus_test = sorted(zip(test_X, test_y),
                                key=lambda z: (len(z[0]), z[1]))
    test_samples = [x for (x, y) in sorted_corpus_test]
    test_labels = [y for (x, y) in sorted_corpus_test]

    # Get train embeddings
    for ii in range(0, len(train_labels), params.batch_size):
        batch = train_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        train_embeddings.append(embeddings)
    train_embeddings = np.vstack(train_embeddings)
    logging.info('Computed train embeddings')

    # Get test embeddings
    for ii in range(0, len(test_labels), params.batch_size):
        batch = test_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        test_embeddings.append(embeddings)
    test_embeddings = np.vstack(test_embeddings)
    logging.info('Computed test embeddings')

    config_classifier = {'nclasses': 3, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'nhid': params.nhid,
                         'kfold': params.kfold}
    clf = KFoldClassifier({'X': train_embeddings,
                           'y': np.array(train_labels)},
                          {'X': test_embeddings,
                           'y': np.array(test_labels)},
                          config_classifier)
    devacc, testacc, _ = clf.run()
    logging.debug('\n' + field + ' Dev acc : {0} Test acc : {1} '
                  'for ABSA_SP\n'.format(devacc, testacc))
    return {'{} devacc'.format(field): devacc,
            '{} acc'.format(field): testacc,
            '{} ndev'.format(field): len(train_X),
            '{} ntest'.format(field): len(test_X)}
def run(self, params, batcher):
    train_embeddings, test_embeddings = [], []

    train_samples = self.train['X']
    train_labels = self.train['y']
    test_samples = self.test['X']
    test_labels = self.test['y']

    # Get train embeddings
    for ii in range(0, len(train_labels), params.batch_size):
        batch = train_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        train_embeddings.append(embeddings)
    train_embeddings = np.vstack(train_embeddings)
    logging.info('Computed train embeddings')

    # Get test embeddings
    for ii in range(0, len(test_labels), params.batch_size):
        batch = test_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        test_embeddings.append(embeddings)
        # for e in embeddings:
        #     test_embeddings.append(e)
    test_embeddings = np.vstack(test_embeddings)
    logging.info('Computed test embeddings')

    config_classifier = {'nclasses': 27, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'kfold': params.kfold}
    clf = KFoldClassifier({'X': train_embeddings,
                           'y': np.array(train_labels)},
                          {'X': test_embeddings,
                           'y': np.array(test_labels)},
                          config_classifier)
    devacc, testacc, _ = clf.run()
    # Fixed: .format() previously bound to the trailing " " literal, so the
    # placeholders were never substituted
    logging.debug('\nDev acc : {0} Test acc : {1} for {2}\n'.format(
        devacc, testacc, self.evalType))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.train['X']), 'ntest': len(self.test['X'])}
def run(self, params, batcher):
    mrpc_embed = {'train': {}, 'test': {}}

    for key in self.mrpc_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        text_data = {}
        sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
                                   self.mrpc_data[key]['X_B'],
                                   self.mrpc_data[key]['y']),
                               key=lambda z: (len(z[0]), len(z[1]), z[2]))
        text_data['A'] = [x for (x, y, z) in sorted_corpus]
        text_data['B'] = [y for (x, y, z) in sorted_corpus]
        text_data['y'] = [z for (x, y, z) in sorted_corpus]

        for txt_type in ['A', 'B']:
            mrpc_embed[key][txt_type] = batcher(params, text_data[txt_type],
                                                key)
            # mrpc_embed[key][txt_type] = []
            # for ii in range(0, len(text_data['y']), params.batch_size):
            #     batch = text_data[txt_type][ii:ii + params.batch_size]
            #     embeddings = batcher(params, batch)
            #     mrpc_embed[key][txt_type].append(embeddings)
            # mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
        mrpc_embed[key]['y'] = np.array(text_data['y'])
        logging.info('Computed {0} embeddings'.format(key))

    # Train
    trainA = mrpc_embed['train']['A']
    trainB = mrpc_embed['train']['B']
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    trainY = mrpc_embed['train']['y']

    # Test
    testA = mrpc_embed['test']['A']
    testB = mrpc_embed['test']['B']
    testF = np.c_[np.abs(testA - testB), testA * testB]
    testY = mrpc_embed['test']['y']

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid, 'kfold': params.kfold}
    clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                          test={'X': testF, 'y': testY},
                          config=config)
    devacc, testacc, yhat = clf.run()
    testf1 = round(100 * f1_score(testY, yhat), 2)
    logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
                  .format(devacc, testacc, testf1))
    return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
            'ndev': len(trainA), 'ntest': len(testA)}
def run(self, params, batcher):
    if params.train is not None and not params.train:
        mrpc_embed = {'train': {}, 'test': {}}
    else:
        mrpc_embed = {'train': {}, 'dev': {}, 'test': {}}

    test_file_x_a = 'embeddings/testx_a_' + params.model_name + "_mrpc.csv"
    test_file_x_b = 'embeddings/testx_b_' + params.model_name + "_mrpc.csv"
    test_file_y = 'embeddings/testy_' + params.model_name + "_mrpc.csv"
    train_file_x_a = 'embeddings/trainx_a' + params.model_name + "_mrpc.csv"
    train_file_x_b = 'embeddings/trainx_b' + params.model_name + "_mrpc.csv"
    train_file_y = 'embeddings/trainy_' + params.model_name + "_mrpc.csv"

    self.params = params
    self.adversarialFunc = params.adversarialFunc

    # Embedding computation is disabled; the loop below was used once to
    # compute and pickle the embeddings that are now loaded from disk.
    # for key in self.mrpc_data:
    #     logging.info('Computing embedding for {0}'.format(key))
    #     # Sort to reduce padding
    #     text_data = {}
    #     sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
    #                                self.mrpc_data[key]['X_B'],
    #                                self.mrpc_data[key]['y']),
    #                            key=lambda z: (len(z[0]), len(z[1]), z[2]))
    #     text_data['A'] = [x for (x, y, z) in sorted_corpus]
    #     text_data['B'] = [y for (x, y, z) in sorted_corpus]
    #     text_data['y'] = [z for (x, y, z) in sorted_corpus]
    #     for txt_type in ['A', 'B']:
    #         mrpc_embed[key][txt_type] = []
    #         for ii in range(0, len(text_data['y']), params.batch_size):
    #             n = len(text_data['y']) / params.batch_size
    #             if ((ii / params.batch_size) * 100 / n) % 10 == 0:
    #                 print("%d percent done out of %d" % (
    #                     (ii / params.batch_size) * 100 / n,
    #                     len(text_data['y'])))
    #             batch = text_data[txt_type][ii:ii + params.batch_size]
    #             embeddings = batcher(params, batch)
    #             mrpc_embed[key][txt_type].append(embeddings)
    #         mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
    #     mrpc_embed[key]['y'] = np.array(text_data['y'])
    #     logging.info('Computed {0} embeddings'.format(key))
    #
    # pickle.dump(mrpc_embed['test']['A'], open(test_file_x_a, 'wb'))
    # pickle.dump(mrpc_embed['test']['B'], open(test_file_x_b, 'wb'))
    # pickle.dump(mrpc_embed['test']['y'], open(test_file_y, 'wb'))
    # pickle.dump(mrpc_embed['train']['A'], open(train_file_x_a, 'wb'))
    # pickle.dump(mrpc_embed['train']['B'], open(train_file_x_b, 'wb'))
    # pickle.dump(mrpc_embed['train']['y'], open(train_file_y, 'wb'))
    # print("dumped embedding files")

    logging.info("reading files")
    mrpc_embed['test']['A'] = pickle.load(open(test_file_x_a, 'rb'))
    mrpc_embed['test']['B'] = pickle.load(open(test_file_x_b, 'rb'))
    mrpc_embed['test']['y'] = pickle.load(open(test_file_y, 'rb'))
    mrpc_embed['train']['A'] = pickle.load(open(train_file_x_a, 'rb'))
    mrpc_embed['train']['B'] = pickle.load(open(train_file_x_b, 'rb'))
    mrpc_embed['train']['y'] = pickle.load(open(train_file_y, 'rb'))

    # Train
    trainA = mrpc_embed['train']['A']
    trainB = mrpc_embed['train']['B']
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    trainY = mrpc_embed['train']['y']

    # Test
    testA = mrpc_embed['test']['A']
    testB = mrpc_embed['test']['B']
    testF = np.c_[np.abs(testA - testB), testA * testB]
    testY = mrpc_embed['test']['y']

    logging.debug('trainF vector shape: {0}'.format(trainF.shape))

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid, 'kfold': params.kfold,
              'adversarial_sample_generator': (
                  self.generate_adv_samples
                  if self.adversarialFunc is not None else None),
              'batcher': batcher}

    # X = {'train': {}, 'valid': {}, 'test': {}}
    # y = {'train': {}, 'valid': {}, 'test': {}}
    # for key in mrpc_embed.keys():
    #     X[key] = mrpc_embed.get(key)['X']
    #     y[key] = mrpc_embed.get(key)['y']

    params.task_name = "mrpc"
    clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                          test={'X': testF, 'y': testY},
                          config=config)
    devacc, testacc, yhat = clf.run(params)
    testf1 = round(100 * f1_score(testY, yhat), 2)
    logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'.format(
        devacc, testacc, testf1))
    return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
            'ndev': len(trainA), 'ntest': len(testA)}
def run(self, params, batcher):
    qa_embed = {'train': {}, 'test': {}}

    for key in self.qa_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        text_data = {}
        sorted_corpus = sorted(zip(self.qa_data[key]['question'],
                                   self.qa_data[key]['snippet'],
                                   self.qa_data[key]['label']),
                               key=lambda z: (len(z[0]), len(z[1]), z[2]))
        text_data['question'] = [x for (x, y, z) in sorted_corpus]
        text_data['snippet'] = [y for (x, y, z) in sorted_corpus]
        text_data['label'] = [z for (x, y, z) in sorted_corpus]

        for txt_type in ['question', 'snippet']:
            qa_embed[key][txt_type] = []
            for ii in range(0, len(text_data['label']), params.batch_size):
                batch = text_data[txt_type][ii:ii + params.batch_size]
                embeddings = batcher(params, batch)
                qa_embed[key][txt_type].append(embeddings)
            qa_embed[key][txt_type] = np.vstack(qa_embed[key][txt_type])
        qa_embed[key]['label'] = np.array(text_data['label'])
        logging.info('Computed {0} embeddings'.format(key))

    # Train: concatenate both embeddings with their elementwise product
    # and absolute difference
    trainQ = qa_embed['train']['question']
    trainS = qa_embed['train']['snippet']
    trainQS = np.hstack((trainQ, trainS, trainQ * trainS,
                         np.abs(trainQ - trainS)))
    trainY = qa_embed['train']['label']

    # Test
    testQ = qa_embed['test']['question']
    testS = qa_embed['test']['snippet']
    testQS = np.hstack((testQ, testS, testQ * testS, np.abs(testQ - testS)))
    testY = qa_embed['test']['label']

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid, 'kfold': params.kfold}
    # Override the classifier settings for this task
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 1
    config_classifier['epoch_size'] = 64
    config_classifier['batch_size'] = 64
    config['classifier'] = config_classifier
    logging.debug(config_classifier)

    clf = KFoldClassifier(train={'X': trainQS, 'y': trainY},
                          test={'X': testQS, 'y': testY},
                          config=config)
    devacc, testacc, yhat = clf.run()
    testf1 = round(100 * f1_score(testY, yhat), 2)
    logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for BioASQ 5b '
                  'task (yes/no questions).\n'.format(devacc, testacc, testf1))
    return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
            'ndev': len(trainQS), 'ntest': len(testQS)}
def run(self, params, batcher):
    rqe_embed = {'train': {}, 'test': {}}

    for key in self.rqe_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        text_data = {}
        sorted_corpus = sorted(zip(self.rqe_data[key]['chq'],
                                   self.rqe_data[key]['faq'],
                                   self.rqe_data[key]['label'],
                                   self.rqe_data[key]['pid']),
                               key=lambda z: (len(z[0]), len(z[1]), z[2]))
        text_data['chq'] = [x for (x, y, z, w) in sorted_corpus]
        text_data['faq'] = [y for (x, y, z, w) in sorted_corpus]
        text_data['label'] = [z for (x, y, z, w) in sorted_corpus]
        text_data['pid'] = [w for (x, y, z, w) in sorted_corpus]

        for txt_type in ['chq', 'faq']:
            rqe_embed[key][txt_type] = []
            for ii in range(0, len(text_data['label']), params.batch_size):
                batch = text_data[txt_type][ii:ii + params.batch_size]
                embeddings = batcher(params, batch)
                rqe_embed[key][txt_type].append(embeddings)
            rqe_embed[key][txt_type] = np.vstack(rqe_embed[key][txt_type])
        rqe_embed[key]['label'] = np.array(text_data['label'])
        logging.info('Computed {0} embeddings'.format(key))

    # Train: concatenate both embeddings with their elementwise product
    # and absolute difference
    trainC = rqe_embed['train']['chq']
    trainF = rqe_embed['train']['faq']
    trainCF = np.hstack((trainC, trainF, trainC * trainF,
                         np.abs(trainC - trainF)))
    trainY = rqe_embed['train']['label']

    # Test
    testC = rqe_embed['test']['chq']
    testF = rqe_embed['test']['faq']
    testCF = np.hstack((testC, testF, testC * testF, np.abs(testC - testF)))
    testY = rqe_embed['test']['label']

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid, 'kfold': params.kfold}
    clf = KFoldClassifier(train={'X': trainCF, 'y': trainY},
                          test={'X': testCF, 'y': testY},
                          config=config)
    devacc, testacc, yhat = clf.run()

    # Log predictions alongside their pair ids for inspection
    pred = list(yhat)
    logging.debug(text_data['pid'])
    logging.debug(pred)

    testf1 = round(100 * f1_score(testY, yhat), 2)
    logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for RQE.\n'.format(
        devacc, testacc, testf1))
    return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
            'ndev': len(trainCF), 'ntest': len(testCF)}
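# ---------------------------------------------------------------------------
# The pair tasks above combine two sentence embeddings into a single feature
# vector before classification: MRPC uses np.c_[|A - B|, A * B], while the
# BioASQ and RQE tasks use the fuller np.hstack((A, B, A * B, |A - B|)).
# A small illustrative sketch of both compositions on toy arrays (random
# data, not real embeddings):
# ---------------------------------------------------------------------------
import numpy as np

def example_pair_features():
    # Toy "sentence embeddings": 3 pairs of dimension 4
    A = np.random.randn(3, 4)
    B = np.random.randn(3, 4)

    # MRPC-style features: elementwise |difference| and product -> (3, 8)
    feats_mrpc = np.c_[np.abs(A - B), A * B]

    # BioASQ/RQE-style features: both embeddings plus interactions -> (3, 16)
    feats_full = np.hstack((A, B, A * B, np.abs(A - B)))

    print(feats_mrpc.shape, feats_full.shape)  # (3, 8) (3, 16)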
def run(self, params, batcher):
    train_embeddings, test_embeddings = [], []

    # Sort to reduce padding
    sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
                                 key=lambda z: (len(z[0]), z[1]))
    train_samples = [x for (x, y) in sorted_corpus_train]
    train_labels = [y for (x, y) in sorted_corpus_train]

    # Keep the original test indices so predictions can be unsorted later
    zipped_corpus_test = sorted(enumerate(zip(self.test['X'],
                                              self.test['y'])),
                                key=lambda z: (len(z[1][0]), z[1][1]))
    sorted_test_indices = [i for (i, z) in zipped_corpus_test]
    test_samples = [x for (i, (x, y)) in zipped_corpus_test]
    test_labels = [y for (i, (x, y)) in zipped_corpus_test]

    # Get train embeddings
    for ii in range(0, len(train_labels), params.batch_size):
        batch = train_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        train_embeddings.append(embeddings)
    train_embeddings = np.vstack(train_embeddings)
    logging.info('Computed train embeddings')

    # Get test embeddings
    for ii in range(0, len(test_labels), params.batch_size):
        batch = test_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        test_embeddings.append(embeddings)
    test_embeddings = np.vstack(test_embeddings)
    logging.info('Computed test embeddings')

    config_classifier = {'nclasses': 6, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'kfold': params.kfold}
    clf = KFoldClassifier({'X': train_embeddings,
                           'y': np.array(train_labels)},
                          {'X': test_embeddings,
                           'y': np.array(test_labels)},
                          config_classifier)
    devacc, testacc, yhat_sorted = clf.run()

    # Restore predictions to the original corpus order
    yhat = [None] * len(yhat_sorted)
    for (i, y) in enumerate(yhat_sorted):
        yhat[sorted_test_indices[i]] = y

    logging.debug('\nDev acc : {0} Test acc : {1} '
                  'for TREC\n'.format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.train['X']), 'ntest': len(self.test['X']),
            'yhat': yhat, 'metadata': self.metadata}
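# ---------------------------------------------------------------------------
# The run() above sorts the test set by length to reduce padding, then
# restores predictions to corpus order via the remembered indices. A
# standalone sketch of that sort/unsort pattern on toy data (illustrative
# names only):
# ---------------------------------------------------------------------------
def example_sort_unsort():
    items = ['bb', 'a', 'dddd', 'ccc']
    # Sort by length while remembering each item's original position
    order = sorted(range(len(items)), key=lambda i: len(items[i]))
    sorted_items = [items[i] for i in order]

    # Pretend the model predicts the length of each (sorted) item
    yhat_sorted = [len(s) for s in sorted_items]

    # Scatter predictions back to the original order
    yhat = [None] * len(items)
    for pos, i in enumerate(order):
        yhat[i] = yhat_sorted[pos]

    print(yhat)  # [2, 1, 4, 3] -- aligned with `items`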