def __init__(self, size):
    self.size = size
    base_dir = '.'
    glove_dict, glove_arr, glove_size = load_word_vectors(
        base_dir, 'glove.twitter.27B', size)
    self.glove_dict = glove_dict
    self.glove_arr = glove_arr
def __init__(self, name='', use_glove=False):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {LangDef.StartToken: "SOS", LangDef.EndToken: "EOS"}
    self.n_words = 2  # count SOS and EOS
    if use_glove:
        # Load a word-to-vector dictionary with 100-dimensional vectors.
        # This downloads roughly 800 MB if the vectors are not cached locally.
        self.__wv_dict, self.__wv_arr, self.__wv_size = load_word_vectors(
            '.', 'glove.6B', 100)
        print('Loaded', len(self.__wv_arr), 'words')
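# Editor's sketch, not part of the original source: a companion method for the
# __init__ above. If placed inside the same class (the name-mangled __wv_*
# attributes only resolve there), it packs the loaded GloVe vectors into one
# tensor in vocabulary-index order, e.g. for nn.Embedding.from_pretrained;
# words missing from GloVe keep a small uniform-random row.
import torch

def glove_matrix(self):
    weights = torch.empty(self.n_words, self.__wv_size).uniform_(-0.05, 0.05)
    for word, idx in self.word2index.items():
        if word in self.__wv_dict:
            weights[idx] = self.__wv_arr[self.__wv_dict[word]]
    return weights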
def main():
    # Config
    args = parseConfig()
    config = Config(args)
    print(config)
    logger = Logger()
    print("Logging destination: ", logger)

    # Load Embeddings
    vocab, embeddings, embedding_dim = load_word_vectors(
        '../data/glove', 'glove.6B', 100)

    # Model
    model = RNNHybrid_1(config, embeddings, vocab)

    # Weights Init
    model.apply(initialize_weights)
    if config.use_gpu:
        model = model.cuda()

    # Load Data
    train_dataset = HybridDataset(config, vocab)

    # Train-Val Split
    train_idx, val_idx = splitIndices(train_dataset, config, shuffle=True)
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              num_workers=3, sampler=train_sampler)
    val_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                            num_workers=1, sampler=val_sampler)
    config.train_loader = train_loader
    config.val_loader = val_loader

    # Print Distributions
    train_dataset.printDistributions(train_idx, msg="Training", logger=logger)
    train_dataset.printDistributions(val_idx, msg="Val", logger=logger)

    # Train
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    loss_fn = nn.CrossEntropyLoss().type(config.dtype)
    hybrid_train(model, loss_fn, optimizer, config.epochs, logger=logger)
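# Editor's sketch, not part of the original source: splitIndices is defined
# elsewhere in this repository. One plausible implementation (the
# config.val_fraction attribute is an assumption) shuffles the dataset indices
# and carves off a validation share, matching how the samplers above use it.
import random

def split_indices_sketch(dataset, config, shuffle=True):
    idx = list(range(len(dataset)))
    if shuffle:
        random.shuffle(idx)
    n_val = int(len(idx) * getattr(config, 'val_fraction', 0.1))
    return idx[n_val:], idx[:n_val]  # (train indices, val indices)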
def load_data():
    print("LOADING WORD2VEC MODEL...")
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        '/home/theapemachine/data/word2vec/text8-vector.bin', binary=True)
    print("LOADING SPACY MODEL...")
    spacy_model = spacy.load('en')
    print("LOADING GLOVE...")
    glove_dict, glove_arr, glove_size = load_word_vectors('.', 'glove.6B', 100)
    print("LOADING RESNIK...")
    resnik = nltk.corpus.wordnet_ic.ic('ic-bnc-resnik-add1.dat')
    print()
    print("DATA LOADED!")
    print()
    return w2v_model, spacy_model, glove_dict, glove_arr, glove_size, resnik
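# Editor's sketch, not part of the original source: a small helper showing how
# the (glove_dict, glove_arr) pair returned above is typically queried.
# glove_dict maps a word to its row index in the glove_arr tensor; words
# missing from the vocabulary yield None here.
import torch.nn.functional as F

def glove_similarity(glove_dict, glove_arr, w1, w2):
    if w1 not in glove_dict or w2 not in glove_dict:
        return None
    v1 = glove_arr[glove_dict[w1]].unsqueeze(0)
    v2 = glove_arr[glove_dict[w2]].unsqueeze(0)
    return F.cosine_similarity(v1, v2).item()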
basepath = './data'
embedding_path = '../data/glove'

train_pairs = readQuoradata(basepath + '/train/')
dev_pairs = readQuoradata(basepath + '/dev/')
test_pairs = readQuoradata(basepath + '/test/')
print('# of train pairs: %d' % len(train_pairs))
print('# of dev pairs: %d' % len(dev_pairs))
print('# of test pairs: %d' % len(test_pairs))

tokens, word2id = make_vocab(train_pairs, dev_pairs, test_pairs)
with open(os.path.join('./results', 'vocab.pkl'), 'wb') as f:
    pickle.dump((tokens, word2id), f, protocol=pickle.HIGHEST_PROTOCOL)

wv_dict, wv_arr, wv_size = load_word_vectors(embedding_path, 'glove.840B', 300)
pretrained_emb = []
for word in tokens:
    if word in wv_dict:
        pretrained_emb.append(wv_arr[wv_dict[word]].numpy())
    else:
        # Out-of-vocabulary words get a small random vector.
        pretrained_emb.append(np.random.uniform(-0.05, 0.05, size=[300]))
pretrained_emb = np.stack(pretrained_emb)
assert pretrained_emb.shape == (len(tokens), 300)

model = StackBiLSTMMaxout(h_size=[512, 1024, 2048], v_size=len(tokens),
                          d=300, mlp_d=1600, dropout_r=0.1,
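# Editor's sketch, not part of the original source: the usual way a matrix
# like pretrained_emb feeds a PyTorch model. In current PyTorch,
# nn.Embedding.from_pretrained copies the weights; freeze=False keeps them
# trainable, matching the fine-tuning setup implied above.
import torch
import torch.nn as nn

embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(pretrained_emb).float(), freeze=False)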
    vocab |= set(left)
    vocab |= set(right)
for pair in test_pairs_m:
    left = pair[0]
    right = pair[1]
    vocab |= set(left)
    vocab |= set(right)
for pair in test_pairs_um:
    left = pair[0]
    right = pair[1]
    vocab |= set(left)
    vocab |= set(right)
tokens = list(vocab)
#for line in open(basepath + '/vocab.txt'):
#    tokens.append(line.strip().decode('utf-8'))

wv_dict, wv_arr, wv_size = load_word_vectors(embedding_path, 'glove.840B',
                                             EMBEDDING_DIM)
tokens.append('oov')
tokens.append('bos')
pretrained_emb = np.zeros(shape=(len(tokens), EMBEDDING_DIM))
# Table of 100 fixed random vectors for out-of-vocabulary words.
oov = {}
for id in range(100):
    oov[id] = torch.normal(torch.zeros(EMBEDDING_DIM), std=1)
id = 0
for word in tokens:
    try:
        dict[word] = wv_arr[wv_dict[word]] / torch.norm(
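# Editor's sketch, not part of the original source: the oov table above holds
# 100 random vectors. A common use for such a table (assumed here; the source
# is cut off) is to hash every unseen word into a fixed bucket so it always
# receives the same random vector. Note that Python 3's hash() varies across
# processes unless PYTHONHASHSEED is set.
def oov_vector(word):
    return oov[hash(word) % 100]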
print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % args.batch_size)

print('Building model...')
model = EncoderDecoder(
    (args.layers, args.layers),  # was (args.hid_dim, args.hid_dim)
    args.emb_dim,
    args.hid_dim,
    args.att_dim,
    src_dict,
    att_type=args.att_type,
    dropout=args.dropout,
    bidi=args.bidi,
    cell=args.cell)

# Load Glove Pretrained Embeddings
if args.pretrained != 'empty':
    wv_dict, wv_arr, wv_size = load_word_vectors(
        args.pretrained, 'glove.6B', 50)
    print('Loaded', len(wv_arr), 'words from pretrained embeddings.')
    model.emb_dim = wv_size
    wv_list = list(wv_dict)
    model.load_embeddings(wv_arr, wv_list, verbose=False)

# Optimisation
optimizer = Optimizer(
    model.parameters(), args.optim, args.learning_rate, args.max_grad_norm,
    lr_decay=args.learning_rate_decay, start_decay_at=args.start_decay_at)
criterion = make_criterion(len(src_dict), src_dict.get_pad())

model.apply(u.make_initializer(
    rnn={'type': 'orthogonal', 'args': {'gain': 1.0}}))
print('* number of parameters: %d' % model.n_params())
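# Editor's sketch, not part of the original source: one plausible shape for
# the model.load_embeddings(...) call above -- copy each GloVe row into the
# embedding weight of every word the model's dictionary shares with GloVe.
# The attribute names (self.embeddings, src_dict.s2i) are assumptions.
def load_embeddings(self, weight, words, verbose=False):
    for idx, word in enumerate(words):
        if word in self.src_dict.s2i:
            self.embeddings.weight.data[self.src_dict.s2i[word]] = weight[idx]
        elif verbose:
            print('skipping word not in model vocabulary:', word)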
def main(args):
    #torch.manual_seed(123)
    EMBEDDING_DIM = 200
    HIDDEN_DIM = 250
    num_epochs = 20
    task = args.task
    granularity = args.granularity
    dict = {}  # note: shadows the builtin `dict`; name kept to match the model API
    dict_char_ngram = {}
    word_freq = {}
    fake_dict = {}
    oov = []
    feature_maps = [50, 100, 150, 200, 200, 200, 200]
    kernels = [1, 2, 3, 4, 5, 6, 7]
    charcnn_embedding_size = 15
    max_word_length = 20
    c2w_mode = False
    character_ngrams = 3
    character_ngrams_2 = None
    character_ngrams_overlap = False
    glove_mode = None
    update_inv_mode = None
    update_oov_mode = None
    combine_mode = None
    lm_mode = None
    word_mode = (glove_mode, update_inv_mode, update_oov_mode)
    basepath = os.path.dirname(os.path.abspath(__file__))

    if task == 'url':
        num_class = 2
        trainset = readURLdata(basepath + '/data/url/train/', granularity)
        testset = readURLdata(basepath + '/data/url/test/', granularity)
    elif task == 'msrp':
        num_class = 2
        trainset = readURLdata(basepath + '/data/msrp/train/', granularity)
        testset = readURLdata(basepath + '/data/msrp/test/', granularity)
    elif task == 'pit':
        num_class = 2
        trainset = readPITdata(basepath + '/data/pit/train/', granularity)
        testset = readPITdata(basepath + '/data/pit/test/', granularity)
    else:
        print('wrong input for the first argument!')
        sys.exit()

    if granularity == 'char':
        # charcnn parameters
        feature_maps = [50, 100, 150, 200, 200, 200, 200]
        kernels = [1, 2, 3, 4, 5, 6, 7]
        charcnn_embedding_size = 15
        max_word_length = 20
        # c2w parameters
        lm_mode = bool(args.language_model)
        c2w_mode = (args.char_assemble == 'c2w')
        character_ngrams = args.char_ngram
        character_ngrams_overlap = False
        # Build the token set from both sides of the train and test pairs.
        tokens = set()
        for dataset in (trainset, testset):
            lsents, rsents, labels = dataset
            for sent in lsents + rsents:
                for word in sent:
                    tokens.add(word)
        tokens = list(tokens)
        org_tokens = tokens[:]
        tokens.append('<s>')
        tokens.append('</s>')
        tokens.append('oov')
        # word_freq = pickle.load(open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        word_freq = {}
        files = ['/train/a.toks', '/train/b.toks', '/test/a.toks', '/test/b.toks']
        for filename in files:
            for line in open(basepath + '/data/' + task + filename):
                for word in line.strip().split():
                    try:
                        word_freq[word] += 1
                    except KeyError:
                        word_freq[word] = 1
        if c2w_mode:
            EMBEDDING_DIM = 200
        else:
            EMBEDDING_DIM = 1100
        # Collect the character n-gram vocabulary for the requested n and
        # overlap setting, then index it once after the branches.
        ngram_set = set()
        if character_ngrams == 1:
            for word in tokens:
                for ch in word:
                    ngram_set.add(ch)
        elif character_ngrams == 2 and character_ngrams_overlap:
            for word in tokens:
                if len(word) <= 2:
                    ngram_set.add(word)
                else:
                    for i in range(len(word) - 1):
                        ngram_set.add(word[i:i + 2])
        elif character_ngrams == 2 and not character_ngrams_overlap:
            for word in tokens:
                if len(word) <= 2:
                    ngram_set.add(word)
                else:
                    for i in range(0, len(word) - 1, 2):
                        ngram_set.add(word[i:i + 2])
                    if len(word) % 2 == 1:
                        ngram_set.add(word[len(word) - 1])
        elif character_ngrams == 3 and character_ngrams_overlap:
            for word in tokens:
                if len(word) <= 3:
                    ngram_set.add(word)
                else:
                    for i in range(len(word) - 2):
                        ngram_set.add(word[i:i + 3])
        elif character_ngrams == 3 and not character_ngrams_overlap:
            for word in tokens:
                if len(word) <= 3:
                    ngram_set.add(word)
                else:
                    for i in range(0, len(word) - 2, 3):
                        ngram_set.add(word[i:i + 3])
                    if len(word) % 3 == 1:
                        ngram_set.add(word[len(word) - 1])
                    elif len(word) % 3 == 2:
                        ngram_set.add(word[len(word) - 2:])
        dict_char_ngram = {unit: idx for idx, unit in enumerate(ngram_set)}
        dict_char_ngram[' '] = len(dict_char_ngram)
        print('current task: ' + task + ', lm mode: ' + str(lm_mode) +
              ', c2w mode: ' + str(c2w_mode) + ', n = ' + str(character_ngrams) +
              ', overlap = ' + str(character_ngrams_overlap) + '.')
    elif granularity == 'word':
        tokens = []
        count = 0
        num_inv = 0
        num_oov = 0
        glove_mode = bool(args.pretrained)
        update_inv_mode = False
        update_oov_mode = False
        word_mode = (glove_mode, update_inv_mode, update_oov_mode)
        if task == 'msrp':
            tokens = set()
            for dataset in (trainset, testset):
                lsents, rsents, labels = dataset
                for sent in lsents + rsents:
                    for word in sent:
                        tokens.add(word)
            tokens = list(tokens)
            tokens.append('oov')
            dict = {}
            EMBEDDING_DIM = 300
            # Alternatives in the source: glove.twitter.27B or paragram vectors.
            wv_dict, wv_arr, wv_size = load_word_vectors(
                expanduser("~") + '/Documents/research/pytorch/DeepPairWiseWord'
                + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM)
            for word in tokens:
                fake_dict[word] = torch.Tensor(
                    [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except KeyError:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor(
                        [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
        elif task == 'url' or task == 'pit':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            tokens.append('oov')
            dict = {}
            EMBEDDING_DIM = 200
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B',
                EMBEDDING_DIM)
            num_oov = 0
            num_inv = 0
            for word in tokens:
                fake_dict[word] = torch.Tensor(
                    [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except KeyError:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor(
                        [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        print('finished loading word vectors; there are ' + str(num_inv) +
              ' INV words and ' + str(num_oov) + ' OOV words.')
        print('current task: ' + task + ', glove mode = ' + str(glove_mode) +
              ', update_inv_mode = ' + str(update_inv_mode) +
              ', update_oov_mode = ' + str(update_oov_mode))
        saved_file = ('current task: ' + task + ', glove mode = ' +
                      str(glove_mode) + ', update_inv_mode = ' +
                      str(update_inv_mode) + ', update_oov_mode = ' +
                      str(update_oov_mode) + '.txt')
    else:
        print('wrong input for the second argument!')
        sys.exit()

    model = DeepPairWiseWord(EMBEDDING_DIM, HIDDEN_DIM, 1, task, granularity,
                             num_class, dict, fake_dict, dict_char_ngram, oov,
                             tokens, word_freq, feature_maps, kernels,
                             charcnn_embedding_size, max_word_length,
                             character_ngrams, c2w_mode,
                             character_ngrams_overlap, word_mode, combine_mode,
                             lm_mode, args.deep_CNN)
    if torch.cuda.is_available():
        model = model.cuda()
    lsents, rsents, labels = trainset
    criterion = nn.MultiMarginLoss(p=1, margin=1.0, weight=None,
                                   size_average=True)
    if torch.cuda.is_available():
        criterion = criterion.cuda()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001)
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Train the model.
    print('start training')
    max_result = -1
    batch_size = 32
    report_interval = 50000
    for epoch in range(num_epochs):
        print('--' * 20)
        model.train()
        optimizer.zero_grad()
        start_time = time.time()
        data_loss = 0
        indices = torch.randperm(len(lsents))
        train_correct = 0
        for index, i in enumerate(indices):
            sentA = lsents[i]
            sentB = rsents[i]
            if task in ('sick', 'sts', 'snli', 'wiki'):
                label = Variable(torch.Tensor(labels[i]))
            else:
                label = Variable(torch.LongTensor(labels[i]))
            if torch.cuda.is_available():
                label = label.cuda()
            output, extra_loss = model(sentA, sentB, index)
            loss = criterion(output, label) + extra_loss
            loss.backward()
            data_loss += loss.data[0]
            output = np.exp(output.data[0].cpu().numpy())
            if labels[i][0] == np.argmax(output):
                train_correct += 1
            # Gradients accumulate over batch_size examples before each step.
            if (index + 1) % batch_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            if (index + 1) % report_interval == 0:
                msg = '%d completed epochs, %d batches' % (epoch, index + 1)
                msg += '\t train batch loss: %f' % (data_loss / (index + 1))
                train_acc = train_correct / (index + 1)
                print(msg)
            if (index + 1) % (int(len(lsents) / 2)) == 0:
                # Evaluate on the test set twice per epoch.
                model.eval()
                test_lsents, test_rsents, test_labels = testset
                predicted = []
                gold = []
                correct = 0
                for test_i in range(len(test_lsents)):
                    sentA = test_lsents[test_i]
                    sentB = test_rsents[test_i]
                    output, _ = model(sentA, sentB, index)
                    output = np.exp(output.data[0].cpu().numpy())
                    if test_labels[test_i][0] == np.argmax(output):
                        correct += 1
                    predicted.append(output[1])
                    gold.append(test_labels[test_i][0])
                _, result = URL_maxF1_eval(predict_result=predicted,
                                           test_data_label=gold)
                if result > max_result:
                    max_result = result
        elapsed_time = time.time() - start_time
        print('Epoch ' + str(epoch + 1) + ' finished within ' +
              str(timedelta(seconds=elapsed_time)) + ', and current time:' +
              str(datetime.now()))
        print('Best result until now: %.6f' % max_result)
        model.train()
parser = argparse.ArgumentParser(description='order2taskplan-pytorch')
parser.add_argument('--resume', '-r',
                    help='use checkpoint model parameters as initial parameters (default: False)',
                    action="store_true")
parser.add_argument('--pretrained', '-p',
                    help='use checkpoint model parameters and do not train anymore (default: False)',
                    action="store_true")
parser.add_argument('--epochs', default=20000, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=1, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
args = parser.parse_args()

torch.backends.cudnn.benchmark = True

wv_dict, wv_arr, wv_size = load_word_vectors('../data/glove', 'glove.6B', 300)
wv_index2dict = {v: k for k, v in wv_dict.items()}
print('Loaded', len(wv_arr), 'words')

def get_word(word):
    return wv_arr[wv_dict[word]]

def closest(d, n=10):
    # Brute-force nearest neighbours: one torch.dist call per vocabulary word.
    t1 = time.time()
    all_dists = [(w, torch.dist(d, get_word(w))) for w in wv_dict]
    sorted_dists = sorted(all_dists, key=lambda t: t[1])[:n]
    t2 = time.time()
    print(t2 - t1)
    return sorted_dists

def closest2(d, n=10):
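    # Editor's completion; the body of closest2 is cut off in the source. A
    # natural (assumed, not recovered) body is a vectorized version of
    # closest() that computes all distances in one tensor expression instead
    # of one torch.dist call per word.
    dists = torch.norm(wv_arr - d.unsqueeze(0), dim=1)
    _, best = torch.topk(dists, n, largest=False)
    return [(wv_index2dict[i], dists[i].item()) for i in best.tolist()]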
def __init__(self, size):
    self.size = size
    glove_dict, glove_arr, glove_size = load_word_vectors(
        'data/', 'glove.twitter.27B', size)
    self.glove_dict = glove_dict
    self.glove_arr = glove_arr
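# Editor's sketch, not part of the original source: a lookup helper that could
# accompany the wrapper above; the method name and the zero-vector fallback
# for out-of-vocabulary words are assumptions, not taken from the repository.
import torch

def vector(self, word):
    if word in self.glove_dict:
        return self.glove_arr[self.glove_dict[word]]
    return torch.zeros(self.size)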
def main(args):
    #torch.manual_seed(123)
    EMBEDDING_DIM = 200
    HIDDEN_DIM = 250
    num_epochs = 20
    task = args.task
    granularity = args.granularity
    dict = {}  # note: shadows the builtin `dict`; name kept to match the model API
    dict_char_ngram = {}
    word_freq = {}
    fake_dict = {}
    oov = []
    feature_maps = [50, 100, 150, 200, 200, 200, 200]
    kernels = [1, 2, 3, 4, 5, 6, 7]
    charcnn_embedding_size = 15
    max_word_length = 20
    c2w_mode = False
    character_ngrams = 3
    character_ngrams_2 = None
    character_ngrams_overlap = False
    glove_mode = None
    update_inv_mode = None
    update_oov_mode = None
    combine_mode = None
    lm_mode = None
    word_mode = (glove_mode, update_inv_mode, update_oov_mode)
    if torch.cuda.is_available():
        basepath = expanduser("~") + '/pytorch/DeepPairWiseWord'
    else:
        basepath = expanduser("~") + '/Documents/research/pytorch/DeepPairWiseWord'

    if task == 'url':
        num_class = 2
        trainset = readURLdata(basepath + '/data/url/train/', granularity)
        testset = readURLdata(basepath + '/data/url/test_9324/', granularity)
    elif task == 'quora':
        num_class = 2
        trainset = readURLdata(basepath + '/data/quora/train/', granularity)
        testset = readURLdata(basepath + '/data/quora/test/', granularity)
    elif task == 'msrp':
        num_class = 2
        trainset = readURLdata(basepath + '/data/msrp/train/', granularity)
        testset = readURLdata(basepath + '/data/msrp/test/', granularity)
    elif task == 'sick':
        num_class = 5
        trainset = readSICKdata(basepath + '/data/sick/train/', granularity)
        devset = readSICKdata(basepath + '/data/sick/dev/', granularity)
        testset = readSICKdata(basepath + '/data/sick/test/', granularity)
    elif task == 'pit':
        num_class = 2
        trainset = readPITdata(basepath + '/data/pit/train/', granularity)
        #devset = readPITdata(basepath + '/data/pit/dev/', granularity)
        testset = readPITdata(basepath + '/data/pit/test/', granularity)
    elif task == 'hindi':
        num_class = 2
        trainset = read_Hindi_data(basepath + '/data/hindi/train/', granularity)
        testset = read_Hindi_data(basepath + '/data/hindi/test/', granularity)
    elif task == 'sts':
        num_class = 6
        trainset = readSTSdata(basepath + '/data/sts/train/', granularity)
        testset = readSTSdata(basepath + '/data/sts/test/', granularity)
    elif task == 'snli':
        num_class = 3
        trainset = readSNLIdata(basepath + '/data/snli/train/', granularity)
        testset = readSNLIdata(basepath + '/data/snli/test/', granularity)
    elif task == 'mnli':
        num_class = 3
        trainset = readMNLIdata(basepath + '/data/mnli/train/', granularity)
        devset_m = readMNLIdata(basepath + '/data/mnli/dev_m/', granularity)
        devset_um = readMNLIdata(basepath + '/data/mnli/dev_um/', granularity)
        testset_m = readMNLIdata(basepath + '/data/mnli/test_m/', granularity)
        testset_um = readMNLIdata(basepath + '/data/mnli/test_um/', granularity)
    elif task == 'wiki':
        # Label ids:
        # counter-vandalism: 0, fact-update: 1, refactoring: 2, copy-editing: 3,
        # other: 4, wikification: 5, vandalism: 6, simplification: 7,
        # elaboration: 8, verifiability: 9, process: 10, clarification: 11,
        # disambiguation: 12, point-of-view: 13
        num_class = 14
        data = pickle.load(open(basepath + "/data/wiki/data.cpickle", "rb"))
        id = []

        def build_pairs(index_range):
            # For each edit, drop tokens shared by both versions and fall back
            # to a sentinel token when nothing is left.
            left, right, label = [], [], []
            for i in index_range:
                id.append(data[i][0])
                label.append([int(item) for item in data[i][3][0]])
                left_sent = [item for item in data[i][1][0]]
                right_sent = [item for item in data[i][2][0]]
                shared = [item for item in left_sent if item in right_sent]
                for item in shared:
                    if item in left_sent and item in right_sent:
                        left_sent.remove(item)
                        right_sent.remove(item)
                if len(left_sent) == 0:
                    left_sent = ['<EMPTY-EDIT>']
                if len(right_sent) == 0:
                    right_sent = ['<EMPTY-EDIT>']
                left.append(left_sent)
                right.append(right_sent)
            return left, right, label

        trainset = build_pairs(range(2976))
        testset = build_pairs(range(2376, 2976))  # note: overlaps the train range in the source
    elif task == 'wikiqa':
        num_class = 2
        trainset = readURLdata(basepath + '/data/wikiqa/train/', granularity)
        testset = readURLdata(basepath + '/data/wikiqa/test/', granularity)
    elif task == 'trecqa':
        num_class = 2
        trainset = readURLdata(basepath + '/data/trecqa/train-all/', granularity)
        testset = readURLdata(basepath + '/data/trecqa/raw-test/', granularity)
    else:
        print('wrong input for the first argument!')
        sys.exit()

    if granularity == 'word':
        tokens = []
        count = 0
        num_inv = 0
        num_oov = 0
        glove_mode = True
        update_inv_mode = True
        update_oov_mode = True
        word_mode = (glove_mode, update_inv_mode, update_oov_mode)

        def fill_dicts(lookup):
            # Shared by the task branches below: in-vocabulary (INV) words get
            # their pretrained vector, out-of-vocabulary (OOV) words a small
            # random one; fake_dict always gets a random vector.
            inv = oov_count = 0
            for word in tokens:
                fake_dict[word] = torch.Tensor(
                    [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
                try:
                    dict[word] = lookup(word)
                    inv += 1
                except KeyError:
                    oov_count += 1
                    oov.append(word)
                    dict[word] = torch.Tensor(
                        [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
            return inv, oov_count

        if task == 'sick' or task == 'quora' or task == 'msrp':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 300
            # Alternatives in the source: glove.twitter.27B (d=200) or paragram (d=300).
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B',
                EMBEDDING_DIM)
            num_inv, num_oov = fill_dicts(lambda w: wv_arr[wv_dict[w]])
        elif task == 'sts':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 300
            # Alternatives in the source: glove.twitter.27B (d=200) or glove.840B (d=300).
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/data/paragram/paragram_300_sl999/', 'paragram',
                EMBEDDING_DIM)
            num_inv, num_oov = fill_dicts(lambda w: wv_arr[wv_dict[w]])
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        elif task in ('snli', 'wikiqa', 'trecqa', 'mnli'):
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 300
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B',
                EMBEDDING_DIM)
            num_inv, num_oov = fill_dicts(lambda w: wv_arr[wv_dict[w]])
            dict_char_ngram = {}
            word_freq = {}
        elif task == 'hindi':
            # Pretrained Hindi vectors stored as a gensim KeyedVectors file.
            embeddings_file_bin = basepath + '/data/hindi/hi/hi.bin'
            model_bin = KeyedVectors.load(embeddings_file_bin)
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 300
            num_inv, num_oov = fill_dicts(lambda w: model_bin[w])
        elif task == 'url' or task == 'pit':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 200
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B',
                EMBEDDING_DIM)
            num_inv, num_oov = fill_dicts(lambda w: wv_arr[wv_dict[w]])
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        print('finished loading word vectors; there are ' + str(num_inv) +
              ' INV words and ' + str(num_oov) + ' OOV words.')
        print('current task: ' + task + ', glove mode = ' + str(glove_mode) +
              ', update_inv_mode = ' + str(update_inv_mode) +
              ', update_oov_mode = ' + str(update_oov_mode))
        saved_file = ('current task: ' + task + ', glove mode = ' +
                      str(glove_mode) + ', update_inv_mode = ' +
                      str(update_inv_mode) + ', update_oov_mode = ' +
                      str(update_oov_mode) + '.txt')
    elif granularity == 'char':
        # charcnn parameters
        feature_maps = [50, 100, 150, 200, 200, 200, 200]
        kernels = [1, 2, 3, 4, 5, 6, 7]
        charcnn_embedding_size = 15
        max_word_length = 20
        # c2w parameters
        lm_mode = False
        c2w_mode = False
        character_ngrams = 1
        character_ngrams_overlap = True
        tokens = []
        if task != 'wiki':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            if task != 'hindi':
                org_tokens = tokens[:]
            tokens.append('<s>')
            tokens.append('</s>')
            tokens.append('oov')
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        if c2w_mode:
            EMBEDDING_DIM = 200
        else:
            EMBEDDING_DIM = 1100
        if character_ngrams == 1:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
        elif character_ngrams == 2 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict.p', "rb"))
        elif character_ngrams == 2 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb"))
        elif character_ngrams == 3 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict.p', "rb"))
        elif character_ngrams == 3 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb"))
        print('current task: ' + task + ', lm mode: ' + str(lm_mode) +
              ', c2w mode: ' + str(c2w_mode) + ', n = ' + str(character_ngrams) +
              ', overlap = ' + str(character_ngrams_overlap) + '.')
        saved_file = ('current task: ' + task + ', lm mode: ' + str(lm_mode) +
                      ', c2w mode: ' + str(c2w_mode) + ', n = ' +
                      str(character_ngrams) + ', overlap = ' +
                      str(character_ngrams_overlap) + '.txt')
    elif granularity == 'mix':
        tokens = []
        num_oov = 0
        num_inv = 0
        for line in open(basepath + '/data/' + task + '/vocab.txt'):
            tokens.append(line.strip())
        tokens.append('<s>')
        tokens.append('</s>')
        tokens.append('oov')
        dict = {}
        if task == 'sts':
            EMBEDDING_DIM = 300
            # Alternative in the source: glove.840B (d=300).
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/data/paragram/paragram_300_sl999/', 'paragram',
                EMBEDDING_DIM)
        else:
            EMBEDDING_DIM = 200
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B',
                EMBEDDING_DIM)
        oov = []
        for word in tokens:
            try:
                dict[word] = wv_arr[wv_dict[word]]
                num_inv += 1
            except KeyError:
                num_oov += 1
                oov.append(word)
                dict[word] = torch.Tensor(
                    [random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])
        lm_mode = False
        # Other combine modes in the source: 'concat', 'g_0.25', 'g_0.50',
        # 'adaptive', 'attention', 'backoff'.
        combine_mode = 'g_0.75'
        # c2w parameters
        c2w_mode = False
        character_ngrams = 1
        character_ngrams_overlap = False
        if character_ngrams == 1:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
        elif character_ngrams == 2 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict.p', "rb"))
        elif character_ngrams == 2 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb"))
        elif character_ngrams == 3 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict.p', "rb"))
        elif character_ngrams == 3 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb"))
        word_freq = pickle.load(
            open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        print('current task: ' + task + ', lm mode: ' + str(lm_mode) +
              ', combination mode: ' + combine_mode + ', c2w mode: ' +
              str(c2w_mode) + ', n = ' + str(character_ngrams) +
              ', overlap = ' + str(character_ngrams_overlap) + '.')
        print('finished loading word & char table, there are ' + str(num_inv) +
              ' INV words and ' + str(num_oov) + ' OOV words.')
    elif granularity == 'cross':
        oov = []
        dict_char = []
        tokens = []
        word_freq = []
        overlap = True
        if overlap:
            dict_ngram = pickle.load(
                open(basepath + '/data/' + task + '/cross_trigram_dict.p', "rb"))
        else:
            dict_ngram = pickle.load(
                open(basepath + '/data/' + task + '/cross_trigram_dict_no_overlap.p', "rb"))
    else:
        print('wrong input for the second argument!')
        sys.exit()

    model = DeepPairWiseWord(EMBEDDING_DIM, HIDDEN_DIM, 1, task, granularity,
                             num_class, dict, fake_dict, dict_char_ngram, oov,
                             tokens, word_freq, feature_maps, kernels,
                             charcnn_embedding_size, max_word_length,
                             character_ngrams, c2w_mode,
                             character_ngrams_overlap, word_mode, combine_mode,
                             lm_mode)
    if torch.cuda.is_available():
        model = model.cuda()
    lsents, rsents, labels = trainset

    # Loss and optimizer
    if task == 'sick' or task == 'sts' or task == 'snli':
        indices = torch.randperm(len(lsents))
        print('indices:')
        print(indices[:10])
        criterion = nn.KLDivLoss()
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    elif task in ('url', 'pit', 'hindi', 'quora', 'msrp', 'wikiqa', 'trecqa',
                  'mnli'):
        indices = torch.randperm(len(lsents))
        criterion = nn.MultiMarginLoss(p=1, margin=1.0, weight=None,
                                       size_average=True)
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    elif task == 'wiki':
        indices = torch.randperm(len(lsents))
        print('indices:')
        print(indices[:10])
        criterion = nn.MultiLabelSoftMarginLoss()
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001)
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Train the model.
    print('start training')
    gold = []
    gold_um = []
    if task == 'url':
        for line in open(basepath + '/data/' + task + '/test_9324/sim.txt'):
            gold.append(int(line.strip()))
    elif task == 'snli':
        for line in open(basepath + '/data/' + task + '/test/sim.txt'):
            gold.append(line.strip())
    elif task == 'trecqa':
        for line in open(basepath + '/data/' + task + '/raw-test/sim.txt'):
            gold.append(float(line.strip()))
    elif task == 'mnli':
        pass  # gold labels for the MNLI test sets are not available locally
    else:
        for line in open(basepath + '/data/' + task + '/test/sim.txt'):
            gold.append(float(line.strip()))

    max_result = -1
    max_result_um = -1
    batch_size = 32
    report_interval = 50000
    for epoch in range(num_epochs):
        print('--' * 20)
        model.train()
        optimizer.zero_grad()
        start_time = time.time()
        data_loss = 0
        indices = torch.randperm(len(lsents))
        train_correct = 0
        for index, i in enumerate(indices):
            # Every granularity feeds the raw token sequences to the model,
            # which performs embedding lookup internally.
            sentA = lsents[i]
            sentB = rsents[i]
            if task in ('sick', 'sts', 'snli', 'wiki'):
                label = Variable(torch.Tensor(labels[i]))
            else:
                label = Variable(torch.LongTensor(labels[i]))
            if torch.cuda.is_available():
                label = label.cuda()
            # Forward + backward + optimize
            output, extra_loss = model(sentA, sentB, index)
            loss = criterion(output, label) + extra_loss
            loss.backward()
            data_loss += loss.data[0]
            output = np.exp(output.data[0].cpu().numpy())
            if labels[i][0] == np.argmax(output):
                train_correct += 1
            # Gradients accumulate over batch_size examples before each step.
            if (index + 1) % batch_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            if (index + 1) % report_interval == 0:
                msg = '%d completed epochs, %d batches' % (epoch, index + 1)
                msg += '\t train batch loss: %f' % (data_loss / (index + 1))
                train_acc = train_correct / (index + 1)
                msg += '\t train accuracy: %f' % train_acc
                print(msg)
            if (index + 1) % (int(len(lsents) / 2)) == 0:
                # Evaluate twice per epoch.
                if task in ('sick', 'sts', 'snli', 'wiki'):
                    model.eval()
                    test_lsents, test_rsents, test_labels = testset
                    predicted = []
                    tmp_result = 0
                    for test_i in range(len(test_lsents)):
                        sentA = test_lsents[test_i]
                        sentB = test_rsents[test_i]
                        raw_output, _ = model(sentA, sentB, index)
                        if task == 'sick':
                            # Expected rating over the 1..5 relatedness scale.
                            output = np.exp(raw_output.data[0].cpu().numpy())
                            predicted.append(1 * output[0] + 2 * output[1] +
                                             3 * output[2] + 4 * output[3] +
                                             5 * output[4])
                        elif task == 'snli':
                            output = np.exp(raw_output.data[0].cpu().numpy())
                            output = [output[0], output[1], output[2]]
                            tmp_output = output.index(max(output))
                            predicted.append(tmp_output)
                            if test_labels[test_i].index(
                                    max(test_labels[test_i])) == tmp_output:
                                tmp_result += 1
                        elif task == 'wiki':
                            # Multi-label: threshold each sigmoid output at 0.5.
                            output = torch.sigmoid(raw_output).data > 0.5
                            output = output.cpu()
                            predicted = list(output.numpy()[0])
                            if predicted == test_labels[test_i]:
                                tmp_result += 1
                        else:
                            # STS: expected rating over the 0..5 scale.
                            output = np.exp(raw_output.data[0].cpu().numpy())
                            predicted.append(0 * output[0] + 1 * output[1] +
                                             2 * output[2] + 3 * output[3] +
                                             4 * output[4] + 5 * output[5])
                    if task == 'sick':
                        result = pearson(predicted, gold)
                        print('Test Correlation: %.6f' % result)
                        if result > max_result:
                            max_result = result
                    elif task == 'snli' or task == 'wiki':
                        result = tmp_result / len(test_lsents)
                        print('Test Accuracy: %.6f' % result)
                        if result > max_result:
                            max_result = result
                    else:
                        # STS 2014: per-genre Pearson, then the official
                        # weighted mean.
                        result1 = pearson(predicted[0:450], gold[0:450])
                        result2 = pearson(predicted[450:750], gold[450:750])
                        result3 = pearson(predicted[750:1500], gold[750:1500])
                        result4 = pearson(predicted[1500:2250], gold[1500:2250])
                        result5 = pearson(predicted[2250:3000], gold[2250:3000])
                        result6 = pearson(predicted[3000:3750], gold[3000:3750])
                        print('deft-forum: %.6f, deft-news: %.6f, '
                              'headlines: %.6f, images: %.6f, OnWN: %.6f, '
                              'tweet-news: %.6f' %
                              (result1, result2, result3, result4, result5,
                               result6))
                        wt_mean = (0.12 * result1 + 0.08 * result2 +
                                   0.2 * result3 + 0.2 * result4 +
                                   0.2 * result5 + 0.2 * result6)
                        print('weighted mean: %.6f' % wt_mean)
                        if wt_mean > max_result:
                            max_result = wt_mean
                    if task == 'sts':
                        with open(basepath + '/data/sts/sts_PWIM_prob.txt',
                                  'w') as f:
                            for item in predicted:
                                f.writelines(str(item) + '\n')
                else:
                    model.eval()
                    msg = '%d completed epochs, %d batches' % (epoch, index + 1)
                    if task == 'mnli':
                        test_lsents, test_rsents, test_labels = devset_m
                    else:
                        test_lsents, test_rsents, test_labels = testset
                    predicted = []
                    correct = 0
                    for test_i in range(len(test_lsents)):
                        sentA = test_lsents[test_i]
                        sentB = test_rsents[test_i]
                        output, _ = model(sentA, sentB, index)
                        output = np.exp(output.data[0].cpu().numpy())
                        if test_labels[test_i][0] == np.argmax(output):
                            correct += 1
                        predicted.append(output[1])
                    result = correct / len(test_lsents)
                    msg += '\t dev m accuracy: %f' % result
                    print(msg)
                    if result > max_result:
                        max_result = result
                        # Write a fresh matched-test submission whenever the
                        # matched dev accuracy improves.
                        test_lsents, test_rsents, test_labels = testset_m
                        predicted = []
                        for test_i in range(len(test_lsents)):
                            if granularity == 'word':
                                sentA = test_lsents[test_i]
                                sentB = test_rsents[test_i]
                            output, _ = model(sentA, sentB, index)
                            output = np.exp(output.data[0].cpu().numpy())
                            predicted.append(np.argmax(output))
                        with open(basepath + '/sub_m.csv', 'w+') as f:
                            label_dict = ['neutral', 'entailment',
                                          'contradiction']
                            f.write("pairID,gold_label\n")
                            for i, k in enumerate(predicted):
                                f.write(str(i + 9847) + "," + label_dict[k] +
                                        "\n")
                    if task == 'mnli':
                        msg = '%d completed epochs, %d batches' % (epoch,
                                                                   index + 1)
                        test_lsents, test_rsents, test_labels = devset_um
                        predicted = []
                        correct = 0
                        for test_i in range(len(test_lsents)):
                            if granularity == 'word':
                                sentA = test_lsents[test_i]
                                sentB = test_rsents[test_i]
                            output, _ = model(sentA, sentB, index)
                            output = np.exp(output.data[0].cpu().numpy())
                            if test_labels[test_i][0] == np.argmax(output):
                                correct += 1
                            predicted.append(output[1])
                        result_acc = correct / len(test_lsents)
                        msg += '\t dev um accuracy: %f' % result_acc
                        print(msg)
                        if result_acc > max_result_um:
                            max_result_um = result_acc
                            # Same for the unmatched test set.
                            test_lsents, test_rsents, test_labels = testset_um
                            predicted = []
                            for test_i in range(len(test_lsents)):
                                if granularity == 'word':
                                    sentA = test_lsents[test_i]
                                    sentB = test_rsents[test_i]
                                output, _ = model(sentA, sentB, index)
                                output = np.exp(output.data[0].cpu().numpy())
                                predicted.append(np.argmax(output))
                            with open(basepath + '/sub_um.csv', 'w+') as f:
                                label_dict = ['neutral', 'entailment',
                                              'contradiction']
                                f.write("pairID,gold_label\n")
                                for i, k in enumerate(predicted):
                                    f.write(str(i) + "," + label_dict[k] +
                                            "\n")
        elapsed_time = time.time() - start_time
        print('Epoch ' + str(epoch + 1) + ' finished within ' +
              str(timedelta(seconds=elapsed_time)) + ', and current time:' +
              str(datetime.now()))
        print('Best result until now: %.6f' % max_result)
        print('Best um result until now: %.6f' % max_result_um)
        model.train()
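# Editor's sketch, not part of the original source: one plausible shape for
# the URL_maxF1_eval(...) helper used in the training loops above -- sweep a
# decision threshold over the predicted positive-class probabilities and
# report the best F1 (with the accuracy at that threshold). The repository's
# actual implementation may differ.
def max_f1_eval(predict_result, test_data_label):
    best_f1, best_acc = 0.0, 0.0
    for threshold in sorted(set(predict_result)):
        tp = fp = fn = tn = 0
        for p, g in zip(predict_result, test_data_label):
            pred = 1 if p >= threshold else 0
            if pred == 1 and g == 1:
                tp += 1
            elif pred == 1 and g == 0:
                fp += 1
            elif pred == 0 and g == 1:
                fn += 1
            else:
                tn += 1
        if tp > 0:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            f1 = 2 * prec * rec / (prec + rec)
            if f1 > best_f1:
                best_f1 = f1
                best_acc = (tp + tn) / len(test_data_label)
    return best_acc, best_f1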