def train_lm(data_path):
    save_path = os.path.join(
        "/tmp", ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(6)))

    # generate examples
    indices = []
    noise = Variable(torch.ones(100, args.z_size).cuda())
    for i in range(1000):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden, args.maxlen,
                                           sample=args.sample)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path, "w") as f:
        # Laplace smoothing: write each vocabulary word once so the n-gram LM
        # assigns every word a nonzero count
        for word in corpus.dictionary.word2idx.keys():
            f.write(word + '\n')
        for idx in indices:
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + '\n')

    # reverse ppl: train an n-gram LM on the generated sentences and
    # evaluate it on the real test set
    try:
        rev_lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                                data_path=save_path,
                                output_path=save_path + ".arpa",
                                N=args.N)
        with open(os.path.join(args.data_path, 'test.txt'), 'r') as f:
            lines = f.readlines()
        if args.lowercase:
            lines = list(map(lambda x: x.lower(), lines))
        sentences = [l.replace('\n', '') for l in lines]
        rev_ppl = get_ppl(rev_lm, sentences)
    except Exception:
        print("reverse ppl error: the generated file may not be valid for "
              "training an LM")
        rev_ppl = 1e15

    # forward ppl: train an n-gram LM on the real training set and
    # evaluate it on the generated sentences
    for_lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                            data_path=os.path.join(args.data_path, 'train.txt'),
                            output_path=save_path + ".arpa",
                            N=args.N)
    with open(save_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    for_ppl = get_ppl(for_lm, sentences)

    return rev_ppl, for_ppl
def eval_epoch_linear_probe(self, eval_list, epoch, vocab_dict,
                            print_sample=False):
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(self.target_vocab_size))
    vocab_dict_rev = {v: k for k, v in vocab_dict.items()}

    self.linear.eval()
    total_loss = list([])
    map_list = list([])
    ppl_list = list([])
    query_map_list = list([])
    query_ppl_list = list([])
    target_map_list = list([])
    target_ppl_list = list([])

    with torch.no_grad():
        for i, instance in enumerate(eval_list):
            labels_onehot, masks_onehot, labels, _ = self.get_training_labels(
                instance[self.query_indices], instance[self.fact_indices],
                instance[self.negative_indices])

            output_ = self.linear(instance[self.input_type].to(self.device))  # output size is (6600)
            output = nn.functional.sigmoid(output_)

            loss = self.get_loss(
                output,
                torch.tensor(labels_onehot, dtype=torch.float32).to(self.device),
                torch.tensor(masks_onehot, dtype=torch.float32).to(self.device))

            total_loss.append(loss.detach().cpu().numpy())
            map_list.append(get_map(output.detach().cpu().numpy(), labels))
            ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))

            query_labels_eval = np.array(instance[self.query_indices])
            target_labels_eval = np.array(
                list(set(instance[self.fact_indices]) -
                     set(instance[self.query_indices])))

            if len(query_labels_eval) > 0:
                query_map_list.append(
                    get_map(output.detach().cpu().numpy(), query_labels_eval))
                query_ppl_list.append(
                    get_ppl(output.detach().cpu().numpy(), query_labels_eval))
            if len(target_labels_eval) > 0:
                target_map_list.append(
                    get_map(output.detach().cpu().numpy(), target_labels_eval))
                target_ppl_list.append(
                    get_ppl(output.detach().cpu().numpy(), target_labels_eval))

    result_dict = {"eval_loss": total_loss,
                   "avg map": map_list,
                   "avg ppl": ppl_list,
                   "query map:": query_map_list,
                   "query ppl:": query_ppl_list,
                   "target map:": target_map_list,
                   "target ppl:": target_ppl_list}

    print("-" * 20)
    result_summary = {x: sum(result_dict[x]) / len(result_dict[x])
                      for x in result_dict.keys()}
    print(result_summary)

    return result_summary, result_dict
def train_epoch_linear_probe(self, train_list, epoch, save_folder_path):
    self.linear.train()
    total_loss = 0
    random.shuffle(train_list)
    map_list = list([])
    ppl_list = list([])

    for i, instance in enumerate(train_list):
        self.optimizer.zero_grad()
        labels_onehot, masks_onehot, labels, label_masks = self.get_training_labels(
            instance[self.query_indices], instance[self.fact_indices],
            instance[self.negative_indices])

        output_ = self.linear(instance[self.input_type].to(self.device))  # output size is (6600)
        output = nn.functional.sigmoid(output_)

        loss = self.get_loss(
            output,
            torch.tensor(labels_onehot, dtype=torch.float32).to(self.device),
            torch.tensor(masks_onehot, dtype=torch.float32).to(self.device))
        loss.backward()
        self.optimizer.step()

        total_loss += loss.detach().cpu().numpy()
        map_list.append(get_map(output.detach().cpu().numpy(), labels))
        ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))

        # if (i + 1) % 10 == 0:
        #     print("\tsample ", i + 1, " loss:", total_loss / (i + 1))

    print("epoch ", epoch, "\tbert total training loss:",
          total_loss / len(train_list))

    return total_loss / len(train_list)
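# Hedged sketch: get_map and get_ppl are not defined in these probe snippets.
# One plausible implementation is given below, assuming get_map is the mean
# average precision of the gold lemma indices under the predicted score vector
# and get_ppl is the perplexity of those gold indices under the re-normalized
# scores. The exact definitions used by the repo may differ.
import numpy as np

def get_map(scores, gold_indices):
    # rank every vocabulary entry by predicted score, highest first
    order = np.argsort(-scores)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(1, len(scores) + 1)
    gold_ranks = np.sort(ranks[gold_indices])
    # precision at the rank of each gold item, averaged over gold items
    precisions = np.arange(1, len(gold_ranks) + 1) / gold_ranks
    return float(np.mean(precisions))

def get_ppl(scores, gold_indices):
    # normalize the sigmoid outputs into a distribution and exponentiate the
    # mean negative log-probability of the gold indices
    probs = scores / np.sum(scores)
    nll = -np.log(probs[gold_indices] + 1e-12)
    return float(np.exp(np.mean(nll)))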
def compute_ppl(file_path):
    sentences = []
    with open(file_path, 'r') as f:
        lines = f.readlines()
        sentences = [l.replace('\n', '') for l in lines]
    lm = load_ngram_lm(os.environ["ROOT"] + "/kenlm/models/snli_3gram.arpa")
    ppl = get_ppl(lm, sentences)
    return ppl
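# Hedged sketch: load_ngram_lm and the KenLM-based get_ppl are helpers defined
# elsewhere. A minimal version, assuming load_ngram_lm wraps kenlm.Model and
# get_ppl computes corpus-level word perplexity from KenLM's log10 sentence
# scores, could look like this:
import kenlm

def load_ngram_lm(model_path):
    return kenlm.Model(model_path)

def get_ppl(lm, sentences):
    total_logprob = 0.0
    total_words = 0
    for sent in sentences:
        words = sent.strip().split()
        if not words:
            continue
        # kenlm.Model.score returns the log10 probability of the sentence
        total_logprob += lm.score(sent, bos=True, eos=True)
        total_words += len(words) + 1  # +1 for the </s> token
    return 10 ** (-total_logprob / total_words)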
def train_reverse_lm(eval_path, save_path):
    '''
    train reverse LM and calculate reverse perplexity
    eval_path: path to file containing test sentences
    save_path: file name (no extension) for saving generated sentences and ngrams
    '''
    # generate positive and negative examples
    indices = []
    noise = to_gpu(args.cuda,
                   Variable(torch.ones(eval_batch_size, args.z_size)))
    for i in range(1000 // eval_batch_size):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        whichdecoder = int(i % 2 == 0) + 1
        max_indices = autoencoder.generate(
            whichdecoder, hidden=fake_hidden, maxlen=args.maxlen)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path + ".txt", "w") as f:
        # Laplace smoothing
        for word in corpus.dictionary.word2idx.keys():
            f.write(word + "\n")
        for idx in indices:
            # generated sentence
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + "\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path + ".txt",
                        output_path=save_path,
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)
    return ppl
def train_lm(ae_index, eval_path, save_path):
    gan_gen, autoencoder, ae_args = \
        gan_gens[ae_index], autoencoders[ae_index], autoencoders_args[ae_index]

    # generate examples
    indices = []
    noise = to_gpu(args.cuda, Variable(torch.ones(100, args.z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        # print("Calling AE.generate")
        max_indices = autoencoder.generate(fake_hidden, ae_args.maxlen)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path + ".txt", "w") as f:
        # Laplace smoothing
        for word in ae_args.corpus.dictionary.word2idx.keys():
            f.write(word + "\n")
        for idx in indices:
            # generated sentence
            words = [ae_args.corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + "\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path + ".txt",
                        dedup_data_path=save_path + ".uniq.txt",
                        output_path=save_path + ".arpa",
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)
    return ppl
def train_lm(eval_path, save_path):
    # generate examples
    indices = []
    noise = to_gpu(cuda, Variable(torch.ones(100, z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden, maxlen)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    # 1204delete
    # with open(save_path + ".txt", "w") as f:
    #     # Laplace smoothing
    #     for word in corpus.dictionary.word2idx.keys():
    #         f.write(word + "\n")
    #     for idx in indices:
    #         # generated sentence
    #         words = [corpus.dictionary.idx2word[x] for x in idx]
    #         # truncate sentences to first occurrence of <eos>
    #         truncated_sent = []
    #         for w in words:
    #             if w != '<eos>':
    #                 truncated_sent.append(w)
    #             else:
    #                 break
    #         chars = " ".join(truncated_sent)
    #         f.write(chars + "\n")

    # train language model on generated examples
    # lm = train_ngram_lm(kenlm_path=kenlm_path,
    #                     data_path=save_path + ".txt",
    #                     output_path=save_path + ".arpa",
    #                     N=N)
    # NOTE: with the block above commented out, `lm` must already be defined in
    # the enclosing scope, otherwise the call below raises a NameError.

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)
    return ppl
def train_lm(eval_path, save_path):
    # generate examples
    indices = []
    noise = to_gpu(args.cuda, Variable(torch.ones(100, args.z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden, args.maxlen)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path + ".txt", "w") as f:
        # Laplace smoothing
        for word in corpus.dictionary.word2idx.keys():
            f.write(word + "\n")
        for idx in indices:
            # generated sentence
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + "\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path + ".txt",
                        output_path=save_path + ".arpa",
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)
    return ppl
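# Hedged sketch: train_ngram_lm is defined elsewhere in these repos. A minimal
# version, assuming kenlm_path points at a KenLM checkout with a compiled
# build/bin/lmplz binary, would shell out to lmplz to estimate an N-gram model
# over the generated text and then load the resulting ARPA file:
import os
import kenlm

def train_ngram_lm(kenlm_path, data_path, output_path, N):
    # lmplz -o N reads the training text on stdin and writes an ARPA file
    command = "{}/build/bin/lmplz -o {} < {} > {}".format(
        kenlm_path, N, os.path.abspath(data_path), os.path.abspath(output_path))
    os.system(command)
    return kenlm.Model(output_path)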
with open(ft_train_file, 'w') as f:
    for sent in original1:
        f.write("__label__1 " + sent + "\n")
    for sent in original2:
        f.write("__label__2 " + sent + "\n")

ft_file = "{}/eval/sentiment_epoch{}.ft".format(args.load_path, args.epoch)
with open(ft_file, 'w') as f:
    for sent in transfer1:
        f.write("__label__2 " + sent + "\n")
    for sent in transfer2:
        f.write("__label__1 " + sent + "\n")

# Perplexity (NOT reverse ppl)
model = kenlm.Model(args.lm_path)
ppl = get_ppl(model, transfer1 + transfer2)
print("Perplexity: {}".format(ppl))

curdir = os.getcwd()

# BLEU
print("\nBLEU")
BLEU_CMD = "perl ./tool/multi-bleu.perl -lc {} < {}".format(original_file,
                                                            transfer_file)
result = subprocess.check_output(BLEU_CMD, shell=True)  # os.system(BLEU_CMD)
print(result)

# FastText
print("\nFast Text")
FT_CMD = "cd ~/fastText-0.1.0; ./fasttext supervised -input {} -output {}; ./fasttext test {} {} 1".format(
def experiments_squad_manual_check(device, data_partition="train",
                                   print_text=False, embd_type="useqa",
                                   label_type="gold", seed=0, epoch=1):

    def get_training_labels(label_binarizer, query_indices, fact_indices,
                            negative_indices):
        label_masks = list(set(query_indices + fact_indices + negative_indices))
        label_masks_onehot = np.sum(label_binarizer.transform(label_masks), axis=0)
        labels = list(set(query_indices + fact_indices))
        labels_onehot = np.sum(label_binarizer.transform(labels), axis=0)
        return labels_onehot, label_masks_onehot, np.array(labels), np.array(label_masks)

    def get_loss(criterion, prediction, target, mask):
        loss = torch.sum(criterion(prediction, target) * mask) / torch.sum(mask)
        return loss

    probe_model_root_path = "data_generated/squad/probe_experiment_2020-05-30_215643/"
    input_type = "query_" + embd_type + "_embd"
    probe_model_path = (probe_model_root_path + "query_" + embd_type + "_embd_" +
                        label_type + "_result_seed_" + str(seed) +
                        "/best_linear_prober")
    saved_data_folder = 'data_generated/squad/'

    train_list, dev_list, kb = utils_dataset_squad.load_squad_probe_raw_data()
    vocab_dict, tfidf_vectorizer = utils_probe_squad.get_vocabulary(
        train_list, kb,
        saved_data_folder + "squad_vocab_dict.pickle",
        saved_data_folder + "squad_tfidf_vectorizer.pickle")
    instances_all_seeds = utils_probe_squad.get_probe_dataset(
        train_list, dev_list, kb, "", vocab_dict, tfidf_vectorizer,
        saved_data_folder, "squad_probe.pickle")

    linear_probe = torch.load(probe_model_path).to(device)
    linear_probe.eval()

    criterion = nn.BCELoss(reduction="none")

    target_vocab_size = len(vocab_dict)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(target_vocab_size))
    vocab_dict_rev = {v: k for k, v in vocab_dict.items()}

    query_indices = "lemma_query_indices_" + label_type
    fact_indices = "lemma_fact_indices_" + label_type
    negative_indices = "lemma_negative_indices_" + label_type

    data_list = instances_all_seeds[seed][data_partition]

    total_loss = 0
    map_list = list([])
    ppl_list = list([])
    query_map_list = list([])
    query_ppl_list = list([])
    target_map_list = list([])
    target_ppl_list = list([])

    pred_score_dict = {}
    target_occur_dict = {}

    with torch.no_grad():
        for i, instance in enumerate(data_list):
            labels_onehot, masks_onehot, labels, label_masks = get_training_labels(
                label_binarizer, instance[query_indices],
                instance[fact_indices], instance[negative_indices])

            output_ = linear_probe(instance[input_type].to(device))  # output size is (6600)
            output = nn.functional.sigmoid(output_)

            if print_text:
                output_numpy = output.detach().cpu().numpy()
                top_preds = np.flip(np.argsort(output_numpy))
                print("=" * 20)
                print("\tquery:", instance["lemmas_query"])
                print("\tfact:", instance["lemmas_fact"])
                print('\ttop pred lemma:',
                      [vocab_dict_rev[idx] for idx in top_preds[:20]])
                input("A")

            loss = get_loss(
                criterion, output,
                torch.tensor(labels_onehot, dtype=torch.float32).to(device),
                torch.tensor(masks_onehot, dtype=torch.float32).to(device))

            total_loss += loss.detach().cpu().numpy()
            map_list.append(get_map(output.detach().cpu().numpy(), labels))
            ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))
            query_map_list.append(
                get_map(output.detach().cpu().numpy(),
                        np.array(instance[query_indices])))
            query_ppl_list.append(
                get_ppl(output.detach().cpu().numpy(),
                        np.array(instance[query_indices])))

            if len(set(instance[fact_indices]) - set(instance[query_indices])) > 0:
                target_map_list.append(
                    get_map(output.detach().cpu().numpy(),
                            np.array(list(set(instance[fact_indices]) -
                                          set(instance[query_indices])),
                                     dtype=np.int64)))
                target_ppl_list.append(
                    get_ppl(output.detach().cpu().numpy(),
                            np.array(list(set(instance[fact_indices]) -
                                          set(instance[query_indices])),
                                     dtype=np.int64)))

                for pred_lemma_indices in list(set(instance[fact_indices]) -
                                               set(instance[query_indices])):
                    pred_lemma = vocab_dict_rev[pred_lemma_indices]
                    if pred_lemma not in pred_score_dict:
                        pred_score_dict[pred_lemma] = 0
                        target_occur_dict[pred_lemma] = 0
                    target_occur_dict[pred_lemma] += 1
                    pred_score_dict[pred_lemma] += output[pred_lemma_indices].item()

            if print_text:
                print("=" * 20)
                print("query:", instance["lemmas_query"])
                print("fact", instance["lemmas_fact"])
                print("negative", instance["lemmas_negative"])
                print("positive token reconstructed:",
                      [vocab_dict_rev[lemma_idx] for lemma_idx in labels])
                print("negative token reconstructed:",
                      [vocab_dict_rev[lemma_idx]
                       for lemma_idx in list(set(label_masks) - set(labels))])
                print("query reconstructed",
                      [vocab_dict_rev[lemma_idx]
                       for lemma_idx in instance[query_indices]])
                print("fact alone reconstructed:",
                      [vocab_dict_rev[lemma_idx]
                       for lemma_idx in instance[fact_indices]])
                input("--------")

    result_dict = {
        "eval_loss": total_loss / len(data_list),  # average over the evaluated split
        "avg map": sum(map_list) / len(map_list),
        "avg ppl": sum(ppl_list) / len(ppl_list),
        "query map:": sum(query_map_list) / len(query_map_list),
        "query ppl:": sum(query_ppl_list) / len(query_ppl_list),
        "target map:": sum(target_map_list) / len(target_map_list),
        "target ppl:": sum(target_ppl_list) / len(target_ppl_list)
    }

    print("-" * 20)
    print(result_dict)
    print("-" * 20)

    pred_freq_dict_avg = {}
    for k in pred_score_dict.keys():
        pred_freq_dict_avg[k] = pred_score_dict[k] / target_occur_dict[k]

    tokens_sorted_by_occur = sorted(target_occur_dict.items(),
                                    key=lambda kv: kv[1])
    for histo_tuple in list(reversed(tokens_sorted_by_occur)):
        print("token:", histo_tuple[0],
              "\tn occur:", histo_tuple[1],
              "\tavg prob:", pred_freq_dict_avg[histo_tuple[0]])

    return 0