def plot_wvecs(filename):
    """Project word vectors onto their first two principal components and plot them."""
    wv = load_pickle(filename)
    word_to_index = load_pickle("../data/word_to_index.pkl")
    index_to_word = invert_dict(word_to_index)
    counts = load_pickle("../data/counts.pkl")

    # Keep only frequent words (more than 1800 occurrences).
    reduced_word_to_index = {}
    for k, v in counts.items():
        if v > 1800:
            reduced_word_to_index[k] = v

    indices = [word_to_index[k] for k in reduced_word_to_index]
    words = [index_to_word[i].replace("edu.stanford.nlp.sempre.", "")
             for i in indices]
    word_vecs = np.array([wv[i] for i in indices])

    # PCA via SVD of the covariance matrix of the centered vectors.
    temp = word_vecs - np.mean(word_vecs, axis=0)
    covariance = 1.0 / len(word_vecs) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    # Draw each word at its 2-D coordinate.
    for i in range(len(words)):
        plt.text(coord[i, 0], coord[i, 1], words[i],
                 bbox=dict(facecolor='green', alpha=0.1))
    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0]) + 2))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
    plt.show()
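# The helpers used above and throughout this file (load_pickle, save_pickle,
# invert_dict) are defined elsewhere in the project. A minimal sketch of what
# they are assumed to do, using only the standard pickle module; these are
# hypothetical implementations, not the project's own code.
import pickle


def load_pickle(path):
    # Deserialize a Python object from a pickle file.
    with open(path, "rb") as f:
        return pickle.load(f)


def save_pickle(path, obj):
    # Serialize a Python object to a pickle file.
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def invert_dict(d):
    # Swap keys and values, e.g. word -> index becomes index -> word.
    return {v: k for k, v in d.items()}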
def make_templates(args, templates_emb, w2emb, w2i):
    """Gets embedded + unembedded templates if they don't exist."""
    if os.path.exists("templates.pkl"):
        templates = load_pickle("templates.pkl")
        templates_emb_pad = load_pickle("templates_emb.pkl")
    else:
        # Flatten templates.
        templates_emb = [y for x in templates_emb for y in x]
        # Cut templates to maximum length.
        templates_emb = [temp[-args.max_length:] for temp in templates_emb]
        # Pad embeddings.
        templates_emb_pad = [
            np.pad(temp1, ((0, args.max_length - len(temp1)), (0, 0)),
                   "constant", constant_values=(len(w2i)))
            for temp1 in templates_emb
        ]
        # Convert embedded templates to word templates.
        templates = [convert_to_words(sent, w2emb) for sent in templates_emb]
        save_pickle("templates.pkl", templates)
        save_pickle("templates_emb.pkl", templates_emb_pad)
    return templates, templates_emb_pad
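# A minimal, standalone illustration of the padding step in make_templates:
# a template of shape (num_tokens, emb_dim) is right-padded with the value
# len(w2i) until it has exactly max_length rows. The vocabulary, lengths and
# dimensions below are made up for the demo.
import numpy as np

max_length = 5
w2i = {"a": 0, "b": 1, "c": 2}           # toy vocabulary, so len(w2i) == 3
temp = np.zeros((2, 4))                   # a template with 2 tokens, emb_dim 4
padded = np.pad(temp, ((0, max_length - len(temp)), (0, 0)),
                "constant", constant_values=(len(w2i)))
print(padded.shape)                       # (5, 4): 2 real rows + 3 rows filled with 3.0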
def __init__(self):
    '''Load the files needed to compute sentence vectors ahead of time:
    word2id, id2embed, and id2weight.'''
    self.word_id = load_json(config.word_id_path)
    self.id_emb = load_pickle(config.id_emb_path)
    self.id_weight = load_pickle(config.id_weight_path)
    self.tokenizer = clean_seg
def get_data(filename, data, embeddings, w2i, gensim_model, args):
    """Retrieves all data. Loads it from a pickle file if it exists,
    and creates it otherwise."""
    global num_words

    if os.path.exists(filename):
        all_examples = data_utils.load_pickle(filename)
    else:
        all_examples = []
        for example in tqdm(data[:10]):
            resources = []
            embedded_resources = []
            data_utils.get_resources(example["documents"]["comments"],
                                     resources, embedded_resources)
            data_utils.get_resources(example["documents"]["fact_table"],
                                     resources, embedded_resources)
            data_utils.get_resources(example["documents"]["plot"],
                                     resources, embedded_resources)
            data_utils.get_resources(example["documents"]["review"],
                                     resources, embedded_resources)

            chat = example["chat"]
            # Loop over each of the last three utterances in the chat (context).
            for i in range(3, len(chat) - 1):
                last_utterances = chat[i - 3:i]
                response = chat[i + 1]

                if len(response) > 0:
                    exp = []
                    embedded_utterances = [
                        data_utils.embed_sentence(utterance)
                        for utterance in last_utterances
                    ]
                    context, embedded_context = \
                        data_utils.get_context(last_utterances)

                    # Retrieve: takes context and resources. Uses Word Mover's
                    # Distance to obtain relevant resource candidates.
                    similarities = retrieve(context, resources, gensim_model)
                    padd_resource = embedded_resources[
                        np.argmax(similarities)][-args.max_length:]
                    padd_resource = np.pad(
                        padd_resource,
                        ((0, args.max_length - len(padd_resource)), (0, 0)),
                        "constant", constant_values=(num_words))
                    exp.append(padd_resource)
                    exp.append(data_utils.clean_sentence(chat[i + 1]))
                    all_examples.append(tuple(exp))

        save_data(filename, all_examples)

    return all_examples
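# `retrieve` is defined elsewhere in the project. A rough sketch of the idea
# under the assumption that `gensim_model` is a gensim KeyedVectors instance
# and that context/resources are token lists: Word Mover's Distance is a
# distance, so it is negated here so that np.argmax picks the closest resource.
# This is an illustrative stand-in, not the project's actual implementation.
def retrieve(context, resources, gensim_model):
    # One (negated) WMD score per candidate resource; higher means more similar.
    return [-gensim_model.wmdistance(context, resource) for resource in resources]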
def load(self, dataset="train"):
    path = self.get_path(dataset)
    return du.load_pickle(path)
def split_files(args):
    assert os.path.isfile(args.label_file), \
        'there is no label file, --label_file [{}]'.format(args.label_file)
    dirname, filename = os.path.split(args.label_file)
    data = load_pickle(args.label_file)

    # Split the data into labeled train, validation, and unlabeled sets,
    # stratified on the label distribution.
    train_idx, leftover_idx, _, leftover_label = train_test_split(
        list(range(len(data['label']))), data['label'],
        train_size=args.labeled_data_size, stratify=data['label'])
    if len(leftover_idx) > args.valid_data_size:
        valid_idx, unlabel_idx, _, _ = train_test_split(
            leftover_idx, leftover_label,
            train_size=args.valid_data_size, stratify=leftover_label)
    else:
        valid_idx = leftover_idx
        unlabel_idx = []

    train_data = dict((key, np.array(item)[train_idx].tolist())
                      for key, item in zip(data.keys(), data.values()))
    valid_data = dict((key, np.array(item)[valid_idx].tolist())
                      for key, item in zip(data.keys(), data.values()))
    unlabel_data = dict((key, np.array(item)[unlabel_idx].tolist())
                        for key, item in zip(data.keys(), data.values()))

    # Optionally merge in an extra file of unlabeled data.
    if args.unlabel_file is not None and os.path.isfile(args.unlabel_file):
        additional_data = load_pickle(args.unlabel_file)
        for key in unlabel_data.keys():
            unlabel_data[key] += additional_data[key]

    # Save each split and drop any stale cache file next to it.
    if args.train_file is None:
        args.train_file = TRAIN_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    train_path = os.path.join(args.output_dir, args.train_file)
    save_pickle(train_path, train_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.train_file))
    except OSError:
        pass

    if args.valid_file is None:
        args.valid_file = VALID_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    valid_path = os.path.join(args.output_dir, args.valid_file)
    save_pickle(valid_path, valid_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.valid_file))
    except OSError:
        pass

    if args.augment_file is None:
        args.augment_file = AUGMENT_NAME.format(args.labeled_data_size,
                                                args.valid_data_size)
    augment_path = os.path.join(args.output_dir, args.augment_file)
    save_pickle(augment_path, unlabel_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.augment_file))
    except OSError:
        pass

    args.train_file = train_path
    args.valid_file = valid_path
    args.augment_file = augment_path
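# A hypothetical invocation of split_files, assuming the attributes read above
# are exposed through an argparse-style namespace; the paths and sizes below
# are invented for the example and the project's actual CLI may differ.
from argparse import Namespace

args = Namespace(
    label_file="data/labeled.pkl",       # pickle containing a 'label' key (assumed path)
    unlabel_file=None,                   # optional extra unlabeled pickle
    labeled_data_size=1000,              # size of the stratified training split
    valid_data_size=200,                 # size of the validation split
    train_file=None, valid_file=None, augment_file=None,
    output_dir="data/splits")
split_files(args)
# Afterwards args.train_file, args.valid_file and args.augment_file hold the
# paths of the saved train, validation and unlabeled pickles.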
def train(args):
    print("Load data...")
    data_train = load_data(args.folder + "/train_data.json")
    data_test = load_data(args.folder + "/dev_data.json")
    embeddings = load_pickle(args.embeddings)
    w2i = load_pickle(args.w2i)
    w2emb = load_pickle(args.w2emb)
    templates_emb = get_templates("../../data/templates.pkl")

    print("Do the templates...")
    templates, templates_emb = make_templates(args, templates_emb, w2emb, w2i)

    print("Now load the model...")
    emb_size = len(embeddings[0])
    model = SaliencyPrediction(emb_size * args.max_length, device).to(device)
    loss_func = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    rouge = Rouge()

    print("Read in train data...")
    resources = []
    embedded_resources = []
    for example in tqdm(data_train):
        get_resources(example["documents"]["comments"], resources,
                      embedded_resources, embeddings, w2i)
        num_comments = len(resources)
        get_resources(example["documents"]["fact_table"], resources,
                      embedded_resources, embeddings, w2i)
        num_facts = len(resources) - num_comments
        get_resources(example["documents"]["plot"], resources,
                      embedded_resources, embeddings, w2i)
        num_plots = len(resources) - num_comments - num_facts
        get_resources(example["documents"]["review"], resources,
                      embedded_resources, embeddings, w2i)
        num_reviews = len(resources) - num_comments - num_facts - num_plots

    print("Read in test data...")
    resources_test = []
    embedded_resources_test = []
    for example in tqdm(data_test):
        get_resources(example["documents"]["comments"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_comments = len(resources_test)
        get_resources(example["documents"]["fact_table"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_facts = len(resources_test) - num_comments
        get_resources(example["documents"]["plot"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_plots = len(resources_test) - num_comments - num_facts
        get_resources(example["documents"]["review"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_reviews = len(resources_test) - num_comments - num_facts - num_plots

    print("Now learn.....")
    total_resources = len(embedded_resources)
    all_temps = torch.Tensor(templates_emb)

    for epoch in range(5):
        print("Epoch: " + str(epoch))
        avg_loss = 0
        for i, resource in tqdm(enumerate(embedded_resources)):
            sent = " ".join(resources[i])
            if sent == "" or sent == "eod":
                continue
            optimizer.zero_grad()

            # Cut the resource to the maximum length and pad it.
            padd_resource = resource[-args.max_length:]
            padd_resource = np.pad(
                padd_resource,
                ((0, args.max_length - len(padd_resource)), (0, 0)),
                "constant", constant_values=(len(w2i)))

            # Target saliency: ROUGE-1 F-score between each template and the resource.
            actual_scores = []
            all_res = torch.Tensor(padd_resource).unsqueeze(0).repeat(20, 1, 1)
            size_inp = all_res.size()
            for j, template in enumerate(templates_emb):
                try:
                    actual_score = rouge.get_scores(
                        templates[j],
                        " ".join(resources[i]))[0]["rouge-1"]["f"]
                except Exception:
                    actual_score = 0
                actual_scores.append(actual_score)

            x1 = all_res.reshape(size_inp[0], size_inp[1] * size_inp[2]).to(device)
            x2 = all_temps.reshape(size_inp[0], size_inp[1] * size_inp[2]).to(device)
            actual_scores = torch.Tensor(actual_scores).unsqueeze(1).to(device)

            scores = model(x1, x2)
            loss = loss_func(scores, actual_scores)
            avg_loss += loss.item()
            loss.backward()
            optimizer.step()

        print("For this epoch, we found avg_loss: " +
              str(avg_loss / total_resources))
        torch.save(model, "../../models/rewrite/saliency_" + str(epoch) + ".pt")

        # Evaluate on the dev data after every epoch.
        model.eval()
        with torch.no_grad():
            total_loss = 0
            amount_res = len(resources_test)
            for i, resource in tqdm(enumerate(embedded_resources_test)):
                sent = " ".join(resources_test[i])
                if sent == "" or sent == "eod":
                    continue
                padd_resource = resource[-args.max_length:]
                padd_resource = np.pad(
                    padd_resource,
                    ((0, args.max_length - len(padd_resource)), (0, 0)),
                    "constant", constant_values=(len(w2i)))

                actual_scores = []
                all_res = torch.Tensor(padd_resource).unsqueeze(0).repeat(20, 1, 1)
                size_inp = all_res.size()
                for j, template in enumerate(templates_emb):
                    try:
                        actual_score = rouge.get_scores(
                            templates[j],
                            " ".join(resources_test[i]))[0]["rouge-1"]["f"]
                    except Exception:
                        actual_score = 0
                    actual_scores.append(actual_score)

                x1 = all_res.reshape(size_inp[0],
                                     size_inp[1] * size_inp[2]).to(device)
                x2 = all_temps.reshape(size_inp[0],
                                       size_inp[1] * size_inp[2]).to(device)
                actual_scores = torch.Tensor(actual_scores).unsqueeze(1).to(device)

                scores = model(x1, x2)
                loss = loss_func(scores, actual_scores)
                total_loss += loss.item()
            print("Average loss is: " + str(total_loss / amount_res))
        model.train()