def make_vocab_files(opt, filename, ques_or_ans):
    """Build a vocabulary from one column of a raw IQA file and save it as JSON.

    The vocabulary is written to ``vocab/<ques_or_ans>_vocab.json`` under the
    module-level ``root_path``.

    Args:
        opt: Unused by this function.
        filename: Path handed to ``VQADataProvider.load_raw_iqa``.
        ques_or_ans: ``"question"`` selects the second item returned by the
            loader, ``"answer"`` the third.  For any other value the data
            file is not read and ``None`` is passed to ``make_vocab``.
    """
    save_path = os.path.join(root_path, "vocab/%s_vocab.json" % ques_or_ans)

    # Only touch the data file when a known column was requested.
    if ques_or_ans in ("question", "answer"):
        _, questions, answers = VQADataProvider.load_raw_iqa(filename)
        sentences = questions if ques_or_ans == "question" else answers
    else:
        # NOTE(review): make_vocab receives None here -- confirm it copes.
        sentences = None

    vocabulary = make_vocab(sentences)

    with open(save_path, "w") as out_file:
        json.dump(vocabulary, out_file)

    print("%s-%s vocabulary saved" % (filename, ques_or_ans))
def gen_txt():
    """Export the question vocabulary and the questions as index sequences.

    Reads the question vocabulary JSON at ``q_voc_path`` and the raw
    questions from ``q_a_i_path`` (second item returned by
    ``VQADataProvider.load_raw_iqa``), then writes two text files:

    * ``wordlist_path`` -- one vocabulary word per line, special tokens
      excluded, in the iteration order of the loaded JSON object.
    * ``doc_path`` -- one line per question, holding the space-separated
      0-based positions of its tokens within the word list.

    Raises:
        ValueError: If a question contains a non-special token that is not
            in the vocabulary.  Nothing is written in that case.
    """
    # Special tokens are left out of both the word list and the documents.
    excluded = ("<break>", "<END>", "<START>", "<UNKNOWN>", "<UNK>")

    with open(q_voc_path, "r") as f:
        q_dic = json.load(f)

    word_list = [word for word in q_dic if word not in excluded]
    # The words are unique dict keys, so a word -> position map gives exactly
    # the value ``word_list.index(word)`` would, without a linear scan of the
    # vocabulary for every token.
    word_index = {word: position for position, word in enumerate(word_list)}

    _, raw_ques, _ = VQADataProvider.load_raw_iqa(q_a_i_path)

    sent_idx_list = []
    for ques in raw_ques:
        tokens = VQADataProvider.text_to_list(ques)
        try:
            sent_idx_list.append(
                [word_index[tok] for tok in tokens if tok not in excluded])
        except KeyError as err:
            # Keep the exception type that ``list.index`` raised before.
            raise ValueError(
                "%r is not in the question vocabulary" % (err.args[0],))

    with open(wordlist_path, "w") as f:
        f.writelines("%s\n" % word for word in word_list)

    with open(doc_path, "w") as f:
        f.writelines(
            " ".join(str(idx) for idx in sent) + "\n"
            for sent in sent_idx_list)
def make_ans_vocab_file(opt, filename):
    """Build the answer vocabulary for *filename* and pickle it.

    The result of ``make_vocab_ans`` is written to
    ``vocab/answer_vocab.pkl`` under the module-level ``root_path``.

    Args:
        opt: Unused by this function.
        filename: Path handed to ``VQADataProvider.load_raw_iqa``; the third
            item it returns is used as the answer data.

    Returns:
        The length of the generated vocabulary.
    """
    pickle_path = os.path.join(root_path, "vocab/answer_vocab.pkl")

    _, _, answers = VQADataProvider.load_raw_iqa(filename)
    answer_vocab = make_vocab_ans(answers)

    with open(pickle_path, "wb") as out_file:
        pickle.dump(answer_vocab, out_file)

    # Debug aid: show the first few vocabulary entries.
    print(answer_vocab[:10])
    return len(answer_vocab)
def check_ans_vocab(filename):
    """Count how often each token occurs in the answers of *filename*.

    Answers are the third item returned by
    ``VQADataProvider.load_raw_iqa`` and are tokenised with
    ``VQADataProvider.text_to_list``.

    Returns:
        A list of ``(token, count)`` pairs sorted by count, highest first.
    """
    _, _, answers = VQADataProvider.load_raw_iqa(filename)

    counts = {}
    for answer in answers:
        for token in VQADataProvider.text_to_list(answer):
            counts[token] = counts.get(token, 0) + 1

    # Stable sort on the count only, most frequent first.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked