def make_vocab_for_embedding():
    embedding_path = os.path.join(
        root_path, "embedding/BioWordVec_PubMed_MIMICIII_d200.vec.bin")
    MED = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
    # KeyedVectors is already the vector object (gensim 3.x): access .vocab
    # directly instead of going through the deprecated .wv property
    word_dict = MED.vocab
    raw_words = list(word_dict.keys())
    print(len(raw_words))

    # keep only the words that appear in the dataset (training + val + test)
    all_word_in_data = set()
    train_val_path = os.path.join(
        root_path, "data/train_val/All_QA_Pairs_train_val.txt")
    test_path = os.path.join(
        root_path, "data/test/VQAMed2019_Test_Questions.txt")
    with open(train_val_path, "r") as f:
        print("process train_val file")
        for row in f:
            i_q_a = row.rstrip().split("|")
            q_words = VQADataProvider.text_to_list(i_q_a[1])
            q_words = q_words[:-1]  # drop the last token (the appended <END>)
            for w in q_words:
                all_word_in_data.add(w)
    with open(test_path, "r") as f:
        print("process test file")
        for row in f:
            i_q = row.rstrip().split("|")
            q_words = VQADataProvider.text_to_list(i_q[1])
            q_words = q_words[:-1]  # drop the last token (the appended <END>)
            for w in q_words:
                all_word_in_data.add(w)

    # keep only dataset words that have a pretrained vector
    words = []
    print("filter")
    for w in all_word_in_data:
        if w in word_dict:
            words.append(w)
    print(len(words))

    # add padding token at index 0
    words.insert(0, "<PAD>")
    # # add UNK
    # words.append("<UNK>")

    # save the word list to file for mapping questions to lists of indices
    save_path = os.path.join(root_path, "embedding/embed_mapping.pkl")
    with open(save_path, "wb") as f:
        pickle.dump(words, f)

    # save the embedding matrix to file; row 0 stays all-zero for <PAD>
    save_path_2 = os.path.join(root_path, "embedding/embedding_matrix.npy")
    embedding_matrix = np.zeros((len(words), 200))
    for i in range(1, len(words)):
        embedding_matrix[i] = MED[words[i]]
    np.save(save_path_2, embedding_matrix)
    return words, embedding_matrix
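
# A minimal sketch (not part of the original pipeline) of how the artifacts
# saved above could be consumed downstream. It assumes PyTorch is available;
# the function name _demo_load_embedding is hypothetical.
def _demo_load_embedding():
    import torch
    import torch.nn as nn

    with open(os.path.join(root_path, "embedding/embed_mapping.pkl"),
              "rb") as f:
        words = pickle.load(f)
    matrix = np.load(os.path.join(root_path,
                                  "embedding/embedding_matrix.npy"))

    # Row 0 is the all-zero <PAD> vector, so padding_idx=0 keeps it frozen.
    emb = nn.Embedding(len(words), matrix.shape[1], padding_idx=0)
    emb.weight.data.copy_(torch.from_numpy(matrix))

    word_to_idx = {w: i for i, w in enumerate(words)}
    return emb, word_to_idx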
def gen_txt():
    with open(q_voc_path, "r") as f:
        q_dic = json.load(f)

    # build the word list, excluding the special tokens
    word_list = []
    exc_list = ["<break>", "<END>", "<START>", "<UNKNOWN>", "<UNK>"]
    for k, _ in q_dic.items():
        if k not in exc_list:
            word_list.append(k)

    sent_list = []
    _, raw_ques, _ = VQADataProvider.load_raw_iqa(q_a_i_path)
    for ques in raw_ques:
        sent_list.append(VQADataProvider.text_to_list(ques))

    # map each sentence to a list of word indices
    sent_idx_list = []
    for sent in sent_list:
        sent_idx_list.append(
            [word_list.index(x) for x in sent if x not in exc_list])

    with open(wordlist_path, "w") as f:
        for item in word_list:
            f.write("%s\n" % item)
    with open(doc_path, "w") as f:
        for sent in sent_idx_list:
            f.write(" ".join([str(i) for i in sent]) + "\n")
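
# Note (added): word_list.index(x) scans the whole list for every token, so
# the mapping step in gen_txt is quadratic in practice. A hypothetical
# equivalent that builds the index once and produces the same output:
def _gen_sent_indices(word_list, sent_list, exc_list):
    word_to_idx = {w: i for i, w in enumerate(word_list)}
    return [[word_to_idx[x] for x in sent if x not in exc_list]
            for sent in sent_list]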
def make_vocab(sentence_ls, vocab_size=-1):
    # count word frequencies over all sentences
    word_fre_dic = {}
    for sent in sentence_ls:
        word_ls = VQADataProvider.text_to_list(sent)
        for word in word_ls:
            if word in word_fre_dic:
                word_fre_dic[word] += 1
            else:
                word_fre_dic[word] = 1

    # sort by frequency, most frequent first, then keep the top vocab_size
    vocab_ls = [
        k for (k, v) in sorted(
            word_fre_dic.items(), key=lambda x: x[1], reverse=True)
    ]
    if vocab_size != -1 and vocab_size <= len(vocab_ls):
        vocab_ls = vocab_ls[:vocab_size]
    vocab_ls.reverse()

    # map each word to its index (least frequent kept word gets index 0)
    vocab_dict = {}
    for i in range(len(vocab_ls)):
        vocab_dict[vocab_ls[i]] = i
    return vocab_dict
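
# A hypothetical usage sketch for make_vocab. Because vocab_ls is reversed
# before indexing, the MOST frequent word ends up with the HIGHEST index.
# Assumes VQADataProvider.text_to_list tokenizes raw question strings, as
# elsewhere in this module.
def _demo_make_vocab():
    sample = ["what organ is shown", "what plane is shown"]
    vocab = make_vocab(sample)
    # singleton words ("organ", "plane") sit at the low end of the index
    # range, repeated words ("what", "is", "shown") at the high end
    print(vocab)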
def check_len(filename):
    q_len_num = {}
    a_len_num = {}
    exclude_ls = ["<break>", "<START>", "<END>", "<UNKNOWN>"]
    with open(filename, "r") as f:
        for row in f:
            qa = row.rstrip().split("|")

            # histogram of question lengths (special tokens excluded)
            words_q = VQADataProvider.text_to_list(qa[1])
            words_q = [word for word in words_q if word not in exclude_ls]
            if len(words_q) in q_len_num:
                q_len_num[len(words_q)] += 1
            else:
                q_len_num[len(words_q)] = 1

            # histogram of answer lengths (special tokens excluded)
            words_a = VQADataProvider.text_to_list(qa[2])
            words_a = [word for word in words_a if word not in exclude_ls]
            if len(words_a) in a_len_num:
                a_len_num[len(words_a)] += 1
            else:
                a_len_num[len(words_a)] = 1
    return q_len_num, a_len_num
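
# A small usage sketch for check_len (the path below is only an example):
# print the length histograms in ascending order, e.g. to pick a sensible
# maximum sequence length for padding.
def _demo_check_len():
    path = os.path.join(root_path, "data/train/All_QA_Pairs_train.txt")
    q_len_num, a_len_num = check_len(path)
    for length in sorted(q_len_num):
        print("question len %d: %d" % (length, q_len_num[length]))
    for length in sorted(a_len_num):
        print("answer   len %d: %d" % (length, a_len_num[length]))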
def check_ans_vocab(filename):
    _, _, sentence_ls = VQADataProvider.load_raw_iqa(filename)
    word_fre_dict = {}
    for sent in sentence_ls:
        word_ls = VQADataProvider.text_to_list(sent)
        for word in word_ls:
            if word in word_fre_dict:
                word_fre_dict[word] += 1
            else:
                word_fre_dict[word] = 1
    # sort by frequency; note that sorted() turns the dict into a LIST of
    # (word, count) tuples, most frequent first
    word_fre_list = sorted(word_fre_dict.items(),
                           key=lambda kv: kv[1],
                           reverse=True)
    return word_fre_list
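
# Usage sketch for check_ans_vocab (the path is only an example): show the
# ten most frequent answer words with their counts.
def _demo_top_answer_words():
    path = os.path.join(root_path, "data/train/All_QA_Pairs_train.txt")
    for word, count in check_ans_vocab(path)[:10]:
        print(word, count)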
def make_vocab_ans(sentence_ls, vocab_size=-1):
    word_fre_dic = {}
    for sent in sentence_ls:
        word_ls = VQADataProvider.text_to_list(sent)
        for word in word_ls:
            if word in word_fre_dic:
                word_fre_dic[word] += 1
            else:
                word_fre_dic[word] = 1

    # sort by frequency, most frequent first
    vocab_ls = [
        k for (k, v) in sorted(
            word_fre_dic.items(), key=lambda x: x[1], reverse=True)
    ]
    # keep vocab_size - 1 words so the final list is exactly vocab_size
    # once <UNK> is appended
    if vocab_size != -1 and vocab_size <= len(vocab_ls):
        vocab_ls = vocab_ls[:vocab_size - 1]
    # add <unknown>
    vocab_ls.append("<UNK>")
    # # add <ZERO> for padding
    # vocab_ls.insert(0, "<ZERO>")
    return vocab_ls
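
# A hypothetical sketch of how the answer vocabulary from make_vocab_ans
# might be consumed: any word outside the kept list falls back to the
# index of <UNK>. The function name and arguments are illustrative.
def _demo_encode_answer(vocab_ls, answer_words):
    word_to_idx = {w: i for i, w in enumerate(vocab_ls)}
    unk = word_to_idx["<UNK>"]
    return [word_to_idx.get(w, unk) for w in answer_words]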
def label_img_with_ques_etm():
    opt = config.parse_opt()
    q_i_a_path = os.path.join(root_path, "data/train/All_QA_Pairs_train.txt")

    # group all questions by their image id
    img_ques_dict = {}
    with open(q_i_a_path, "r") as f:
        for row in f:
            q_i_a = row.strip().split("|")
            img = q_i_a[0]
            ques = q_i_a[1]
            if img in img_ques_dict:
                img_ques_dict[img].append(ques)
            else:
                img_ques_dict[img] = [ques]

    # sum the ETM topic distributions of an image's questions and label
    # the image with the dominant topic
    img_topic_dict = {}
    for img, qs in img_ques_dict.items():
        img_topic_vector = np.zeros(opt.ETM_TOP_NUM)
        for q in qs:
            words = VQADataProvider.text_to_list(q)
            q_t_v = etm_topic_distrib(words)
            img_topic_vector = np.add(img_topic_vector, q_t_v)
        img_topic_dict[img] = (np.argmax(img_topic_vector)).item()
    return img_topic_dict
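
# A hypothetical follow-up: persist the image -> dominant-topic mapping so
# it can be reused without re-running ETM inference. The output path is
# only an example.
def _demo_save_img_topics(out_path="img_topic_dict.json"):
    img_topic_dict = label_img_with_ques_etm()
    with open(out_path, "w") as f:
        json.dump(img_topic_dict, f)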
        for row in f:
            val_img_ids.append(row.strip())

    print("train+val images number: ", len(train_img_ids) + len(val_img_ids))
    comb_ids = train_img_ids + val_img_ids
    comb_ids = set(comb_ids)
    print("unique total images number: ", len(comb_ids))


if __name__ == "__main__":
    filename = os.path.join(
        root_path,
        "data/train_val/QAPairsByCategory/C4_Abnormality_train_val.txt")
    _, _, ans_set = load_all2set(filename)

    # collect the unique words over all answers
    unq_words = set()
    for ans in ans_set:
        words = VQADataProvider.text_to_list(ans)
        for w in words:
            unq_words.add(w)
    print(len(unq_words))
    print(list(unq_words)[:5])
    print("<END>" in unq_words)