Example #1
import os
import pickle

import numpy as np
from gensim.models import KeyedVectors

# root_path and VQADataProvider are assumed to be defined at module level.


def make_vocab_for_embedding():
    embedding_path = os.path.join(
        root_path, "embedding/BioWordVec_PubMed_MIMICIII_d200.vec.bin")
    MED = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
    # load_word2vec_format returns a KeyedVectors object, which exposes its
    # vocabulary directly (gensim < 4.0; on gensim >= 4.0 use MED.key_to_index)
    word_dict = MED.vocab
    raw_words = list(word_dict.keys())
    print(len(raw_words))

    # keep only the words that appear in the dataset (train + val + test)
    # collect all words
    all_word_in_data = set()
    train_val_path = os.path.join(root_path,
                                  "data/train_val/All_QA_Pairs_train_val.txt")
    test_path = os.path.join(root_path,
                             "data/test/VQAMed2019_Test_Questions.txt")
    with open(train_val_path, "r") as f:
        print("process train_val file")
        for row in f:
            i_q_a = row.rstrip().split("|")
            q_words = VQADataProvider.text_to_list(i_q_a[1])
            q_words = q_words[:-1]  # drop the trailing <END> marker
            for w in q_words:
                all_word_in_data.add(w)

    with open(test_path, "r") as f:
        print("process test file")
        for row in f:
            i_q = row.rstrip().split("|")
            q_words = VQADataProvider.text_to_list(i_q[1])
            q_words = q_words[:-1]  # drop the trailing <END> marker
            for w in q_words:
                all_word_in_data.add(w)
    # keep only the dataset words that have a BioWordVec embedding
    print("filter")
    words = [w for w in all_word_in_data if w in word_dict]
    print(len(words))

    # reserve index 0 for the padding token
    words.insert(0, "<PAD>")
    # # add UNK
    # words.append("<UNK>")
    # save words list to file for mapping questions to a list of indices
    save_path = os.path.join(root_path, "embedding/embed_mapping.pkl")
    with open(save_path, "wb") as f:
        pickle.dump(words, f)

    # save the matrix of embedding to file
    save_path_2 = os.path.join(root_path, "embedding/embedding_matrix.npy")
    embedding_matrix = np.zeros((len(words), 200))
    # row 0 stays all zeros for <PAD>; fill the rest from BioWordVec
    for i in range(1, len(words)):
        embedding_matrix[i] = MED[words[i]]
    np.save(save_path_2, embedding_matrix)

    return words, embedding_matrix
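
# A minimal consumer sketch for the artifacts built above. The name
# question_to_indices and max_len=20 are illustrative, and mapping unknown
# words to index 0 (the all-zero <PAD> row) is an assumption, since this
# vocabulary has no <UNK> entry.
words, embedding_matrix = make_vocab_for_embedding()
word_to_idx = {w: i for i, w in enumerate(words)}  # index 0 is <PAD>


def question_to_indices(tokens, max_len=20):
    # truncate to max_len, then right-pad with index 0
    idx = [word_to_idx.get(t, 0) for t in tokens][:max_len]
    return np.array(idx + [0] * (max_len - len(idx)))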
Example #2
import json

# q_voc_path, q_a_i_path, wordlist_path, and doc_path are assumed to be
# defined at module level.


def gen_txt():
    with open(q_voc_path, "r") as f:
        q_dic = json.load(f)

    word_list = []

    exc_list = ["<break>", "<END>", "<START>", "<UNKNOWN>", "<UNK>"]

    for k in q_dic:
        # exclude the special marker tokens listed above
        if k not in exc_list:
            word_list.append(k)

    sent_list = []
    _, raw_ques, _ = VQADataProvider.load_raw_iqa(q_a_i_path)
    for ques in raw_ques:
        sent_list.append(VQADataProvider.text_to_list(ques))

    # build an index once; list.index inside the loop is O(n) per lookup,
    # and it raises ValueError on words missing from word_list
    word_to_idx = {w: i for i, w in enumerate(word_list)}
    sent_idx_list = []
    for sent in sent_list:
        sent_idx_list.append(
            [word_to_idx[x] for x in sent if x in word_to_idx])

    with open(wordlist_path, "w") as f:
        for item in word_list:
            f.write("%s\n" % item)

    with open(doc_path, "w") as f:
        for sent in sent_idx_list:
            f.write(" ".join([str(i) for i in sent]) + "\n")
Example #3
def make_vocab(sentence_ls, vocab_size=-1):
    word_fre_dic = {}
    for sent in sentence_ls:
        for word in VQADataProvider.text_to_list(sent):
            word_fre_dic[word] = word_fre_dic.get(word, 0) + 1

    # sort by frequency, most frequent first, and truncate to vocab_size
    vocab_ls = [
        k for (k, v) in sorted(
            word_fre_dic.items(), key=lambda x: x[1], reverse=True)
    ]
    if vocab_size != -1 and vocab_size <= len(vocab_ls):
        vocab_ls = vocab_ls[:vocab_size]

    # reverse so the least frequent kept word gets index 0
    vocab_ls.reverse()
    # map each word to its index
    vocab_dict = {w: i for i, w in enumerate(vocab_ls)}

    return vocab_dict
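
# Toy usage, assuming VQADataProvider.text_to_list is a plain tokenizer
# (any markers it appends would also be counted). After the reverse() above,
# the least frequent kept word maps to index 0.
toy_vocab = make_vocab(
    ["what does the ct show", "what does the mri show"], vocab_size=5)
print(toy_vocab)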
Example #4
def check_len(filename):
    q_len_num = {}
    a_len_num = {}
    exclude_ls = ["<break>", "<START>", "<END>", "<UNKNOWN>"]
    # exclude_ls = ["<START>", "<END>"]
    with open(filename, "r") as f:
        for row in f:
            qa = row.rstrip().split("|")
            words_q = VQADataProvider.text_to_list(qa[1])
            words_q = [word for word in words_q if word not in exclude_ls]
            q_len_num[len(words_q)] = q_len_num.get(len(words_q), 0) + 1

            words_a = VQADataProvider.text_to_list(qa[2])
            words_a = [word for word in words_a if word not in exclude_ls]
            a_len_num[len(words_a)] = a_len_num.get(len(words_a), 0) + 1
    return q_len_num, a_len_num
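
# Hypothetical usage: print the question-length histogram in ascending order
# of length, using the combined train+val file from Example #1.
filename = os.path.join(root_path,
                        "data/train_val/All_QA_Pairs_train_val.txt")
q_len_num, a_len_num = check_len(filename)
for length in sorted(q_len_num):
    print(length, q_len_num[length])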
Example #5
def check_ans_vocab(filename):
    _, _, sentence_ls = VQADataProvider.load_raw_iqa(filename)
    word_fre_dict = {}
    for sent in sentence_ls:
        for word in VQADataProvider.text_to_list(sent):
            word_fre_dict[word] = word_fre_dict.get(word, 0) + 1
    # sorted() turns the dict into a list of (word, count) pairs,
    # most frequent first
    word_fre_pairs = sorted(word_fre_dict.items(),
                            key=lambda kv: kv[1],
                            reverse=True)
    return word_fre_pairs
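
# Hypothetical usage: the ten most frequent answer words as (word, count)
# pairs, e.g. to choose a sensible vocab_size cutoff.
filename = os.path.join(
    root_path,
    "data/train_val/QAPairsByCategory/C4_Abnormality_train_val.txt")
print(check_ans_vocab(filename)[:10])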
Example #6
def make_vocab_ans(sentence_ls, vocab_size=-1):
    word_fre_dic = {}
    for sent in sentence_ls:
        for word in VQADataProvider.text_to_list(sent):
            word_fre_dic[word] = word_fre_dic.get(word, 0) + 1

    # sort by frequency, most frequent first
    vocab_ls = [
        k for (k, v) in sorted(
            word_fre_dic.items(), key=lambda x: x[1], reverse=True)
    ]
    if vocab_size != -1 and vocab_size <= len(vocab_ls):
        # keep vocab_size - 1 words so the final list, with <UNK> appended,
        # has exactly vocab_size entries
        vocab_ls = vocab_ls[:vocab_size - 1]
    # out-of-vocabulary answers map to <UNK>
    vocab_ls.append("<UNK>")
    # # add <ZERO> for padding
    # vocab_ls.insert(0, "<ZERO>")

    return vocab_ls
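
# Minimal sketch of mapping answer tokens to indices with an <UNK> fallback;
# `answers` is a hypothetical list of raw answer strings.
vocab_ls = make_vocab_ans(answers, vocab_size=1000)
ans_to_idx = {w: i for i, w in enumerate(vocab_ls)}
unk = ans_to_idx["<UNK>"]
ans_indices = [ans_to_idx.get(w, unk)
               for w in VQADataProvider.text_to_list(answers[0])]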
Example #7
def label_img_with_ques_etm():
    # config.parse_opt and etm_topic_distrib are assumed to be imported at
    # module level
    opt = config.parse_opt()
    q_i_a_path = os.path.join(root_path, "data/train/All_QA_Pairs_train.txt")

    img_ques_dict = {}
    with open(q_i_a_path, "r") as f:
        for row in f:
            q_i_a = row.strip().split("|")
            img = q_i_a[0]
            ques = q_i_a[1]
            if (img in img_ques_dict):
                img_ques_dict[img].append(ques)
            else:
                img_ques_dict[img] = [ques]

    img_topic_dict = {}
    for img, qs in img_ques_dict.items():
        # sum the ETM topic distributions of all questions asked about this
        # image, then label the image with its dominant topic
        img_topic_vector = np.zeros(opt.ETM_TOP_NUM)
        for q in qs:
            words = VQADataProvider.text_to_list(q)
            q_t_v = etm_topic_distrib(words)
            img_topic_vector = np.add(img_topic_vector, q_t_v)
        img_topic_dict[img] = np.argmax(img_topic_vector).item()
    return img_topic_dict
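
# Hypothetical usage: persist the image -> dominant-topic labels; the JSON
# output path is an assumption.
import json

img_topic = label_img_with_ques_etm()
with open(os.path.join(root_path, "data/train/img_topic_label.json"),
          "w") as f:
    json.dump(img_topic, f)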
Example #8
        for row in f:
            val_img_ids.append(row.strip())
    print("train+val images number: ", len(train_img_ids) + len(val_img_ids))
    comb_ids = train_img_ids + val_img_ids
    comb_ids = set(comb_ids)
    print("unique total images number: ", len(comb_ids))


if __name__ == "__main__":
    filename = os.path.join(
        root_path,
        "data/train_val/QAPairsByCategory/C4_Abnormality_train_val.txt")
    _, _, ans_set = load_all2set(filename)
    unq_words = set()
    for ans in ans_set:
        words = VQADataProvider.text_to_list(ans)
        for w in words:
            unq_words.add(w)
    print(len(unq_words))
    print(list(unq_words)[:5])
    print("<END>" in unq_words)

    # filename = "/Users/leishi/Desktop/Internship/vqa2019/ImageClef-2019-VQA-Med-Training/QAPairsByCategory/C4_Abnormality_train.txt"
    # all_ans, all_img, all_ques, all_q_i_pairs, all_a_a_pairs = check_raw_data(filename)
    # print("\nunique answer length: ", len(all_ans))
    # print("\nunique img length: ", len(all_img))
    # print("\nunique question length: ", len(all_ques))
    # # print("\nall answer")
    # # print(all_ans)
    # print("\nunique question-image pairs: ", len(all_q_i_pairs))
    # print("\nunique question-answer pairs: ", len(all_a_a_pairs))