def count_context_words(mecab, title, context, save_dir):
    """Count token frequencies in ``context`` and write them to a text file.

    ``context`` is tokenized with ``mecab.parseToNode``. For every node whose
    first comma-separated feature field is not ``"BOS/EOS"``, feature field 6
    is collected (presumably the base form in an IPADIC-style feature layout
    -- confirm against the dictionary in use).

    The output file is ``<hashing.get_md5_hash(title)>.txt`` inside
    ``save_dir`` and contains:

    * first line: the total number of collected tokens;
    * one line per distinct token, ``token<TAB>count``, in the order
      returned by ``collections.Counter.most_common()``.

    Args:
        mecab: Tagger-like object providing ``parseToNode``.
        title: Article title; its hash is used as the output file name.
        context: Text to tokenize.
        save_dir: Directory the count file is written into.
    """
    tokens = []
    node = mecab.parseToNode(context)
    while node:
        fields = node.feature.split(",")
        # Sentence boundary markers carry no word and are skipped.
        if fields[0] != "BOS/EOS":
            tokens.append(fields[6])
        node = node.next

    frequencies = collections.Counter(tokens)
    out_path = os.path.join(save_dir, hashing.get_md5_hash(title) + ".txt")

    # Header line with the total token count, then one line per token.
    out_lines = [str(len(tokens)) + "\n"]
    for token, count in frequencies.most_common():
        out_lines.append(token + "\t" + str(count) + "\n")

    with open(out_path, "w", encoding="utf_8", newline="") as out:
        out.writelines(out_lines)
def create_inputs_embeds_and_token_type_ids(bert_model, input_ids, indices, options, im_features_dir):
    """Build per-option input embeddings and token type ids for a batch.

    For every example ``i`` and option ``j``, the text embedding returned by
    ``create_text_embeddings`` is used as-is (with all-zero token type ids)
    unless an image feature file ``<hash of option name>.pt`` exists in
    ``im_features_dir``. In that case the text and image embeddings are
    combined by ``create_option_embedding``, which also supplies the token
    type ids.

    Args:
        bert_model: Model handed to ``create_text_embeddings``.
        input_ids: Tensor whose first two dimensions are
            (batch size, number of options).
        indices: Per-example indices into ``options``.
        options: Sequence of objects whose ``get(j)`` returns the j-th
            option's article name.
        im_features_dir: Directory holding the ``.pt`` image feature files.

    Returns:
        Tuple ``(inputs_embeds, inputs_token_type_ids)`` on ``device``, of
        shapes ``(batch, options, 512, 768)`` and ``(batch, options, 512)``
        (the latter ``torch.long``).
    """
    batch_size, num_options = input_ids.size()[0], input_ids.size()[1]

    inputs_embeds = torch.empty(batch_size, num_options, 512, 768, device=device)
    inputs_token_type_ids = torch.empty(
        batch_size, num_options, 512, dtype=torch.long, device=device)

    # Without CUDA, saved tensors have to be remapped to the CPU on load;
    # with CUDA, torch.load is called with its defaults.
    if torch.cuda.is_available():
        load_kwargs = {}
    else:
        load_kwargs = {"map_location": torch.device("cpu")}

    for example_idx in range(batch_size):
        text_embeddings = create_text_embeddings(bert_model, input_ids[example_idx])
        example_options = options[indices[example_idx]]

        for option_idx in range(num_options):
            article_hash = hashing.get_md5_hash(example_options.get(option_idx))
            feature_path = os.path.join(im_features_dir, article_hash + ".pt")

            if not os.path.exists(feature_path):
                # No image features: text-only embedding, token types all zero.
                inputs_embeds[example_idx, option_idx] = text_embeddings[option_idx]
                inputs_token_type_ids[example_idx, option_idx] = torch.zeros(
                    512, dtype=torch.long).to(device)
                continue

            im_embedding = torch.load(feature_path, **load_kwargs).to(device)
            combined_embedding, combined_type_ids = create_option_embedding(
                text_embeddings[option_idx], im_embedding)
            inputs_embeds[example_idx, option_idx] = combined_embedding
            inputs_token_type_ids[example_idx, option_idx] = combined_type_ids

    return inputs_embeds, inputs_token_type_ids
def calc_score(mecab, question, option, count_dir, nqis, ignores, k1=1.6, b=0.75, delta=1.0):
    """Score ``option`` against ``question`` with a BM25+-style formula.

    ``question`` is tokenized with ``mecab.parseToNode``; for every node whose
    first feature field is not ``"BOS/EOS"``, feature field 6 is used as the
    query term. For each term the score accumulates::

        idf * (freq * (k1 + 1) / (freq + k1 * (1 - b + b * d / AVGDL)) + delta)

    where ``idf = max(0, log((NUM_TOTAL_DOCS - nqi + 0.5) / (nqi + 0.5)))``,
    ``nqi`` is looked up in ``nqis`` (0 when the term is absent), and
    ``d`` / ``freq`` come from ``get_d_and_frequency`` applied to the
    option's count file.

    Args:
        mecab: Tagger-like object providing ``parseToNode``.
        question: Question text.
        option: Option (article) name; its hash names the count file.
        count_dir: Directory holding the ``<hash>.txt`` count files.
        nqis: Mapping from term to ``nqi`` (presumably the number of
            documents containing the term -- confirm with the producer).
        ignores: Currently unused; the filter that consulted it is disabled
            in the original code. Kept for interface compatibility.
        k1: Term-frequency saturation parameter.
        b: Length-normalization parameter.
        delta: Constant added to the normalized term-frequency component.

    Returns:
        The accumulated score (``0`` when the question yields no terms).
    """
    terms = []
    node = mecab.parseToNode(question)
    while node:
        fields = node.feature.split(",")
        # Sentence boundary markers carry no word and are skipped.
        if fields[0] != "BOS/EOS":
            terms.append(fields[6])
        node = node.next

    # The count file depends only on the option, not on the query term, so
    # resolve its path once instead of re-hashing for every term.
    option_hash = hashing.get_md5_hash(option)
    count_filepath = os.path.join(count_dir, option_hash + ".txt")

    score = 0
    for term in terms:
        nqi = nqis.get(term, 0)
        # Clamp at zero so very common terms never contribute negatively.
        idf = max(0, math.log((NUM_TOTAL_DOCS - nqi + 0.5) / (nqi + 0.5)))

        d, freq = get_d_and_frequency(count_filepath, term)

        numerator = freq * (k1 + 1)
        denominator = freq + k1 * (1 - b + b * d / AVGDL)
        score += idf * (numerator / denominator + delta)

    return score
def main(im_base_dir, feature_dim, features_save_dir):
    """
    Extract image features for every listed article and save them to disk.

    The article list ``<im_base_dir>/article_list.txt`` is read as a
    header-less table whose first three columns are the article name and two
    sub-directory components. Each article's images are looked up under
    ``<im_base_dir>/Images/<dir_1>/<dir_2>`` and the resulting features are
    saved as ``<hash of article name>.pt`` in ``features_save_dir``.

    The features come from a pretrained torchvision VGG16 whose last
    classifier layer is replaced by a newly constructed
    ``Linear(4096, feature_dim)`` (no weights are loaded for that layer here).

    Args:
        im_base_dir (str): Base directory of the image files
        feature_dim (int): Dimension of image features
        features_save_dir (str): Directory name to save the image features in
    """
    # Map each article hash to the directory holding its images.
    article_table = pd.read_table(
        os.path.join(im_base_dir, "article_list.txt"), header=None)

    im_dirs_by_hash = {}
    for _index, article_name, dir_1, dir_2, *_extra in article_table.itertuples(name=None):
        article_hash = hashing.get_md5_hash(article_name)
        im_dirs_by_hash[article_hash] = os.path.join(
            im_base_dir, "Images", str(dir_1), str(dir_2))

    # Make sure the output directory exists.
    os.makedirs(features_save_dir, exist_ok=True)

    # Pretrained VGG16 with its final classifier layer resized to feature_dim.
    vgg16 = torchvision.models.vgg16(pretrained=True)
    vgg16.classifier[6] = torch.nn.Linear(4096, feature_dim)
    vgg16.to(device)
    vgg16.eval()

    # Compute and store the features, one file per article.
    for article_hash, im_dir in tqdm(im_dirs_by_hash.items()):
        regions = get_pred_regions_as_images(im_dir)
        features = get_vgg16_output_from_regions(regions, vgg16, feature_dim)
        torch.save(features, os.path.join(features_save_dir, article_hash + ".pt"))
def create_image_embeddings(image_root_dir, article_list_filepath, embed_save_dir, predictor):
    """Compute region features for each article's images and save them.

    The article list is read as a tab-separated file; the first three columns
    of each row are the article name and two sub-directory components. Every
    file under ``<image_root_dir>/<sec1>/<sec2>`` is read with ``cv2.imread``,
    the decoded images are passed to ``get_region_features`` and the result is
    saved as ``<hash of article name>.pt`` in ``embed_save_dir``.

    Files that OpenCV cannot decode are skipped with a warning instead of
    being forwarded as ``None``, and files are processed in sorted path order
    so the result does not depend on directory listing order.

    NOTE(review): ``pd.read_csv`` is called without ``header=``, so the first
    line of the list file is treated as column names and yields no article.
    Confirm that the list file really has a header row.

    Args:
        image_root_dir: Root directory containing the image sub-directories.
        article_list_filepath: Path to the tab-separated article list.
        embed_save_dir: Directory to write the ``.pt`` files into (created if
            missing).
        predictor: Object forwarded unchanged to ``get_region_features``.
    """
    import warnings

    os.makedirs(embed_save_dir, exist_ok=True)

    df = pd.read_csv(article_list_filepath, encoding="utf_8", sep="\t")

    for row in tqdm(df.values):
        article_name, sec1, sec2 = row[:3]
        image_dir = os.path.join(image_root_dir, str(sec1), str(sec2))

        # Load images. glob returns paths in arbitrary order, hence the sort.
        images = []
        for image_file in sorted(glob.glob(os.path.join(image_dir, "*"))):
            image = cv2.imread(image_file)
            if image is None:
                # cv2.imread signals an unreadable/non-image file with None.
                warnings.warn("Skipping unreadable image file: {}".format(image_file))
                continue
            images.append(image)

        features = get_region_features(images, predictor)

        article_hash = hashing.get_md5_hash(article_name)
        save_filepath = os.path.join(embed_save_dir, article_hash + ".pt")
        torch.save(features, save_filepath)
def main(im_base_dir, embedding_dim, embeddings_save_dir):
    """
    Create image embeddings for every listed article and save them to disk.

    The article list ``<im_base_dir>/article_list.txt`` is read as a
    header-less table whose first three columns are the article name and two
    sub-directory components. Each article's images are looked up under
    ``<im_base_dir>/Images/<dir_1>/<dir_2>`` and the resulting embeddings are
    saved as ``<hash of article name>.pt`` in ``embeddings_save_dir``.

    A pretrained torchvision VGG16 and two newly constructed linear layers,
    ``Linear(1000, embedding_dim)`` and ``Linear(5, embedding_dim)``, are
    handed to ``create_im_embeddings``; no weights are loaded for the two
    linear layers here.

    Args:
        im_base_dir (str): Base directory of the image files
        embedding_dim (int): Dimension of the image embeddings
        embeddings_save_dir (str): Directory name to save the embeddings in
    """
    # Map each article hash to the directory holding its images.
    article_table = pd.read_table(
        os.path.join(im_base_dir, "article_list.txt"), header=None)

    im_dirs_by_hash = {}
    for _index, article_name, dir_1, dir_2, *_extra in article_table.itertuples(name=None):
        article_hash = hashing.get_md5_hash(article_name)
        im_dirs_by_hash[article_hash] = os.path.join(
            im_base_dir, "Images", str(dir_1), str(dir_2))

    # Make sure the output directory exists.
    os.makedirs(embeddings_save_dir, exist_ok=True)

    # Pretrained VGG16 in evaluation mode, plus the two projection layers.
    vgg16 = torchvision.models.vgg16(pretrained=True)
    vgg16.to(device)
    vgg16.eval()

    fc_vgg16 = nn.Linear(1000, embedding_dim).to(device)
    fc_c = nn.Linear(5, embedding_dim).to(device)

    # Compute and store the embeddings, one file per article.
    for article_hash, im_dir in tqdm(im_dirs_by_hash.items()):
        pred_regions = get_pred_regions_as_images(im_dir)
        im_embeddings = create_im_embeddings(
            pred_regions, vgg16, embedding_dim, fc_vgg16, fc_c)
        torch.save(
            im_embeddings,
            os.path.join(embeddings_save_dir, article_hash + ".pt"))
def main(input_dir, im_features_dir, save_dir):
    """Pre-compute and cache the per-example input embeddings.

    Reads the encoded text tensors and ``options_list.txt`` from
    ``input_dir``. For every example ``i`` it then builds, per option, the
    input embedding and token type ids, and saves them as
    ``<save_dir>/<i>/inputs_embeds.pt`` and
    ``<save_dir>/<i>/inputs_token_type_ids.pt``.

    For an option with an image feature file ``<hash of option name>.pt`` in
    ``im_features_dir``, text and image embeddings are combined by
    ``create_option_embedding``. Otherwise the text embedding is used with
    all-zero token type ids.

    ``options_list.txt`` holds one option name per line; a blank line ends the
    group of options belonging to one example.

    Args:
        input_dir: Directory with ``input_ids.pt``, ``attention_mask.pt``,
            ``token_type_ids.pt``, ``labels.pt`` and ``options_list.txt``.
        im_features_dir: Directory holding the ``.pt`` image feature files.
        save_dir: Directory to write the cache into (created if missing).
    """
    # Load encoded text from a cached file.
    logger.info("Load encoded text from {}.".format(input_dir))
    input_ids = torch.load(os.path.join(input_dir, "input_ids.pt"))
    # NOTE(review): the next three tensors are not used in the visible part of
    # this function. The loads are kept so a missing cache file still fails
    # here -- confirm whether they can be dropped.
    attention_mask = torch.load(os.path.join(input_dir, "attention_mask.pt"))
    token_type_ids = torch.load(os.path.join(input_dir, "token_type_ids.pt"))
    labels = torch.load(os.path.join(input_dir, "labels.pt"))

    # Load the list of options.
    list_filepath = os.path.join(input_dir, "options_list.txt")
    logger.info("Load the list of options. {}".format(list_filepath))

    with open(list_filepath, "r", encoding="UTF-8") as r:
        lines = r.read().splitlines()

    options = []
    ops = None
    for line in lines:
        if ops is None:
            ops = Options()

        if line == "":
            options.append(ops)
            ops = None
        else:
            ops.append(line)
    # Keep the last group even when the file lacks a terminating blank line.
    if ops is not None:
        options.append(ops)

    # Load a BERT model.
    logger.info("Load a pre-trained BERT model.")
    bert_model = BertModel.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    bert_model.to(device)

    # Create a directory to save the cache files in.
    os.makedirs(save_dir, exist_ok=True)

    # Without CUDA, saved tensors have to be remapped to the CPU on load;
    # with CUDA, torch.load is called with its defaults.
    if torch.cuda.is_available():
        load_kwargs = {}
    else:
        load_kwargs = {"map_location": torch.device("cpu")}

    # Take the option count from the data instead of hard-coding 20.
    num_examples, num_options = input_ids.size()[0], input_ids.size()[1]

    # Create input embeddings.
    logger.info("Start creating input embeddings.")
    for i in tqdm(range(num_examples)):
        inputs_embeds = torch.empty(num_options, 512, 768)
        # Token type ids are integer ids, so store them as long rather than
        # with torch.empty's default floating-point dtype.
        inputs_token_type_ids = torch.empty(num_options, 512, dtype=torch.long)

        text_embeddings = create_text_embeddings(bert_model, input_ids[i])
        ops = options[i]

        for j in range(num_options):
            article_hash = hashing.get_md5_hash(ops.get(j))
            im_features_filepath = os.path.join(im_features_dir, article_hash + ".pt")

            if os.path.exists(im_features_filepath):
                im_embedding = torch.load(im_features_filepath, **load_kwargs).to(device)
                option_embedding, option_token_type_ids = create_option_embedding(
                    text_embeddings[j], im_embedding)
            else:
                # No image features: text-only embedding, token types all zero.
                option_embedding = text_embeddings[j]
                option_token_type_ids = torch.zeros(512, dtype=torch.long).to(device)

            inputs_embeds[j] = option_embedding
            inputs_token_type_ids[j] = option_token_type_ids

        inputs_save_dir = os.path.join(save_dir, str(i))
        os.makedirs(inputs_save_dir, exist_ok=True)

        torch.save(inputs_embeds, os.path.join(inputs_save_dir, "inputs_embeds.pt"))
        torch.save(
            inputs_token_type_ids,
            os.path.join(inputs_save_dir, "inputs_token_type_ids.pt"))