Example #1
import collections
import os

import hashing  # project-local helper module providing get_md5_hash


def count_context_words(mecab, title, context, save_dir):
    """Count lemma frequencies in the context and save them to a file
    named after the MD5 hash of the title."""
    genkeis = []

    node = mecab.parseToNode(context)
    while node:
        features = node.feature.split(",")

        #Skip the BOS/EOS pseudo-nodes that MeCab inserts at sentence
        #boundaries.
        hinsi = features[0]
        if hinsi == "BOS/EOS":
            node = node.next
            continue

        #features[6] is the lemma (原形) in the IPAdic feature format.
        genkei = features[6]
        genkeis.append(genkei)

        node = node.next

    counter = collections.Counter(genkeis)

    #The first line of the output file is the total token count; the
    #remaining lines are "lemma\tcount" pairs in descending frequency.
    title_hash = hashing.get_md5_hash(title)
    save_filepath = os.path.join(save_dir, title_hash + ".txt")
    with open(save_filepath, "w", encoding="utf_8", newline="") as w:
        w.write(str(len(genkeis)))
        w.write("\n")

        for tup in counter.most_common():
            w.write(tup[0])
            w.write("\t")
            w.write(str(tup[1]))
            w.write("\n")
Example #2
import os

import torch

import hashing  # project-local helper module providing get_md5_hash

#`device`, create_text_embeddings, and create_option_embedding are assumed to
#be defined elsewhere in the project.


def create_inputs_embeds_and_token_type_ids(bert_model, input_ids, indices,
                                            options, im_features_dir):
    batch_size = input_ids.size()[0]
    num_options = input_ids.size()[1]

    #512 is the maximum sequence length and 768 the BERT hidden size.
    inputs_embeds = torch.empty(batch_size, num_options, 512, 768).to(device)
    inputs_token_type_ids = torch.empty(batch_size, num_options, 512,
                                        dtype=torch.long).to(device)

    for i in range(batch_size):
        text_embeddings = create_text_embeddings(bert_model, input_ids[i])

        ops = options[indices[i]]
        for j in range(num_options):
            article_name = ops.get(j)
            article_hash = hashing.get_md5_hash(article_name)

            option_embedding = None
            inputs_token_type_ids_tmp = None
            im_features_filepath = os.path.join(im_features_dir,
                                                article_hash + ".pt")

            #If pre-computed image features exist for this option, fuse them
            #with the text embedding; otherwise fall back to text only.
            if os.path.exists(im_features_filepath):
                if torch.cuda.is_available():
                    im_embedding = torch.load(im_features_filepath).to(device)
                else:
                    im_embedding = torch.load(
                        im_features_filepath,
                        map_location=torch.device("cpu")).to(device)

                option_embedding, inputs_token_type_ids_tmp = create_option_embedding(
                    text_embeddings[j], im_embedding)
            else:
                option_embedding = text_embeddings[j]
                inputs_token_type_ids_tmp = torch.zeros(
                    512, dtype=torch.long).to(device)

            inputs_embeds[i, j] = option_embedding
            inputs_token_type_ids[i, j] = inputs_token_type_ids_tmp

    return inputs_embeds, inputs_token_type_ids
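A call sketch for the function above, showing the expected tensor shapes.
`bert_model` and `options` are assumed to have been prepared as in the final
example (Example #7); the features directory is a placeholder:

#Hypothetical shapes: a batch of 2 questions with 20 options of 512 tokens each.
input_ids = torch.randint(0, 32000, (2, 20, 512))
indices = [0, 1]  # row i of the batch uses options[indices[i]]

inputs_embeds, token_type_ids = create_inputs_embeds_and_token_type_ids(
    bert_model, input_ids, indices, options, "./im_features")
print(inputs_embeds.shape)   # torch.Size([2, 20, 512, 768])
print(token_type_ids.shape)  # torch.Size([2, 20, 512])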
Example #3
import math
import os

import hashing  # project-local helper module providing get_md5_hash

#NUM_TOTAL_DOCS, AVGDL, and get_d_and_frequency are assumed to be defined
#elsewhere in the project.


def calc_score(mecab,
               question,
               option,
               count_dir,
               nqis,
               ignores,
               k1=1.6,
               b=0.75,
               delta=1.0):
    """Score `option` against `question` with BM25+.

    For each question lemma q with document frequency n(q):
        IDF(q) = max(0, log((NUM_TOTAL_DOCS - n(q) + 0.5) / (n(q) + 0.5)))
        score += IDF(q) * (f*(k1+1) / (f + k1*(1 - b + b*d/AVGDL)) + delta)
    where f is the lemma's frequency in the option document and d is the
    document length. The `ignores` stop-word list is currently unused.
    """
    #Extract the lemmas of the question with MeCab.
    genkeis = []
    node = mecab.parseToNode(question)
    while node:
        features = node.feature.split(",")

        hinsi = features[0]
        if hinsi == "BOS/EOS":
            node = node.next
            continue

        genkei = features[6]
        genkeis.append(genkei)

        node = node.next

    #The option's word-count file does not change per lemma, so resolve it
    #once outside the loop.
    option_hash = hashing.get_md5_hash(option)
    count_filepath = os.path.join(count_dir, option_hash + ".txt")

    score = 0
    for genkei in genkeis:
        #n(q): the number of documents containing this lemma.
        nqi = 0
        if genkei in nqis:
            nqi = nqis[genkei]

        idf = math.log((NUM_TOTAL_DOCS - nqi + 0.5) / (nqi + 0.5))
        idf = max(0, idf)

        d, freq = get_d_and_frequency(count_filepath, genkei)

        numerator = freq * (k1 + 1)
        denominator = freq + k1 * (1 - b + b * d / AVGDL)

        score += idf * (numerator / denominator + delta)

    return score
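A usage sketch for the scorer above, assuming the word-count files from
Example #1 already exist under ./word_counts. The question, option, and
document frequencies are illustrative assumptions:

import MeCab

mecab = MeCab.Tagger()
nqis = {"日本": 45000, "島国": 1200}  # hypothetical document frequencies
score = calc_score(mecab,
                   question="日本は何と呼ばれる国か。",  # hypothetical question
                   option="島国",                        # hypothetical option title
                   count_dir="./word_counts",
                   nqis=nqis,
                   ignores=[])
print(score)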
Example #4
import os

import pandas as pd
import torch
import torchvision
from tqdm import tqdm

import hashing  # project-local helper module providing get_md5_hash

#`device`, get_pred_regions_as_images, and get_vgg16_output_from_regions are
#assumed to be defined elsewhere in the project.


def main(im_base_dir, feature_dim, features_save_dir):
    """
    Main function

    Args:
        im_base_dir (str): Base directory of the image files
        feature_dim (int): Dimension of image features
        features_save_dir (str): Directory name to save the image features in
    """
    #Load the article list.
    article_list_filepath = os.path.join(im_base_dir, "article_list.txt")
    df = pd.read_table(article_list_filepath, header=None)

    articles = {}
    for row in df.itertuples(name=None):
        article_name = row[1]
        dir_1 = row[2]
        dir_2 = row[3]

        article_hash = hashing.get_md5_hash(article_name)

        im_dir = os.path.join(im_base_dir, "Images", str(dir_1), str(dir_2))
        articles[article_hash] = im_dir

    #Create a directory to save the image features in.
    os.makedirs(features_save_dir, exist_ok=True)

    #Create a VGG16 model and replace its final classifier layer so that
    #the output dimension matches feature_dim.
    vgg16 = torchvision.models.vgg16(pretrained=True)
    vgg16.classifier[6] = torch.nn.Linear(4096, feature_dim)
    vgg16.to(device)
    vgg16.eval()

    #Create image features.
    for article_hash, im_dir in tqdm(articles.items()):
        regions = get_pred_regions_as_images(im_dir)
        features = get_vgg16_output_from_regions(regions, vgg16, feature_dim)

        features_save_filepath = os.path.join(features_save_dir,
                                              article_hash + ".pt")
        torch.save(features, features_save_filepath)
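A minimal invocation sketch; the dataset root (containing article_list.txt and
an Images/ tree) and the output directory are placeholder paths:

main(im_base_dir="./wikipedia_images",  # hypothetical dataset root
     feature_dim=768,
     features_save_dir="./im_features")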
Example #5
import glob
import os

import cv2
import pandas as pd
import torch
from tqdm import tqdm

import hashing  # project-local helper module providing get_md5_hash

#get_region_features is assumed to be defined elsewhere in the project.


def create_image_embeddings(image_root_dir, article_list_filepath,
                            embed_save_dir, predictor):
    os.makedirs(embed_save_dir, exist_ok=True)

    df = pd.read_csv(article_list_filepath, encoding="utf_8", sep="\t")
    for row in tqdm(df.values):
        article_name, sec1, sec2 = row[:3]
        image_dir = os.path.join(image_root_dir, str(sec1), str(sec2))

        #Load images.
        pathname = os.path.join(image_dir, "*")
        image_files = glob.glob(pathname)
        images = list()
        for image_file in image_files:
            image = cv2.imread(image_file)
            #cv2.imread returns None for unreadable files; skip those.
            if image is None:
                continue
            images.append(image)

        features = get_region_features(images, predictor)

        article_hash = hashing.get_md5_hash(article_name)
        save_filepath = os.path.join(embed_save_dir, article_hash + ".pt")

        torch.save(features, save_filepath)
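A usage sketch for the function above. The original code does not say what
`predictor` is; a Detectron2 DefaultPredictor is one plausible choice and is
shown here purely as an assumption, with a placeholder config path:

#Hypothetical setup: a Detectron2 predictor as the region detector.
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file("mask_rcnn_config.yaml")  # placeholder config file
predictor = DefaultPredictor(cfg)

create_image_embeddings(image_root_dir="./Images",
                        article_list_filepath="./article_list.txt",
                        embed_save_dir="./im_embeddings",
                        predictor=predictor)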
Example #6
import os

import pandas as pd
import torch
import torch.nn as nn
import torchvision
from tqdm import tqdm

import hashing  # project-local helper module providing get_md5_hash

#`device`, get_pred_regions_as_images, and create_im_embeddings are assumed
#to be defined elsewhere in the project.


def main(im_base_dir, embedding_dim, embeddings_save_dir):
    #Load the article list.
    article_list_filepath = os.path.join(im_base_dir, "article_list.txt")
    df = pd.read_table(article_list_filepath, header=None)

    articles = {}
    for row in df.itertuples(name=None):
        article_name = row[1]
        dir_1 = row[2]
        dir_2 = row[3]

        article_hash = hashing.get_md5_hash(article_name)

        im_dir = os.path.join(im_base_dir, "Images", str(dir_1), str(dir_2))
        articles[article_hash] = im_dir

    #Create a directory to save the image embeddings in.
    os.makedirs(embeddings_save_dir, exist_ok=True)

    #Create a VGG16 model.
    vgg16 = torchvision.models.vgg16(pretrained=True)
    vgg16.to(device)
    vgg16.eval()

    #Linear layers that project the 1000-d VGG16 logits and a 5-d
    #per-region vector into the common embedding space.
    fc_vgg16 = nn.Linear(1000, embedding_dim).to(device)
    fc_c = nn.Linear(5, embedding_dim).to(device)

    #Create image embeddings.
    for article_hash, im_dir in tqdm(articles.items()):
        pred_regions = get_pred_regions_as_images(im_dir)
        im_embeddings = create_im_embeddings(pred_regions, vgg16,
                                             embedding_dim, fc_vgg16, fc_c)

        embeddings_save_filepath = os.path.join(embeddings_save_dir,
                                                article_hash + ".pt")
        torch.save(im_embeddings, embeddings_save_filepath)
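A minimal invocation sketch with placeholder paths, mirroring the previous
example:

main(im_base_dir="./wikipedia_images",  # hypothetical dataset root
     embedding_dim=768,
     embeddings_save_dir="./im_embeddings")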
Example #7
import logging
import os

import torch
from tqdm import tqdm
from transformers import BertModel

import hashing  # project-local helper module providing get_md5_hash

logger = logging.getLogger(__name__)

#`device`, `Options`, create_text_embeddings, and create_option_embedding are
#assumed to be defined elsewhere in the project.


def main(input_dir, im_features_dir, save_dir):
    #Load encoded text from a cached file.
    logger.info("Load encoded text from {}.".format(input_dir))

    input_ids = torch.load(os.path.join(input_dir, "input_ids.pt"))
    #attention_mask, token_type_ids, and labels are loaded here but are not
    #used in the loop below.
    attention_mask = torch.load(os.path.join(input_dir, "attention_mask.pt"))
    token_type_ids = torch.load(os.path.join(input_dir, "token_type_ids.pt"))
    labels = torch.load(os.path.join(input_dir, "labels.pt"))

    #Load the list of options.
    list_filepath = os.path.join(input_dir, "options_list.txt")

    logger.info("Load the list of options. {}".format(list_filepath))

    with open(list_filepath, "r", encoding="UTF-8") as r:
        lines = r.read().splitlines()

    #The options file consists of blank-line-delimited blocks; each block
    #lists the option article names for one question.
    options = []
    ops = None
    for line in lines:
        if ops is None:
            ops = Options()

        if line == "":
            options.append(ops)
            ops = None
        else:
            ops.append(line)

    #Load a BERT model.
    logger.info("Load a pre-trained BERT model.")

    bert_model = BertModel.from_pretrained(
        "cl-tohoku/bert-base-japanese-whole-word-masking")
    bert_model.to(device)

    #Create a directory to save the cache files in.
    os.makedirs(save_dir, exist_ok=True)

    #Create input embeddings.
    logger.info("Start creating input embeddings.")

    for i in tqdm(range(input_ids.size()[0])):
        #20 options per question, 512 tokens each, 768-d BERT hidden states.
        #Token type IDs must be integer tensors, as in Example #2.
        inputs_embeds = torch.empty(20, 512, 768)
        inputs_token_type_ids = torch.empty(20, 512, dtype=torch.long)

        text_embeddings = create_text_embeddings(bert_model, input_ids[i])

        ops = options[i]
        for j in range(20):
            article_name = ops.get(j)
            article_hash = hashing.get_md5_hash(article_name)

            option_embedding = None
            inputs_token_type_ids_tmp = None
            im_features_filepath = os.path.join(im_features_dir,
                                                article_hash + ".pt")
            if os.path.exists(im_features_filepath):
                if torch.cuda.is_available():
                    im_embedding = torch.load(im_features_filepath).to(device)
                else:
                    im_embedding = torch.load(
                        im_features_filepath,
                        map_location=torch.device("cpu")).to(device)

                option_embedding, inputs_token_type_ids_tmp = create_option_embedding(
                    text_embeddings[j], im_embedding)
            else:
                option_embedding = text_embeddings[j]
                inputs_token_type_ids_tmp = torch.zeros(
                    512, dtype=torch.long).to(device)

            inputs_embeds[j] = option_embedding
            inputs_token_type_ids[j] = inputs_token_type_ids_tmp

        inputs_save_dir = os.path.join(save_dir, str(i))
        os.makedirs(inputs_save_dir, exist_ok=True)
        torch.save(inputs_embeds,
                   os.path.join(inputs_save_dir, "inputs_embeds.pt"))
        torch.save(inputs_token_type_ids,
                   os.path.join(inputs_save_dir, "inputs_token_type_ids.pt"))
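A minimal invocation sketch; the cache directories are placeholders and must
contain the tensors and options_list.txt that the function loads:

main(input_dir="./encoded_inputs",    # hypothetical cache of encoded text
     im_features_dir="./im_features",
     save_dir="./inputs_embeds_cache")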