Example #1
    def __getitem__(self, index, raw=False):
        # Each image has 5 captions: map the flat index to (image, caption).
        idx = index // 5
        idx_cap = index % 5

        path = self.content[idx][0]
        target = self.content[idx][1][idx_cap]

        if raw:
            return path, target

        img = Image.open(os.path.join(self.root, path)).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        #target = encode_sentence(target, self.params, self.dico)
        # Encode the caption with whichever embedding was configured.
        if self.word2id is None:
            if self.bpe:
                target = bpe_encode(target, self.embed)
            else:
                target = encode_sentence_fasttext(target, self.embed)
        else:
            target = encode_sentence(target, self.embed, self.word2id)
        return img, target
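A minimal usage sketch of the indexing scheme above; the "dataset" variable and the index are illustrative, not taken from the source.

# Hypothetical usage of the __getitem__ above; "dataset" is assumed to be an
# instance of the surrounding dataset class, with 5 captions per image.
path, caption = dataset.__getitem__(7, raw=True)  # index 7 -> image 1, caption 2: path and raw caption
img, encoded = dataset[7]                          # transformed image and encoded caption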
Example #2
    def __init__(
        self,
        root=path["COCO_ROOT"],
        coco_json_file_path=path["COCO_RESTVAL_SPLIT"],
        sset="train",
        transform=None,
        embed_type="bin",
        embed_file="/data/m.portaz/wiki.en.bin",
        embed_size=300,
    ):
        self.root = os.path.join(root, "images/")
        self.transform = transform

        with open(coco_json_file_path, "r") as f:
            datas = json.load(f)

        # Keep only the images belonging to the requested split.
        if sset == "train":
            self.content = [x for x in datas["images"] if x["split"] == "train"]
        elif sset == "trainrv":
            self.content = [
                x
                for x in datas["images"]
                if x["split"] == "train" or x["split"] == "restval"
            ]
        elif sset == "val":
            self.content = [x for x in datas["images"] if x["split"] == "val"]
        else:
            self.content = [x for x in datas["images"] if x["split"] == "test"]

        # Keep (relative image path, list of raw caption strings) per image.
        self.content = [
            (
                os.path.join(y["filepath"], y["filename"]),
                [x["raw"] for x in y["sentences"]],
            )
            for y in self.content
        ]

        self.word2id = None
        self.bert = False
        if embed_type == "bin":
            print("Using binary file dictionary")
            embed = fastText.load_model(embed_file)  # load the model once, not on every call
            self.tokenizer = lambda x: encode_sentence_fasttext(x, embed)
        elif embed_type == "multi" or embed_type == "bivec":
            print("Using .vec file")
            embed, _, word2id = load_vec(embed_file)
            self.tokenizer = lambda x: encode_sentence(x, embed, word2id)
        elif embed_type == "bert":
            print("Using bert embeddings")
            self.bertTokenizer = BertTokenizer.from_pretrained(
                "bert-base-uncased"
            )
            self.tokenizer = lambda x: torch.LongTensor(self.bertTokenizer.encode(text=x))
        elif embed_type == "xlm":
            print("Using XLM embeddings")
            self.xlmTokens = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
            self.tokenizer = lambda x: torch.LongTensor(self.xlmTokens.encode(text=x))
        else:
            print("Unknown embed type:", embed_type)
            print("Must be one of: bin, multi, bivec, bert or xlm")
            sys.exit(1)
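A hedged instantiation sketch; the class name CocoCaptionDataset is an assumption (only __init__ is shown above), and the argument values mirror the defaults and branches above.

# Hypothetical usage; CocoCaptionDataset is an assumed class name.
dataset = CocoCaptionDataset(
    sset="trainrv",     # train + restval splits
    transform=None,     # e.g. a torchvision transform pipeline
    embed_type="bert",  # selects the BertTokenizer branch above
)
ids = dataset.tokenizer("a man riding a horse")  # LongTensor of BERT token ids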
Example #3
    def __getitem__(self, index):
        #return self.sentences[index]
        if self.fastText:
            return encode_sentence_fasttext(
                self.sentences[index][0], self.embed), self.sentences[index][1]
        else:
            return encode_sentence(self.sentences[index][0],
                                   self.embed[0],
                                   self.embed[2],
                                   tokenize=False), self.sentences[index][1]
Example #4
    def __getitem__(self, index, raw=False):
        path = self.imList[int(index)]
        target = self.capList[int(index)]
        img = Image.open(path).convert('RGB')

        img = self.transform(img)

        # Encode the caption with the fastText model.
        target = encode_sentence_fasttext(target, self.embed, tokenize=False)

        return img, target
Example #5
def embedFile(args):
    print("Compute embeddings for the file:", args.file)
    embed = fastText.load_model(args.dict)
    nl = file_len(args.file)
    with open(args.file) as f, open(args.output, 'w') as fout:
        for i, line in enumerate(f):
            line = line.rstrip()
            print("%2.2f" % (i / nl * 100.0), "%", end='\r')
            # Each input line is "<id>\t<sentence>".
            sent_id, t = line.split("\t")
            es = encode_sentence_fasttext(t.split(' '), embed, tokenize=False)
            fout.write(sent_id + '\t' + str(es) + '\n')
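A sketch of driving embedFile directly; the argparse.Namespace fields match the attributes used above, and the file paths are placeholders.

import argparse

# Hypothetical invocation of embedFile; all paths are placeholders.
args = argparse.Namespace(
    file="captions.tsv",        # one "<id>\t<sentence>" per line
    dict="wiki.en.bin",         # fastText binary model
    output="captions.emb.tsv",  # written as "<id>\t<embedding>" per line
)
embedFile(args)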
Example #6
    def __init__(self, text_path, word_dict="/data/m.portaz/wiki.multi.en.vec"):
        with open(text_path) as f:
            self.sent_list = [line.rstrip("\n") for line in f]
        self.fastText = False

        # Choose the encoder from the dictionary file extension.
        if word_dict.endswith(".vec"):
            emb, _, wordsID = load_vec(word_dict)
            self.encoding = lambda x: encode_sentence(x, emb, wordsID, tokenize=False)
        elif word_dict.endswith(".bin"):
            emb = fastText.load_model(word_dict)
            self.encoding = lambda x: encode_sentence_fasttext(x, emb)
            self.fastText = True
        else:
            print("Error: unknown dictionary type:", word_dict)
            sys.exit(1)
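A minimal usage sketch; TextDataset is an assumed class name and the sentence file path is a placeholder.

# Hypothetical usage; the class name and file path are assumptions.
ds = TextDataset("sentences.txt", word_dict="/data/m.portaz/wiki.en.bin")
vec = ds.encoding(ds.sent_list[0])  # fastText embedding of the first sentence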