def __getitem__(self, index, raw=False):
    """Return the (image, encoded caption) pair addressed by *index*.

    Each image in ``self.content`` carries 5 captions, so the flat
    ``index`` maps to image ``index // 5`` and caption ``index % 5``.

    Args:
        index: flat sample index in ``[0, 5 * len(self.content))``.
        raw: when True, skip image loading and caption encoding and
            return ``(path, raw_caption)`` instead.

    Returns:
        ``(img, target)`` — the (optionally transformed) RGB image and
        the encoded caption — or ``(path, target)`` when ``raw``.
    """
    # divmod replaces the original float division + int() casts.
    img_idx, cap_idx = divmod(index, 5)
    path, captions = self.content[img_idx]
    target = captions[cap_idx]
    if raw:
        return path, target

    img = Image.open(os.path.join(self.root, path)).convert('RGB')
    if self.transform is not None:
        img = self.transform(img)

    # Three encoding modes: word2id lookup table, BPE, or raw fastText.
    if self.word2id is None:
        if self.bpe:
            target = bpe_encode(target, self.embed)
        else:
            target = encode_sentence_fasttext(target, self.embed)
    else:
        target = encode_sentence(target, self.embed, self.word2id)
    return img, target
def __init__(
    self,
    root=path["COCO_ROOT"],
    coco_json_file_path=path["COCO_RESTVAL_SPLIT"],
    sset="train",
    transform=None,
    embed_type="bin",
    embed_file="/data/m.portaz/wiki.en.bin",
    embed_size=300,
):
    """COCO captions dataset over the Karpathy/RestVal JSON split.

    Args:
        root: COCO root directory; images are read from ``root/images/``.
        coco_json_file_path: JSON file with ``images`` entries carrying
            ``split``, ``filepath``, ``filename`` and ``sentences``.
        sset: one of ``train``, ``trainrv`` (train + restval), ``val``;
            anything else selects the ``test`` split.
        transform: optional callable applied to loaded images.
        embed_type: ``bin`` (fastText binary), ``multi``/``bivec``
            (.vec dictionaries), ``bert`` or ``xlm``.
        embed_file: path to the embedding model/dictionary.
        embed_size: embedding dimensionality (kept for interface
            compatibility; not used here).
    """
    self.root = os.path.join(root, "images/")
    self.transform = transform

    with open(coco_json_file_path, "r") as f:
        datas = json.load(f)

    if sset == "train":
        self.content = [x for x in datas["images"] if x["split"] == "train"]
    elif sset == "trainrv":
        self.content = [
            x
            for x in datas["images"]
            if x["split"] == "train" or x["split"] == "restval"
        ]
    elif sset == "val":
        self.content = [x for x in datas["images"] if x["split"] == "val"]
    else:
        self.content = [x for x in datas["images"] if x["split"] == "test"]

    # Reduce each entry to (relative image path, list of raw captions).
    self.content = [
        (
            os.path.join(y["filepath"], y["filename"]),
            [x["raw"] for x in y["sentences"]],
        )
        for y in self.content
    ]

    self.word2id = None
    self.bert = False
    if embed_type == "bin":
        print("Using binary file dictionary")
        # BUG FIX: load the model ONCE here. The original called
        # fastText.load_model(embed_file) inside the lambda, reloading
        # the (large) model on every single tokenizer invocation.
        embed = fastText.load_model(embed_file)
        self.tokenizer = lambda x: encode_sentence_fasttext(x, embed)
    elif embed_type == "multi" or embed_type == "bivec":
        print("Using .vec file")
        embed, _, word2id = load_vec(embed_file)
        self.tokenizer = lambda x: encode_sentence(x, embed, word2id)
    elif embed_type == "bert":
        print("Using bert embeddings")
        self.bertTokenizer = BertTokenizer.from_pretrained(
            "bert-base-uncased"
        )
        self.tokenizer = lambda x: torch.LongTensor(
            self.bertTokenizer.encode(text=x)
        )
    elif embed_type == "xlm":
        print("Using XLM embeddings")
        self.xlmTokens = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
        self.tokenizer = lambda x: torch.LongTensor(
            self.xlmTokens.encode(text=x)
        )
    else:
        print("Unknown embed type :", embed_type)
        # BUG FIX: list every supported type, and exit non-zero on error
        # (the original exited with status 0, signalling success).
        print("Must be : bin, multi, bivec, bert or xlm")
        sys.exit(1)
def __getitem__(self, index):
    """Return ``(encoded_sentence, label)`` for sample *index*.

    The sentence is encoded with fastText when ``self.fastText`` is set,
    otherwise with the (embeddings, _, word2id) triple in ``self.embed``.
    """
    entry = self.sentences[index]
    sentence, label = entry[0], entry[1]
    if self.fastText:
        encoded = encode_sentence_fasttext(sentence, self.embed)
    else:
        encoded = encode_sentence(
            sentence, self.embed[0], self.embed[2], tokenize=False
        )
    return encoded, label
def __getitem__(self, index, raw=False):
    """Load image *index* and its fastText-encoded caption.

    Returns:
        ``(image, caption)`` — the transformed RGB image and the
        encoded caption tensor.
    """
    i = int(index)
    image = Image.open(self.imList[i]).convert('RGB')
    image = self.transform(image)
    caption = encode_sentence_fasttext(self.capList[i], self.embed, False)
    return image, caption
def embedFile(args):
    """Embed every line of ``args.file`` and write results to ``args.output``.

    Each input line is ``"<id>\\t<text>"``; each output line is
    ``"<id>\\t<embedding>"``. Progress is printed to stdout.

    Args:
        args: namespace with ``file`` (input path), ``dict`` (fastText
            binary model path) and ``output`` (output path).
    """
    print("Compute embeddings for the file:", args.file)
    embed = fastText.load_model(args.dict)
    n_lines = file_len(args.file)
    # BUG FIX: use context managers — the original never closed fout,
    # risking truncated/unflushed output.
    with open(args.file) as fin, open(args.output, 'w') as fout:
        for line_no, line in enumerate(fin):
            line = line.rstrip()
            # BUG FIX: '%' needs no escape; the original "\%" printed a
            # literal backslash before the percent sign.
            print("%2.2f" % (line_no / n_lines * 100.0), "%", end='\r')
            # BUG FIX: do not reuse the loop counter name for the record
            # id (the original shadowed ``i`` with the split result).
            ident, text = line.split("\t")
            es = encode_sentence_fasttext(text.split(' '), embed, tokenize=False)
            fout.write(ident + '\t' + str(es) + '\n')
def __init__(self, text_path, word_dict="/data/m.portaz/wiki.multi.en.vec"):
    """Sentence dataset backed by a word-embedding dictionary.

    Args:
        text_path: text file with one sentence per line.
        word_dict: embedding dictionary path; ``.vec`` selects aligned
            word vectors loaded via ``load_vec``, ``.bin`` selects a
            fastText binary model. Any other extension aborts.
    """
    # BUG FIX: the original leaked the file handle (open().readlines()).
    with open(text_path) as f:
        self.sent_list = [line.rstrip("\n") for line in f]
    self.fastText = False
    if word_dict.endswith(".vec"):
        # BUG FIX: load the dictionary the caller asked for — the
        # original ignored ``word_dict`` here and always loaded the
        # hard-coded default path.
        emb, _, wordsID = load_vec(word_dict)
        self.encoding = lambda x: encode_sentence(x, emb, wordsID, tokenize=False)
    elif word_dict.endswith(".bin"):
        emb = fastText.load_model(word_dict)
        self.encoding = lambda x: encode_sentence_fasttext(x, emb)
        self.fastText = True
    else:
        print("Error : unknown dictionary type : ", word_dict)
        sys.exit(1)