def test_character_token_embedder(self):
    vocab = Dictionary()
    vocab.add_symbol('hello')
    vocab.add_symbol('there')

    embedder = CharacterTokenEmbedder(
        vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2
    )

    test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
    max_len = max(len(s) for s in test_sents)
    input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad())
    for i in range(len(test_sents)):
        input[i][0] = vocab.eos()
        for j in range(len(test_sents[i])):
            input[i][j + 1] = vocab.index(test_sents[i][j])
        input[i][j + 2] = vocab.eos()
    embs = embedder(input)

    assert embs.size() == (len(test_sents), max_len + 2, 5)
    self.assertAlmostEqual(embs[0][0], embs[1][0])
    self.assertAlmostEqual(embs[0][0], embs[0][-1])
    self.assertAlmostEqual(embs[0][1], embs[2][1])
    self.assertAlmostEqual(embs[0][3], embs[1][1])

    embs.sum().backward()
    assert embedder.char_embeddings.weight.grad is not None
def create_lang_dictionary(cls, langs):
    unk = "<unk>"
    # hack to remove symbols other than unk as they are not needed by lang dict
    lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk)
    for lang in langs:
        lang_dict.add_symbol(lang)
    return lang_dict
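# A minimal usage sketch for create_lang_dictionary, assuming fairseq's Dictionary:
# because every special symbol is mapped to "<unk>" and add_symbol returns the
# existing index for a duplicate, the dictionary ends up with a single special
# entry, so language IDs start at index 1. "MultilingualTask" is a hypothetical
# enclosing class used only for illustration.
langs = ["en", "de", "fr"]
lang_dict = MultilingualTask.create_lang_dictionary(langs)
assert lang_dict.nspecial == 1                        # only "<unk>" is special
assert [lang_dict.index(lang) for lang in langs] == [1, 2, 3]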
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol("word{}".format(i))
    logger.info("dictionary: {} types".format(len(dictionary)))
    return cls(args, dictionary)
def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
    d = Dictionary()
    for s in data:
        for token in s:
            d.add_symbol(token)
    d.finalize()
    return d
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
def test_huffman_compresses(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()
    with TemporaryDirectory() as dirname:
        prefix = os.path.join(dirname, "huffman")
        build_dataset(prefix, data, coder)
        prefix_mmap = os.path.join(dirname, "mmap")
        mmap_builder = indexed_dataset.make_builder(
            indexed_dataset.data_file_path(prefix_mmap),
            "mmap",
            vocab_size=len(POPULATION),
        )
        dictionary = Dictionary()
        for c in POPULATION:
            dictionary.add_symbol(c)
        dictionary.finalize()
        for sentence in data:
            mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
        mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))
        huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
        mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
        self.assertLess(huff_size, mmap_size)
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol('word{}'.format(i))
    print('| dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
def build_word_dict(word_embed_path):
    word_dict = Dictionary()
    with open(word_embed_path, 'r') as f:
        for line in f:
            word = line.split(' ', 1)[0]
            word_dict.add_symbol(word)
    word_dict.finalize(padding_factor=1)
    return word_dict
def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        symbol = f"madeupword{i:04d}"
        d.add_symbol(symbol, n=0)
        i += 1
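# A minimal usage sketch for pad_dict, assuming fairseq's Dictionary (which starts
# with 4 special symbols: <s>, <pad>, </s>, <unk>): it appends dummy "madeupword"
# entries so that, once num_extra_symbols more symbols are added later, the total
# size is a multiple of padding_factor.
from fairseq.data import Dictionary

d = Dictionary()                      # len(d) == 4 (special symbols only)
for w in ["a", "b", "c", "d", "e"]:
    d.add_symbol(w)                   # len(d) == 9
pad_dict(d, num_extra_symbols=1)      # adds 6 madeupwords: 15 + 1 == 16, a multiple of 8
assert (len(d) + 1) % 8 == 0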
def _t2c_to_tsr(
    self, t2c: Dict[str, List[str]], dict: Dictionary
) -> Dict[int, torch.LongTensor]:
    res_dict = {}
    for k in t2c.keys():
        res_dict[dict.add_symbol(k)] = torch.LongTensor(
            [dict.add_symbol(v) for v in t2c[k]]
        )
    return res_dict
class DummyMaskedLMTask(FairseqTask):
    def __init__(self, cfg: DummyMaskedLMConfig):
        super().__init__(cfg)

        self.dictionary = Dictionary()
        for i in range(cfg.dict_size):
            self.dictionary.add_symbol("word{}".format(i))
        logger.info("dictionary: {} types".format(len(self.dictionary)))
        # add mask token
        self.mask_idx = self.dictionary.add_symbol("<mask>")
        self.dictionary.pad_to_multiple_(8)  # often faster if divisible by 8

        mask_idx = 0
        pad_idx = 1
        seq = torch.arange(cfg.tokens_per_sample) + pad_idx + 1
        mask = torch.arange(2, cfg.tokens_per_sample, 7)  # ~15%
        src = seq.clone()
        src[mask] = mask_idx
        tgt = torch.full_like(seq, pad_idx)
        tgt[mask] = seq[mask]

        self.dummy_src = src
        self.dummy_tgt = tgt

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if self.cfg.batch_size is not None:
            bsz = self.cfg.batch_size
        else:
            bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample)
        self.datasets[split] = DummyDataset(
            {
                "id": 1,
                "net_input": {
                    "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]),
                    "src_lengths": torch.full(
                        (bsz,), self.cfg.tokens_per_sample, dtype=torch.long
                    ),
                },
                "target": torch.stack([self.dummy_tgt for _ in range(bsz)]),
                "nsentences": bsz,
                "ntokens": bsz * self.cfg.tokens_per_sample,
            },
            num_items=self.cfg.dataset_size,
            item_size=self.cfg.tokens_per_sample,
        )

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary
def build_sememe_dict(datapath):
    sememe_dict = Dictionary()
    with open(os.path.join(datapath, 'HowNet.edge'), 'r') as f:
        for line in f:
            sememes = line.strip().split('\t')[1]
            for s in sememes.split():
                sememe_dict.add_symbol(s)
    sememe_dict.finalize(threshold=5, padding_factor=1)
    return sememe_dict
def to_dictionary(self) -> Dictionary:
    dictionary = Dictionary(bos=self.bos, unk=self.unk, pad=self.pad, eos=self.eos)
    for n in self:
        dictionary.add_symbol(n.symbol, n=n.count)
    dictionary.finalize()
    return dictionary
def label_schema_as_dictionary(label_schema):
    label_dict = Dictionary()
    labels = list(label_schema.labels)
    assert len(labels) == len(set(labels))
    for label in labels:
        label_dict.add_symbol(label)
    return label_dict
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol('word{}'.format(i))
    logger.info('dictionary: {} types'.format(len(dictionary)))
    args.max_source_positions = args.src_len + dictionary.pad() + 2
    args.max_target_positions = args.tgt_len + dictionary.pad() + 2
    return cls(args, dictionary)
def get_bnids_dictionary(cls) -> Dictionary:
    if cls._bnids_dictionary is None:
        src_dictionary = cls.get_offsets_dictionary()
        tgt_dictionary = Dictionary()
        string_map = cls.get_offset_to_bnids_map()
        for idx, wn in enumerate(src_dictionary.symbols):
            if wn.startswith('wn:'):
                tgt_dictionary.add_symbol(string_map[wn])
        tgt_dictionary.finalize()
        cls._bnids_dictionary = tgt_dictionary
    return cls._bnids_dictionary
def write_dictionary(model_dir, lst):
    '''Write out dictionary in fairseq format.'''
    joined_dict = Dictionary()
    for toks in lst:
        for t in toks:
            joined_dict.add_symbol(t)
    print('| dictionary: {} types'.format(len(joined_dict)))
    with open(model_dir + '/dict.txt', 'w') as fd:
        joined_dict.save(fd)
def _get_test_data_with_word_vocab(self, append_eos=True):
    """
    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocabs: word vocab
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is true
        src_lengths: and source lengths.
    """
    vocab = Dictionary()
    vocab.add_symbol("hello")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("you")
    vocab.add_symbol("new")
    vocab.add_symbol("york")
    src_tokens = [
        ["hello", "new", "york", "you"],
        ["how", "are", "you", "new", "york"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def load_target_dictionary(self):
    if self.cfg.labels:
        dict = Dictionary(
            bos=self.bos,
            pad=self.pad,
            eos=self.eos,
            unk=self.unk,
            from_tokenizer=True,
        )
        dict.bos_index = self.tokenizer.encoder[self.bos]
        dict.pad_index = self.tokenizer.encoder[self.pad]
        dict.eos_index = self.tokenizer.encoder[self.eos]
        dict.unk_index = self.tokenizer.encoder[self.unk]
        for symbol in self.tokenizer.encoder.keys():
            dict.add_symbol(symbol)
        return dict
    return None
def augment_dictionary(
    dictionary: Dictionary,
    language_list: List[str],
    lang_tok_style: str,
    langtoks_specs: Sequence[str] = (LangTokSpec.main.value,),
    extra_data: Optional[Dict[str, str]] = None,
) -> None:
    for spec in langtoks_specs:
        for language in language_list:
            dictionary.add_symbol(
                get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec)
            )

    if lang_tok_style == LangTokStyle.mbart.value or (
        extra_data is not None and LangTokSpec.mono_dae.value in extra_data
    ):
        dictionary.add_symbol("<mask>")
def ensure_symbols_are_present(
    dictionary: Dictionary, symbols: List[str], ok_to_increase_dict_size: bool
) -> None:
    """
    Ensure that the symbols in the source and target dictionary are present.

    Makes changes to the dictionaries in-place.
    """
    original_size = len(dictionary)
    _ = remove_madeupwords_from_dictionary(dictionary)
    for symbol in symbols:
        dictionary.add_symbol(symbol)
    dictionary.pad_to_multiple_(8)
    if not ok_to_increase_dict_size:
        # Let's not crash - but rather point out that we are not allowed to
        # increase the dictionary size.
        if len(dictionary) != original_size:
            logger.warning(
                "The dictionary size changed. The model loading will probably fail."
            )
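# A minimal usage sketch for ensure_symbols_are_present, assuming fairseq's
# Dictionary and the remove_madeupwords_from_dictionary helper referenced above:
# existing "madeupword" padding entries are stripped, the required symbols are
# added, and the dictionary is re-padded to a multiple of 8. With
# ok_to_increase_dict_size=False it only warns (rather than raising) when the
# final size differs from the original.
from fairseq.data import Dictionary

d = Dictionary()
for w in ["hello", "world"]:
    d.add_symbol(w)
d.pad_to_multiple_(8)        # simulate a dictionary that was saved with padding entries
ensure_symbols_are_present(d, ["<mask>"], ok_to_increase_dict_size=True)
assert d.index("<mask>") != d.unk() and len(d) % 8 == 0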
def load_label_dictionary(cls, args, filename, **kwargs):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    label_schema = parse_label_schema(filename)

    label_dict = Dictionary()
    labels = list(label_schema.labels)
    assert labels[0] == "NULL", "Expected label at index 0 to be 'NULL'"
    assert len(labels) == len(set(labels))
    for label in labels:
        label_dict.add_symbol(label)

    assert (
        label_dict.symbols[label_dict.nspecial] == "NULL"
    ), "Expected first nonspecial token to be 'NULL'"

    return label_dict, label_schema
def setup_task(cls, args, **kwargs):
    data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml)
    tgt_dict = None
    infer_tgt_lang_id = None
    if args.target_is_code:
        if data_cfg.prepend_tgt_lang_tag_as_bos:
            # dictionary with language tags
            dict_path = Path(args.data) / data_cfg.vocab_filename
            if not dict_path.is_file():
                raise FileNotFoundError(
                    f"Dict has to be provided when setting prepend_tgt_lang_tag_as_bos: "
                    f"true, but dict not found: {dict_path}"
                )
            tgt_dict = Dictionary.load(dict_path.as_posix())

            # target language for inference
            if args.infer_target_lang != "":
                tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format(
                    args.infer_target_lang
                )
                infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag)
                assert infer_tgt_lang_id != tgt_dict.unk()
        else:
            assert args.target_code_size is not None

            tgt_dict = Dictionary()
            for i in range(args.target_code_size):
                tgt_dict.add_symbol(str(i))
        logger.info(f"dictionary size: {len(tgt_dict):,}")

    if getattr(args, "train_subset", None) is not None:
        if not all(s.startswith("train") for s in args.train_subset.split(",")):
            raise ValueError('Train splits should be named like "train*".')

    assert args.n_frames_per_step >= 1
    assert (
        not args.eval_inference
        or (args.target_is_code and args.vocoder == "code_hifigan")
        or (not args.target_is_code and args.vocoder != "code_hifigan")
    )

    return cls(args, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id)
def setup_task(cls, args, **kwargs):
    tgt_dict = None
    if args.target_is_code:
        assert args.target_code_size is not None

        tgt_dict = Dictionary()
        for i in range(args.target_code_size):
            tgt_dict.add_symbol(str(i))
        logger.info(f"dictionary size: {len(tgt_dict):,}")

    if getattr(args, "train_subset", None) is not None:
        if not all(s.startswith("train") for s in args.train_subset.split(",")):
            raise ValueError('Train splits should be named like "train*".')

    assert args.n_frames_per_step >= 1
    assert (
        not args.eval_inference
        or (args.target_is_code and args.vocoder == "code_hifigan")
        or (not args.target_is_code and args.vocoder != "code_hifigan")
    )

    return cls(args, tgt_dict)
def _get_test_data_with_bpe_end_marker(self, append_eos=True):
    """
    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocabs: BPE vocab with end-of-word markers as suffixes to denote
            tokens at the end of a word. This is an alternative to fairseq's
            standard preprocessing framework and is not generally supported
            within fairseq.
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is true
        src_lengths: and source lengths.
    """
    vocab = Dictionary()
    vocab.add_symbol("he")
    vocab.add_symbol("llo_EOW")
    vocab.add_symbol("how_EOW")
    vocab.add_symbol("are_EOW")
    vocab.add_symbol("y")
    vocab.add_symbol("ou_EOW")
    vocab.add_symbol("n")
    vocab.add_symbol("ew_EOW")
    vocab.add_symbol("or")
    vocab.add_symbol("k_EOW")
    src_tokens = [
        ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
        ["how_EOW", "are_EOW", "y", "ou_EOW"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def _get_test_data_with_bpe_cont_marker(self, append_eos=True):
    """
    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocabs: BPE vocab with continuation markers as suffixes to denote
            non-end of word tokens. This is the standard BPE format used in
            fairseq's preprocessing.
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is true
        src_lengths: and source lengths.
    """
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")
    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def __init__(self, root, split='train', samples_filter=None, vocabs_from=None,
             parse_programs=True, limit=None, programs_mapping=None):
    questions_path = f'{root}/questions/CLEVR_{split}_questions.json'
    scenes_path = f'{root}/scenes/CLEVR_{split}_scenes.json'

    scenes_data = None
    with open(scenes_path) as data:
        scenes_data = json.load(data)['scenes']

    questions_file = os.path.join(root, questions_path)
    samples = None
    with open(questions_file) as data:
        samples = json.load(data)['questions']

    if vocabs_from is not None:
        programs_vocab = vocabs_from.programs_vocab
    else:
        programs_vocab = Dictionary()

    final_samples = []
    for sample in samples:
        sample['prompt'] = sample['question']
        sample['target'] = sample['answer']
        img_idx = sample['image_index']
        scene = scenes_data[img_idx]
        sample['image_path'] = os.path.join('images', scene['image_filename'])
        sample['viz_rep'] = self.scene_to_canonical_rep(scene)
        sample['scene'] = scene
        if parse_programs:
            if programs_mapping is None:
                prog_str = CLEVR.build_prog_str(sample['program'])
                sample['program_str'] = prog_str
                program_tokens = []
                for token in CLEVR.tokenize_program(prog_str):
                    program_tokens.append(programs_vocab.add_symbol(token))
                sample['program_tokens'] = torch.tensor(program_tokens)
            else:
                program_str, program_tokens = programs_mapping.get(
                    sample['question_family_index'], ('NONE', []))
                sample['program_str'] = program_str
                sample['program_tokens'] = torch.tensor(program_tokens)
        if samples_filter is None or samples_filter(sample):
            final_samples.append(sample)

    self.programs_vocab = programs_vocab

    img_transform = tv.transforms.Compose([
        tv.transforms.Pad((0, 150), fill=300, padding_mode='constant'),
        tv.transforms.Resize(224),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    super().__init__(os.path.join(root, split), final_samples, img_transform,
                     vocabs_from=vocabs_from, prompt_mode='natural',
                     target_mode='natural', limit=limit)
def _get_test_data(self):
    vocab = Dictionary()
    vocab.add_symbol("he@@")
    vocab.add_symbol("llo")
    vocab.add_symbol("how")
    vocab.add_symbol("are")
    vocab.add_symbol("y@@")
    vocab.add_symbol("ou")
    vocab.add_symbol("n@@")
    vocab.add_symbol("ew")
    vocab.add_symbol("or@@")
    vocab.add_symbol("k")

    src_tokens = [
        ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
        ["how", "are", "y@@", "ou"],
    ]
    src_len = [len(x) for x in src_tokens]
    x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor([i + 1 for i in src_len])
def _get_test_data(self, append_eos=True, bpe=True):
    vocab = Dictionary()
    if bpe:
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")
        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
    else:
        vocab.add_symbol("hello")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("you")
        vocab.add_symbol("new")
        vocab.add_symbol("york")
        src_tokens = [
            ["hello", "new", "york", "you"],
            ["how", "are", "you", "new", "york"],
        ]

    src_len = [len(x) for x in src_tokens]
    # If we have to append EOS, we include EOS in counting src length
    if append_eos:
        src_len = [length + 1 for length in src_len]

    x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
    for i in range(len(src_tokens)):
        for j in range(len(src_tokens[i])):
            x[i][j] = vocab.index(src_tokens[i][j])
        if append_eos:
            x[i][j + 1] = vocab.eos()

    x = x.transpose(1, 0)
    return vocab, x, torch.LongTensor(src_len)
def build_dict(symbol_list):
    d = Dictionary()
    for symbol in symbol_list:
        d.add_symbol(symbol)
    return d
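# A minimal usage sketch for build_dict, assuming fairseq's Dictionary: the 4
# built-in special symbols come first, re-adding an existing symbol only bumps
# its count, and looking up an unknown symbol falls back to the unk index.
d = build_dict(["foo", "bar", "foo"])
assert len(d) == d.nspecial + 2           # "foo" is stored only once
assert d.index("bar") == d.nspecial + 1   # user symbols follow the specials
assert d.index("missing") == d.unk()      # unknown symbols map to <unk>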