def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64),
                                                  (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents),
                                 max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None
 def create_lang_dictionary(cls, langs):
     unk = "<unk>"
     # hack to remove symbols other than unk as they are not needed by lang dict
     lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk)
     for lang in langs:
         lang_dict.add_symbol(lang)
     return lang_dict
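
A minimal illustration of the trick above (the language codes are placeholders and the fairseq Dictionary import is assumed): because pad/eos/unk/bos all point at the same "<unk>" string, the dictionary keeps a single special symbol and each language gets its own index.

from fairseq.data import Dictionary

unk = "<unk>"
lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk)  # all specials collapse to one entry
for lang in ["en", "de", "fr"]:
    lang_dict.add_symbol(lang)
assert len(lang_dict) == 1 + 3          # one shared special symbol plus one index per language
assert lang_dict.index("de") != lang_dict.unk()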
Example #3
 def setup_task(cls, args, **kwargs):
     """Setup the task. """
     dictionary = Dictionary()
     for i in range(args.dict_size):
         dictionary.add_symbol("word{}".format(i))
     logger.info("dictionary: {} types".format(len(dictionary)))
     return cls(args, dictionary)
Example #4
def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
    d = Dictionary()
    for s in data:
        for token in s:
            d.add_symbol(token)
    d.finalize()
    return d
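
A hypothetical call with a toy corpus (the token lists are made up): finalize() sorts the non-special symbols by frequency and, by default, pads the vocabulary size to a multiple of 8.

vocab = build_vocab([["the", "cat"], ["the", "dog"]])
assert vocab.index("the") != vocab.unk()   # "the" was added and survives finalize()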
Example #5
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
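
A quick sanity check, assuming the stock fairseq Dictionary with four special symbols (<s>, <pad>, </s>, <unk>):

d = dummy_dictionary(100)
assert len(d) == 100 + d.nspecial       # padding_factor=1 keeps the size exact
assert d.index("token_0") != d.unk()    # every dummy token is a known symbol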
Example #6
File: utils.py Project: fyabc/fairseq
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
Example #7
    def test_huffman_compresses(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        with TemporaryDirectory() as dirname:
            prefix = os.path.join(dirname, "huffman")
            build_dataset(prefix, data, coder)

            prefix_mmap = os.path.join(dirname, "mmap")
            mmap_builder = indexed_dataset.make_builder(
                indexed_dataset.data_file_path(prefix_mmap),
                "mmap",
                vocab_size=len(POPULATION),
            )
            dictionary = Dictionary()
            for c in POPULATION:
                dictionary.add_symbol(c)
            dictionary.finalize()
            for sentence in data:
                mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
            mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))

            huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
            mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
            self.assertLess(huff_size, mmap_size)
Example #8
    def setup_task(cls, args, **kwargs):
        """Setup the task. """
        dictionary = Dictionary()
        for i in range(args.dict_size):
            dictionary.add_symbol('word{}'.format(i))
        print('| dictionary: {} types'.format(len(dictionary)))

        return cls(args, dictionary)
Example #9
def build_word_dict(word_embed_path):
    word_dict = Dictionary()
    with open(word_embed_path, 'r') as f:
        for line in f:
            word = line.split(' ', 1)[0]
            word_dict.add_symbol(word)
    word_dict.finalize(padding_factor=1)
    return word_dict
Example #10
def pad_dict(d: Dictionary,
             num_extra_symbols: int,
             padding_factor: int = 8) -> None:
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        symbol = f"madeupword{i:04d}"
        d.add_symbol(symbol, n=0)
        i += 1
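
An illustrative call (the numbers are arbitrary): reserve room for two extra symbols and fill with zero-count placeholders until the combined size is divisible by 8.

from fairseq.data import Dictionary

d = Dictionary()                        # starts with 4 special symbols
pad_dict(d, num_extra_symbols=2, padding_factor=8)
assert (len(d) + 2) % 8 == 0            # here len(d) ends up as 6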
Example #11
    def _t2c_to_tsr(self, t2c: Dict[str, List[str]],
                    dict: Dictionary) -> Dict[int, torch.LongTensor]:

        res_dict = {}
        for k in t2c.keys():
            res_dict[dict.add_symbol(k)] = torch.LongTensor(
                [dict.add_symbol(v) for v in t2c[k]])
        return res_dict
Example #12
class DummyMaskedLMTask(FairseqTask):
    def __init__(self, cfg: DummyMaskedLMConfig):
        super().__init__(cfg)

        self.dictionary = Dictionary()
        for i in range(cfg.dict_size):
            self.dictionary.add_symbol("word{}".format(i))
        logger.info("dictionary: {} types".format(len(self.dictionary)))
        # add mask token
        self.mask_idx = self.dictionary.add_symbol("<mask>")
        self.dictionary.pad_to_multiple_(8)  # often faster if divisible by 8

        mask_idx = 0
        pad_idx = 1
        seq = torch.arange(cfg.tokens_per_sample) + pad_idx + 1
        mask = torch.arange(2, cfg.tokens_per_sample, 7)  # ~15%
        src = seq.clone()
        src[mask] = mask_idx
        tgt = torch.full_like(seq, pad_idx)
        tgt[mask] = seq[mask]

        self.dummy_src = src
        self.dummy_tgt = tgt

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if self.cfg.batch_size is not None:
            bsz = self.cfg.batch_size
        else:
            bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample)
        self.datasets[split] = DummyDataset(
            {
                "id": 1,
                "net_input": {
                    "src_tokens":
                    torch.stack([self.dummy_src for _ in range(bsz)]),
                    "src_lengths":
                    torch.full(
                        (bsz, ), self.cfg.tokens_per_sample, dtype=torch.long),
                },
                "target": torch.stack([self.dummy_tgt for _ in range(bsz)]),
                "nsentences": bsz,
                "ntokens": bsz * self.cfg.tokens_per_sample,
            },
            num_items=self.cfg.dataset_size,
            item_size=self.cfg.tokens_per_sample,
        )

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary
Example #13
def build_sememe_dict(datapath):
    sememe_dict = Dictionary()
    with open(os.path.join(datapath, 'HowNet.edge'), 'r') as f:
        for line in f:
            sememes = line.strip().split('\t')[1]
            for s in sememes.split():
                sememe_dict.add_symbol(s)
    sememe_dict.finalize(threshold=5, padding_factor=1)
    return sememe_dict
Example #14
 def to_dictionary(self) -> Dictionary:
     dictionary = Dictionary(bos=self.bos,
                             unk=self.unk,
                             pad=self.pad,
                             eos=self.eos)
     for n in self:
         dictionary.add_symbol(n.symbol, n=n.count)
     dictionary.finalize()
     return dictionary
Example #15
def label_schema_as_dictionary(label_schema):
    label_dict = Dictionary()

    labels = list(label_schema.labels)
    assert len(labels) == len(set(labels))

    for label in labels:
        label_dict.add_symbol(label)

    return label_dict
Example #16
    def setup_task(cls, args, **kwargs):
        """Setup the task. """
        dictionary = Dictionary()
        for i in range(args.dict_size):
            dictionary.add_symbol('word{}'.format(i))
        logger.info('dictionary: {} types'.format(len(dictionary)))

        args.max_source_positions = args.src_len + dictionary.pad() + 2
        args.max_target_positions = args.tgt_len + dictionary.pad() + 2

        return cls(args, dictionary)
Example #17
 def get_bnids_dictionary(cls) -> Dictionary:
     if cls._bnids_dictionary is None:
         src_dictionary = cls.get_offsets_dictionary()
         tgt_dictionary = Dictionary()
         string_map = cls.get_offset_to_bnids_map()
         for idx, wn in enumerate(src_dictionary.symbols):
             if wn.startswith('wn:'):
                 tgt_dictionary.add_symbol(string_map[wn])
         tgt_dictionary.finalize()
         cls._bnids_dictionary = tgt_dictionary
     return cls._bnids_dictionary
Example #18
def write_dictionary(model_dir, lst):
    '''Write out dictionary in fairseq format.'''
    joined_dict = Dictionary()
    for toks in lst:
        for t in toks:
            joined_dict.add_symbol(t)

    print('| dictionary: {} types'.format(len(joined_dict)))

    with open(model_dir + '/dict.txt', 'w') as fd:
        joined_dict.save(fd)
Example #19
    def _get_test_data_with_word_vocab(self, append_eos=True):
        """
        Args:
            append_eos: if True, each input sentence in the source tokens tensor
                will have an EOS appended to the end.

        Returns:
            vocabs: word vocab
            x: input tensor containing numberized source tokens, with EOS at the
                end if append_eos is true
            src_lengths: source lengths.
        """
        vocab = Dictionary()

        vocab.add_symbol("hello")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("you")
        vocab.add_symbol("new")
        vocab.add_symbol("york")
        src_tokens = [
            ["hello", "new", "york", "you"],
            ["how", "are", "you", "new", "york"],
        ]
        x, src_lengths = self._convert_src_tokens_to_tensor(
            vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
        )
        return vocab, x, src_lengths
Example #20
 def load_target_dictionary(self):
     if self.cfg.labels:
         dict = Dictionary(bos=self.bos,
                           pad=self.pad,
                           eos=self.eos,
                           unk=self.unk,
                           from_tokenizer=True)
         dict.bos_index = self.tokenizer.encoder[self.bos]
         dict.pad_index = self.tokenizer.encoder[self.pad]
         dict.eos_index = self.tokenizer.encoder[self.eos]
         dict.unk_index = self.tokenizer.encoder[self.unk]
         for symbol in self.tokenizer.encoder.keys():
             dict.add_symbol(symbol)
         return dict
     return None
Example #21
def augment_dictionary(
    dictionary: Dictionary,
    language_list: List[str],
    lang_tok_style: str,
    langtoks_specs: Sequence[str] = (LangTokSpec.main.value,),
    extra_data: Optional[Dict[str, str]] = None,
) -> None:
    for spec in langtoks_specs:
        for language in language_list:
            dictionary.add_symbol(
                get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec)
            )

    if lang_tok_style == LangTokStyle.mbart.value or (
        extra_data is not None and LangTokSpec.mono_dae.value in extra_data
    ):
        dictionary.add_symbol("<mask>")
Example #22
def ensure_symbols_are_present(dictionary: Dictionary, symbols: List[str],
                               ok_to_increase_dict_size: bool) -> None:
    """
    Ensure that the symbols in the source and target dictionary are present.

    Makes changes to the dictionaries in-place.
    """
    original_size = len(dictionary)
    _ = remove_madeupwords_from_dictionary(dictionary)
    for symbol in symbols:
        dictionary.add_symbol(symbol)
    dictionary.pad_to_multiple_(8)
    if not ok_to_increase_dict_size:
        # Let's not crash - but rather point out that we are not allowed to increase the dictionary size.
        if len(dictionary) != original_size:
            logger.warning(
                "The dictionary size changed. The model loading will probably fail."
            )
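
A minimal sketch of the intended use, assuming remove_madeupwords_from_dictionary (referenced above) lives in the same module: add "<mask>" to an already padded dictionary without growing it, so a pretrained embedding matrix keeps its shape.

from fairseq.data import Dictionary

d = Dictionary()                                 # 4 special symbols
d.pad_to_multiple_(8)                            # simulate a dictionary saved with padding
ensure_symbols_are_present(d, ["<mask>"], ok_to_increase_dict_size=False)
assert len(d) == 8                               # a placeholder slot was reused for "<mask>"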
Example #23
    def load_label_dictionary(cls, args, filename, **kwargs):
        """Load the dictionary from the filename
        Args:
            filename (str): the filename
        """
        label_schema = parse_label_schema(filename)
        label_dict = Dictionary()

        labels = list(label_schema.labels)
        assert labels[0] == "NULL", "Expected label at index 0 to be 'NULL'"
        assert len(labels) == len(set(labels))

        for label in labels:
            label_dict.add_symbol(label)

        assert (
            label_dict.symbols[label_dict.nspecial] == "NULL"
        ), "Expected first nonspecial token to be 'NULL'"
        return label_dict, label_schema
Example #24
    def setup_task(cls, args, **kwargs):
        data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml)
        tgt_dict = None
        infer_tgt_lang_id = None
        if args.target_is_code:
            if data_cfg.prepend_tgt_lang_tag_as_bos:
                # dictionary with language tags
                dict_path = Path(args.data) / data_cfg.vocab_filename
                if not dict_path.is_file():
                    raise FileNotFoundError(
                        f"Dict has to be provided when setting prepend_tgt_lang_tag_as_bos: true, but dict not found: {dict_path}"
                    )
                tgt_dict = Dictionary.load(dict_path.as_posix())

                # target language for inference
                if args.infer_target_lang != "":
                    tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format(
                        args.infer_target_lang)
                    infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag)
                    assert infer_tgt_lang_id != tgt_dict.unk()
            else:
                assert args.target_code_size is not None

                tgt_dict = Dictionary()
                for i in range(args.target_code_size):
                    tgt_dict.add_symbol(str(i))
            logger.info(f"dictionary size: " f"{len(tgt_dict):,}")

        if getattr(args, "train_subset", None) is not None:
            if not all(
                    s.startswith("train")
                    for s in args.train_subset.split(",")):
                raise ValueError('Train splits should be named like "train*".')

        assert args.n_frames_per_step >= 1
        assert (not args.eval_inference
                or (args.target_is_code and args.vocoder == "code_hifigan") or
                (not args.target_is_code and args.vocoder != "code_hifigan"))

        return cls(args, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id)
Example #25
    def setup_task(cls, args, **kwargs):
        tgt_dict = None
        if args.target_is_code:
            assert args.target_code_size is not None

            tgt_dict = Dictionary()
            for i in range(args.target_code_size):
                tgt_dict.add_symbol(str(i))
            logger.info(f"dictionary size: " f"{len(tgt_dict):,}")

        if getattr(args, "train_subset", None) is not None:
            if not all(
                    s.startswith("train")
                    for s in args.train_subset.split(",")):
                raise ValueError('Train splits should be named like "train*".')

        assert args.n_frames_per_step >= 1
        assert (not args.eval_inference
                or (args.target_is_code and args.vocoder == "code_hifigan") or
                (not args.target_is_code and args.vocoder != "code_hifigan"))

        return cls(args, tgt_dict)
Example #26
    def _get_test_data_with_bpe_end_marker(self, append_eos=True):
        """
        Args:
            append_eos: if True, each input sentence in the source tokens tensor
                will have an EOS appended to the end.

        Returns:
            vocabs: BPE vocab with end-of-word markers as suffixes to denote
                tokens at the end of a word. This is an alternative to fairseq's
                standard preprocessing framework and is not generally supported
                within fairseq.
            x: input tensor containing numberized source tokens, with EOS at the
                end if append_eos is true
            src_lengths: source lengths.
        """
        vocab = Dictionary()
        vocab.add_symbol("he")
        vocab.add_symbol("llo_EOW")
        vocab.add_symbol("how_EOW")
        vocab.add_symbol("are_EOW")
        vocab.add_symbol("y")
        vocab.add_symbol("ou_EOW")
        vocab.add_symbol("n")
        vocab.add_symbol("ew_EOW")
        vocab.add_symbol("or")
        vocab.add_symbol("k_EOW")

        src_tokens = [
            ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
            ["how_EOW", "are_EOW", "y", "ou_EOW"],
        ]
        x, src_lengths = self._convert_src_tokens_to_tensor(
            vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
        )
        return vocab, x, src_lengths
Example #27
    def _get_test_data_with_bpe_cont_marker(self, append_eos=True):
        """
        Args:
            append_eos: if True, each input sentence in the source tokens tensor
                will have an EOS appended to the end.

        Returns:
            vocabs: BPE vocab with continuation markers as suffixes to denote
                non-end of word tokens. This is the standard BPE format used in
                fairseq's preprocessing.
            x: input tensor containing numberized source tokens, with EOS at the
                end if append_eos is true
            src_lengths: source lengths.
        """
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        x, src_lengths = self._convert_src_tokens_to_tensor(
            vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
        )
        return vocab, x, src_lengths
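
A side note on the "@@ " continuation convention used by this vocab: stripping the marker joins the subword pieces back into the original words.

bpe_line = "he@@ llo n@@ ew y@@ or@@ k"
assert bpe_line.replace("@@ ", "") == "hello new york"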
Example #28
    def __init__(self,
                 root,
                 split='train',
                 samples_filter=None,
                 vocabs_from=None,
                 parse_programs=True,
                 limit=None,
                 programs_mapping=None):
        questions_path = f'{root}/questions/CLEVR_{split}_questions.json'
        scenes_path = f'{root}/scenes/CLEVR_{split}_scenes.json'

        scenes_data = None
        with open(scenes_path) as data:
            scenes_data = json.load(data)['scenes']

        # questions_path is already rooted above, so open it directly
        samples = None
        with open(questions_path) as data:
            samples = json.load(data)['questions']

        if vocabs_from is not None:
            programs_vocab = vocabs_from.programs_vocab
        else:
            programs_vocab = Dictionary()

        final_samples = []
        for sample in samples:
            sample['prompt'] = sample['question']
            sample['target'] = sample['answer']
            img_idx = sample['image_index']
            scene = scenes_data[img_idx]
            sample['image_path'] = os.path.join('images',
                                                scene['image_filename'])
            sample['viz_rep'] = self.scene_to_canonical_rep(scene)
            sample['scene'] = scene
            if parse_programs:
                if programs_mapping is None:
                    prog_str = CLEVR.build_prog_str(sample['program'])
                    sample['program_str'] = prog_str
                    program_tokens = []
                    for token in CLEVR.tokenize_program(prog_str):
                        program_tokens.append(programs_vocab.add_symbol(token))
                    sample['program_tokens'] = torch.tensor(program_tokens)
                else:
                    program_str, program_tokens = programs_mapping.get(
                        sample['question_family_index'], ('NONE', []))
                    sample['program_str'] = program_str
                    sample['program_tokens'] = torch.tensor(program_tokens)

            if samples_filter is None or samples_filter(sample):
                final_samples.append(sample)

        self.programs_vocab = programs_vocab
        img_transform = tv.transforms.Compose([
            tv.transforms.Pad((0, 150), fill=300, padding_mode='constant'),
            tv.transforms.Resize(224),
            tv.transforms.ToTensor(),
            tv.transforms.Normalize([0.485, 0.456, 0.406],
                                    [0.229, 0.224, 0.225])
        ])

        super().__init__(os.path.join(root, split),
                         final_samples,
                         img_transform,
                         vocabs_from=vocabs_from,
                         prompt_mode='natural',
                         target_mode='natural',
                         limit=limit)
Example #29
    def _get_test_data(self):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([i + 1 for i in src_len])
Example #30
    def _get_test_data(self, append_eos=True, bpe=True):
        vocab = Dictionary()
        if bpe:
            vocab.add_symbol("he@@")
            vocab.add_symbol("llo")
            vocab.add_symbol("how")
            vocab.add_symbol("are")
            vocab.add_symbol("y@@")
            vocab.add_symbol("ou")
            vocab.add_symbol("n@@")
            vocab.add_symbol("ew")
            vocab.add_symbol("or@@")
            vocab.add_symbol("k")

            src_tokens = [
                ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
                ["how", "are", "y@@", "ou"],
            ]
        else:
            vocab.add_symbol("hello")
            vocab.add_symbol("how")
            vocab.add_symbol("are")
            vocab.add_symbol("you")
            vocab.add_symbol("new")
            vocab.add_symbol("york")
            src_tokens = [
                ["hello", "new", "york", "you"],
                ["how", "are", "you", "new", "york"],
            ]

        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor(src_len)
Example #31
 def build_dict(symbol_list):
     d = Dictionary()
     for symbol in symbol_list:
         d.add_symbol(symbol)
     return d