@classmethod
def create_lang_dictionary(cls, langs):
    unk = "<unk>"
    # Map pad/eos/bos all to <unk>: the language-ID dictionary only needs
    # the language symbols themselves, not the usual special tokens.
    lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk)
    for lang in langs:
        lang_dict.add_symbol(lang)
    return lang_dict
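
A hedged usage sketch of the classmethod above (the owning task class and the language list are illustrative, not from the original source):

langs = ["en", "de", "fr"]  # hypothetical language list
lang_dict = MultilingualTask.create_lang_dictionary(langs)  # hypothetical task class
assert lang_dict.index("de") != lang_dict.unk()  # every language received a real index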
Example #2
@classmethod
def setup_task(cls, args, **kwargs):
    """Set up the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol("word{}".format(i))
    logger.info("dictionary: {} types".format(len(dictionary)))
    return cls(args, dictionary)
Example #3
@classmethod
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    d = BertDictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
Example #4
@classmethod
def setup_task(cls, args, **kwargs):
    """Set up the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol("word{}".format(i))
    logger.info("dictionary: {} types".format(len(dictionary)))

    # Reserve headroom beyond the raw sequence length: dummy token values
    # start at dictionary.pad() + 1, plus one extra position for EOS.
    args.max_source_positions = args.src_len + dictionary.pad() + 2
    args.max_target_positions = args.tgt_len + dictionary.pad() + 2

    return cls(args, dictionary)
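
A quick sanity check of the position arithmetic above, assuming fairseq's default pad index of 1 (the values are illustrative):

src_len = 30                                    # hypothetical args.src_len
pad_index = 1                                   # Dictionary().pad() by default
max_source_positions = src_len + pad_index + 2  # -> 33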
Example #5
class DummyLMTask(FairseqTask):
    def __init__(self, cfg: DummyLMConfig):
        super().__init__(cfg)

        # load dictionary
        self.dictionary = Dictionary()
        for i in range(cfg.dict_size):
            self.dictionary.add_symbol("word{}".format(i))
        self.dictionary.pad_to_multiple_(8)  # often faster if divisible by 8
        logger.info("dictionary: {} types".format(len(self.dictionary)))

        # Dummy token values start right after the pad index.
        seq = torch.arange(cfg.tokens_per_sample + 1) + self.dictionary.pad() + 1

        self.dummy_src = seq[:-1]
        self.dummy_tgt = seq[1:]

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if self.cfg.batch_size is not None:
            bsz = self.cfg.batch_size
        else:
            bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample)
        self.datasets[split] = DummyDataset(
            {
                "id": 1,
                "net_input": {
                    "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]),
                    "src_lengths": torch.full(
                        (bsz,), self.cfg.tokens_per_sample, dtype=torch.long
                    ),
                },
                "target": torch.stack([self.dummy_tgt for _ in range(bsz)]),
                "nsentences": bsz,
                "ntokens": bsz * self.cfg.tokens_per_sample,
            },
            num_items=self.cfg.dataset_size,
            item_size=self.cfg.tokens_per_sample,
        )

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary
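
A standalone sketch of the src/tgt construction used in __init__ above (only torch is required): the target is the source shifted left by one token, the usual language-modeling setup. The pad index and sample length are assumed values.

import torch

pad_index = 1                # fairseq Dictionary's default pad index, assumed here
tokens_per_sample = 4        # hypothetical cfg.tokens_per_sample
seq = torch.arange(tokens_per_sample + 1) + pad_index + 1  # tensor([2, 3, 4, 5, 6])
dummy_src = seq[:-1]         # tensor([2, 3, 4, 5])
dummy_tgt = seq[1:]          # tensor([3, 4, 5, 6])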
Example #6
@classmethod
def load_dictionary(cls, filename):
    """Load the dictionary from the filename.

    Args:
        filename (str): the filename
    """
    return Dictionary.load(filename)
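
A hedged usage sketch for Dictionary.load: fairseq's dict.txt format is one "<symbol> <count>" pair per line; the special symbols (<s>, <pad>, </s>, <unk>) are added by Dictionary itself rather than listed in the file. The file contents here are illustrative.

from fairseq.data import Dictionary

with open("dict.txt", "w") as f:  # hypothetical two-word vocabulary
    f.write("hello 42\nworld 17\n")

d = Dictionary.load("dict.txt")
print(d.index("hello"))  # 4, right after the default special symbols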
Example #7
@classmethod
def setup_task(cls, args, **kwargs):
    """Set up the task."""
    dictionary = Dictionary.load(os.path.join(args.data, "dict.txt"))
    logger.info("dictionary: {} types".format(len(dictionary)))
    if not hasattr(args, "shuffle_instance"):
        args.shuffle_instance = False
    return cls(args, dictionary)
Example #8
def augment_dictionary(
    dictionary: Dictionary,
    language_list: List[str],
    lang_tok_style: str,
    langtoks_specs: Sequence[str] = (LangTokSpec.main.value,),
    extra_data: Optional[Dict[str, str]] = None,
) -> None:
    # Add one language token per (spec, language) combination.
    for spec in langtoks_specs:
        for language in language_list:
            dictionary.add_symbol(
                get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec)
            )

    # mBART-style vocabularies, and any monolingual DAE data, also need <mask>.
    if lang_tok_style == LangTokStyle.mbart.value or (
        extra_data is not None and LangTokSpec.mono_dae.value in extra_data
    ):
        dictionary.add_symbol("<mask>")
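
A hedged usage sketch (the import path matches recent fairseq, where these helpers live in fairseq.data.multilingual.multilingual_utils; adjust for your version):

from fairseq.data import Dictionary
from fairseq.data.multilingual.multilingual_utils import LangTokStyle

d = Dictionary()
augment_dictionary(
    d,
    language_list=["en", "de"],  # hypothetical language list
    lang_tok_style=LangTokStyle.multilingual.value,
)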
Example #9
@classmethod
def load_dictionary(cls, args, filename, source=True):
    """Load the dictionary from the filename and add a <mask> symbol.

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary.load(filename)
    dictionary.add_symbol("<mask>")
    return dictionary
Example #10
@classmethod
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary.

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia
            Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
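
A hedged usage sketch (the task class and filenames are hypothetical): build a dictionary from raw text and persist it with Dictionary.save:

d = MyTranslationTask.build_dictionary(
    ["corpus.en.txt", "corpus.de.txt"],  # hypothetical training text files
    workers=4,
    threshold=2,       # drop words seen fewer than 2 times
    padding_factor=8,  # round the vocabulary size up to a multiple of 8
)
d.save("dict.txt")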
Example #11
@classmethod
def setup_dictionary(cls, args, **kwargs):
    dictionary = None
    output_dictionary = None
    if args.data:
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(dictionary)))
        output_dictionary = dictionary
        if args.output_dictionary_size >= 0:
            # Optionally restrict the output (softmax) vocabulary to the
            # first `output_dictionary_size` entries.
            output_dictionary = TruncatedDictionary(
                dictionary, args.output_dictionary_size
            )
    return (dictionary, output_dictionary)
Example #12
@classmethod
def setup_task(cls, args, **kwargs):
    data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml))
    dict_path = op.join(args.data, data_cfg.vocab_filename)
    if not op.isfile(dict_path):
        raise FileNotFoundError(f"Dict not found: {dict_path}")
    tgt_dict = Dictionary.load(dict_path)
    logger.info(
        f"dictionary size ({data_cfg.vocab_filename}): {len(tgt_dict):,}"
    )

    if getattr(args, "train_subset", None) is not None:
        if not all(s.startswith("train") for s in args.train_subset.split(",")):
            raise ValueError('Train splits should be named like "train*".')
    return cls(args, tgt_dict)
Example #13
def load_target_dictionary(self):
    if self.cfg.labels:
        dict_path = os.path.join(self.cfg.data, f"dict.{self.cfg.labels}.txt")
        return Dictionary.load(dict_path)
    return None
Example #14
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
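
A minimal sketch of how _lang_id is typically used (the "__en__" symbol format is illustrative):

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("__en__")        # language-ID symbol registered at setup time
print(_lang_id(d, "__en__"))  # its index; the assert fires if it is missing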
Example #15
def padding_idx(self):
    # Fall back to the default Dictionary pad index when no vocab is set.
    return Dictionary().pad() if self.vocab is None else self.vocab.pad()
Example #16
@classmethod
def setup_task(cls, args, **kwargs):
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
    logger.info("dictionary: {} types".format(len(dictionary)))
    return cls(args, dictionary)
Example #17
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx