Exemplo n.º 1
0
    def build_dictionary(cls,
                         filenames,
                         tokenize_func=None,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()

        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename,
                                              d,
                                              tokenize_func,
                                              num_workers=workers)

        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
Exemplo n.º 2
0
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8,
        tokenize_func=SPACE_SPLITTER,
        **kwargs,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary(
            pad=kwargs.get('pad', constants.PAD),
            bos=kwargs.get('bos', constants.BOS),
            eos=kwargs.get('eos', constants.EOS),
            unk=kwargs.get('unk', constants.UNK),
            extra_special_symbols=kwargs.get('extra_special_symbols', None),
        )
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenize_func, d.eos_word, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d