Exemplo n.º 1
0
def load_vocab_from_file(
        file_path: str,
        min_freq: int = 1,
        num_cpus: int = 4) -> Vocab:
    r"""Build a `Vocab` object from a plain-text token file.

    The file located at `file_path` is expected to list one token per line:

        token1
        token2
        ...
        token_n

    Args:
        file_path: Path of the text file to read the tokens from.
        min_freq: The minimum frequency a token needs in order to be included in the vocabulary.
        num_cpus: The number of cpus to use when loading the file.

    Returns:
        torchtext.vocab.Vocab: a `Vocab` wrapping the object returned by `_load_vocab_from_file`.

    Examples:
        >>> from torchtext.vocab import load_vocab_from_file
        >>> v = load_vocab_from_file('vocab.txt')
    """
    # All loading work is delegated to `_load_vocab_from_file`; this function
    # only wraps its result in the public `Vocab` type.
    return Vocab(_load_vocab_from_file(file_path, min_freq, num_cpus))
Exemplo n.º 2
0
def vocab_from_file_object(
        file_like_object,
        min_freq=1,
        unk_token='<unk>',
        num_cpus=1):
    r"""Build a `Vocab` object from an already opened file.

    Only the `name` attribute of `file_like_object` is used here: it is handed to
    `_load_vocab_from_file` as the path to load, so the object must expose `name`.
    The file should contain tokens separated by new lines. Note that the vocab will be
    created in the order that the tokens first appear in the file (and not by the
    frequency of tokens).

    Format for txt file:

        token1
        token2
        ...
        token_n

    Args:
        file_like_object (FileObject): a file like object whose `name` identifies the file to read.
        min_freq: The minimum frequency needed to include a token in the vocabulary.
            Values less than 1 will be set to 1. Default: 1.
        unk_token: The default unknown token to use. Default: '<unk>'.
        num_cpus (int): the number of cpus to use when loading the file. Default: 1.

    Returns:
        Vocab: a `Vocab` wrapping the object returned by `_load_vocab_from_file`.

    Examples:
        >>> from torchtext.experimental.vocab import vocab_from_file_object
        >>> with open('vocab.txt', 'r') as f:
        ...     v = vocab_from_file_object(f)
    """
    # The loader works on a path, not on the open handle itself.
    source_path = file_like_object.name
    return Vocab(_load_vocab_from_file(source_path, unk_token, min_freq, num_cpus))
Exemplo n.º 3
0
def load_vocab_from_file(
        file_path,
        min_freq=1,
        unk_token='<unk>',
        num_cpus=4):
    r"""Build a `Vocab` object from a plain-text token file.

    The file located at `file_path` is expected to list one token per line:

        token1
        token2
        ...
        token_n

    Args:
        file_path: Path of the text file to read the tokens from.
        min_freq: The minimum frequency needed to include a token in the vocabulary.
            Values less than 1 will be set to 1. Default: 1.
        unk_token: The default unknown token to use. Default: '<unk>'.
        num_cpus (int): the number of cpus to use when loading the file. Default: 4.

    Returns:
        torchtext.experimental.vocab.Vocab: a `Vocab` wrapping the object returned by
        `_load_vocab_from_file`.

    Examples:
        >>> from torchtext.experimental.vocab import load_vocab_from_file
        >>> v = load_vocab_from_file('vocab.txt')
    """
    # Note the loader's argument order: the unknown token comes before `min_freq`.
    return Vocab(_load_vocab_from_file(file_path, unk_token, min_freq, num_cpus))