Example no. 1
0
 def test_ngrams_func(self):
     """Check that ngrams_func(n) returns unigrams followed by every 2..n-gram."""
     tokens = ['A', 'string', 'particularly', 'one', 'with', 'slightly']
     bigrams = ['A string', 'string particularly', 'particularly one',
                'one with', 'with slightly']
     trigrams = ['A string particularly', 'string particularly one',
                 'particularly one with', 'one with slightly']
     # Each higher n keeps the lower-order n-grams and appends the new ones.
     expected = {
         1: tokens,
         2: tokens + bigrams,
         3: tokens + bigrams + trigrams,
     }
     for n, want in expected.items():
         assert ngrams_func(n)(tokens) == want
Example no. 2
0
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    """Build TextClassificationDataset objects for the requested splits.

    Args:
        dataset_name: key into raw.DATASETS (e.g. 'IMDB').
        root: directory where the raw dataset files are cached.
        ngrams: n-gram order passed to ngrams_func for the text transform.
        vocab: pre-built vocabulary, or None to build one from the train split.
        tokenizer: callable tokenizer, or None to use the 'basic_english' default.
        split_: requested split name(s); validated against ('train', 'test').

    Returns:
        The per-split TextClassificationDataset tuple wrapped by wrap_datasets.

    Raises:
        TypeError: if vocab is None and 'train' is not among the splits.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset so each split can be indexed later.
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }

    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))
    # Extend the transform: tokens -> vocab ids -> torch.long tensor.
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    # IMDB labels arrive as 'pos'/'neg' strings; map them to 1/0 first.
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                              text_transform))
            for item in split), split_)
Example no. 3
0
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select):
    """Build TextClassificationDataset objects for the selected splits.

    Args:
        dataset_name: key into raw.DATASETS (e.g. 'IMDB').
        root: directory where the raw dataset files are cached.
        ngrams: n-gram order passed to ngrams_func for the text transform.
        vocab: pre-built vocabulary, or None to build one from the train split.
        tokenizer: callable tokenizer, or None to use the 'basic_english' default.
        data_select: requested split name(s); validated against ('train', 'test').

    Returns:
        A tuple of TextClassificationDataset, one per entry in data_select.

    Raises:
        TypeError: if vocab is None and 'train' is not among the splits.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    data_select = check_default_set(data_select, ('train', 'test'))
    raw_datasets = raw.DATASETS[dataset_name](root=root,
                                              data_select=data_select)
    # Materialize raw text iterable dataset so each split can be indexed later.
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(data_select, raw_datasets)
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    # Extend the transform: tokens -> vocab ids -> torch.long tensor.
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    # IMDB labels arrive as 'pos'/'neg' strings; map them to 1/0 first.
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                          text_transform))
        for item in data_select)
Example no. 4
0
def _setup_datasets(
        dataset_name,
        root=".data",
        ngrams=1,
        vocab=None,
        tokenizer=None,
        data_select=("train", "test"),
):
    """Build TextClassificationDataset objects for the selected splits.

    Args:
        dataset_name: key into raw.DATASETS (e.g. 'IMDB').
        root: directory where the raw dataset files are cached.
        ngrams: n-gram order passed to ngrams_func for the text transform.
        vocab: pre-built vocabulary, or None to build one from the train split.
        tokenizer: callable tokenizer, or None to use the 'basic_english' default.
        data_select: split name(s) to return; each must be 'train' or 'test'.

    Returns:
        A tuple of TextClassificationDataset, one per entry in data_select.

    Raises:
        TypeError: if data_select contains an unsupported split, or if vocab
            is None and 'train' is not among the selected splits.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(("train", "test"))):
        raise TypeError(
            "Given data selection {} is not supported!".format(data_select))
    train, test = raw.DATASETS[dataset_name](root=root)
    # Cache raw text iterable dataset; both splits are materialized because
    # 'train' may be needed for vocab building even when only 'test' is selected.
    raw_data = {
        "train": list(train),
        "test": list(test),
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)
    # Extend the transform: tokens -> vocab ids -> torch.long tensor.
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    # IMDB labels arrive as 'pos'/'neg' strings; map them to 1/0 first.
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    return tuple(
        TextClassificationDataset(raw_data[item], vocab, (label_transform,
                                                          text_transform))
        for item in data_select)