def IMDB(root='.data', split=('train', 'test'), offset=0):
    """ Defines raw IMDB datasets.

    Create supervised learning dataset: IMDB

    Separately returns the raw training and test dataset

    Args:
        root: Directory where the datasets are saved. Default: ".data"
        split: a string or tuple for the returned datasets.
            Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users
            could also choose any one or two of them, for example
            ('train', 'test') or just a string 'train'.
        offset: the number of the starting line. Default: 0

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    split_ = check_default_set(split, ('train', 'test'), 'IMDB')
    dataset_tar = download_from_url(URLS['IMDB'], root=root,
                                    hash_value=MD5['IMDB'], hash_type='md5')
    extracted_files = extract_archive(dataset_tar)
    return wrap_datasets(
        tuple(
            RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item],
                                   generate_imdb_data(item, extracted_files),
                                   offset=offset) for item in split_), split)
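# Hedged usage sketch (illustration, not module source): select a single split
# with a plain string and read the first raw example. It assumes this function
# is exposed as torchtext.experimental.datasets.raw.IMDB, the download
# succeeds, and raw items are (label, text) pairs.
from torchtext.experimental.datasets import raw

train = raw.IMDB(split='train')    # a string selects one dataset, not a tuple
label, review = next(iter(train))  # assumed order: label first, then text
print(label, review[:80])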
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item], root=root,
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
    # Locate the train/test csv files among the extracted archive members
    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_csv(csv_path[item]),
                                   offset=offset) for item in split), split_)
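# Hedged sketch of how a public entry point might delegate to the helper
# above. The AG_NEWS name mirrors the special case inside the function, but
# this exact wrapper is an assumption, not the library source.
def AG_NEWS(root='.data', split=('train', 'test'), offset=0):
    """Return the raw AG_NEWS train/test datasets via _setup_datasets."""
    return _setup_datasets('AG_NEWS', root, split, offset)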
def _setup_datasets(dataset_name, tokenizer, root, vocab, split_, year, language):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    split = check_default_set(split_, ('train', 'test', 'valid'), dataset_name)
    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        if dataset_name == 'WMTNewsCrawl':
            raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',),
                                                    year=year, language=language)
        else:
            raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',))
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_train, tokenizer)
    logger_.info('Vocab has %d entries', len(vocab))

    def text_transform(line):
        return torch.tensor([vocab[token] for token in tokenizer(line)],
                            dtype=torch.long)

    if dataset_name == 'WMTNewsCrawl':
        raw_datasets = raw.DATASETS[dataset_name](root=root, split=split,
                                                  year=year, language=language)
    else:
        raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {name: list(map(text_transform, raw_dataset))
                for name, raw_dataset in zip(split, raw_datasets)}
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            LanguageModelingDataset(raw_data[item], vocab, text_transform)
            for item in split), split_)
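# Hedged sketch: build the vocab from the train split once, then reuse it to
# load only the test split. 'WikiText2' as a dataset_name handled by this
# helper and get_vocab() as the accessor are both assumptions.
train = _setup_datasets('WikiText2', None, '.data', None, 'train', None, None)
test = _setup_datasets('WikiText2', None, '.data', train.get_vocab(),
                       'test', None, None)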
def _setup_datasets(dataset_name, separator, root, split_, offset):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    extracted_files = []
    if isinstance(URLS[dataset_name], dict):
        for name, item in URLS[dataset_name].items():
            dataset_tar = download_from_url(item, root=root,
                                            hash_value=MD5[dataset_name][name],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} must be a dictionary or a string".format(dataset_name))
    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(
                dataset_name, NUM_LINES[dataset_name][item],
                _create_data_from_iob(data_filenames[item], separator),
                offset=offset) if data_filenames[item] is not None else None
            for item in split), split_)
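# Hedged sketch: splits whose files are missing come back as None rather than
# raising, so callers should guard. 'CoNLL2000Chunking' and the space
# separator are assumptions for illustration.
train, valid, test = _setup_datasets('CoNLL2000Chunking', ' ', '.data',
                                     ('train', 'valid', 'test'), 0)
if valid is None:
    print('no dev.txt shipped with this dataset')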
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset
    raw_data = {name: list(raw_dataset)
                for name, raw_dataset in zip(split, raw_datasets)}
    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(
        text_transform, vocab_func(vocab), totensor(dtype=torch.long)
    )
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(lambda x: 1 if x == 'pos' else 0,
                                                totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(tuple(
        TextClassificationDataset(
            raw_data[item], vocab, (label_transform, text_transform)
        ) for item in split
    ), split_)
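# Hedged sketch: with ngrams=2 the text pipeline becomes tokenizer + ngram
# expansion before vocab lookup, so items come back as (label, text) tensor
# pairs. 'AG_NEWS' being handled here and transforms being applied at
# indexing time are assumptions.
train, test = _setup_datasets('AG_NEWS', '.data', 2, None, None,
                              ('train', 'test'))
label, text = train[0]  # both assumed to be torch.long tensors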
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key], root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5') for key in split
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                                   _create_data_from_json(extracted_files[item]),
                                   offset=offset) for item in split), split_)
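# Hedged sketch: SQuAD-style raw items decode to (context, question, answers,
# answer positions); the 4-tuple shape matches the QA transforms further
# below, while 'SQuAD1' as a dataset_name is an assumption.
train = _setup_datasets('SQuAD1', '.data', 'train', 0)
context, question, answers, ans_pos = next(iter(train))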
def _setup_datasets(dataset_name, root, split_, year, language, offset):
    if dataset_name == 'WMTNewsCrawl':
        split = check_default_set(split_, ('train',), dataset_name)
    else:
        split = check_default_set(split_, ('train', 'test', 'valid'),
                                  dataset_name)
    if dataset_name == 'PennTreebank':
        extracted_files = [
            download_from_url(URLS['PennTreebank'][key], root=root,
                              hash_value=MD5['PennTreebank'][key],
                              hash_type='md5') for key in split
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
    if dataset_name == 'WMTNewsCrawl':
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]
    # Match each requested split to its extracted file path
    path = {}
    for item in split:
        for fname in extracted_files:
            if item in fname:
                path[item] = fname
    datasets = []
    for item in split:
        logging.info('Creating {} data'.format(item))
        datasets.append(
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                                   iter(io.open(path[item], encoding="utf8")),
                                   offset=offset))
    return wrap_datasets(tuple(datasets), split_)
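# Hedged sketch: offset skips the leading lines of a split, e.g. to resume a
# partially consumed file. 'WikiText2' as a dataset_name handled by this
# helper is an assumption.
train = _setup_datasets('WikiText2', '.data', 'train', None, None, 100)
first_kept_line = next(iter(train))  # iteration starts 100 lines in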
def _setup_datasets(dataset_name, root, vocab, tokenizer, split_):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    if vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(_question) \
                    + tok_ans

        logger_.info('Building Vocab based on train data')
        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    logger_.info('Vocab has %d entries', len(vocab))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            QuestionAnswerDataset(raw_data[item], vocab, transforms)
            for item in split), split_)
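# Hedged sketch: every field of a QA example is mapped through the transforms
# dict built above, so accessing the dataset should yield tensors throughout.
# 'SQuAD1' as a dataset_name and map-style indexing are assumptions.
train = _setup_datasets('SQuAD1', '.data', None, None, 'train')
example = train[0]  # fields already converted by the per-field transforms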
def _setup_datasets(dataset_name, root, vocabs, split_):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    raw_iter_tuple = raw.DATASETS[dataset_name](root=root, split=split)
    raw_data = {}
    for name, raw_iter in zip(split, raw_iter_tuple):
        raw_data[name] = list(raw_iter)
    if vocabs is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocabs = build_vocab(raw_data["train"])
    else:
        if not isinstance(vocabs, list):
            raise TypeError("vocabs must be an instance of list")
        # Find data that's not None
        notnone_data = None
        for key in raw_data.keys():
            if raw_data[key] is not None:
                notnone_data = raw_data[key]
                break
        if len(vocabs) != len(notnone_data[0]):
            raise ValueError(
                "Number of vocabs must match the number of columns "
                "in the data")
    transformers = [
        sequential_transforms(vocab_func(vocabs[idx]),
                              totensor(dtype=torch.long))
        for idx in range(len(vocabs))
    ]
    logger_.info('Building datasets for {}'.format(split))
    return wrap_datasets(
        tuple(
            SequenceTaggingDataset(raw_data[item], vocabs, transformers)
            for item in split), split_)
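# Hedged sketch: each column of a tagging example gets its own Vocab, so a
# caller supplying vocabs must pass a list with one entry per column. 'UDPOS'
# as a dataset_name and get_vocabs() as the accessor are assumptions.
train = _setup_datasets('UDPOS', '.data', None, 'train')
test = _setup_datasets('UDPOS', '.data', train.get_vocabs(), 'test')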
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split_, root, vocab, tokenizer):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    src_vocab, tgt_vocab = vocab
    if tokenizer is None:
        src_tokenizer = get_tokenizer("spacy", language='de_core_news_sm')
        tgt_tokenizer = get_tokenizer("spacy", language='en_core_web_sm')
    elif isinstance(tokenizer, tuple):
        if len(tokenizer) == 2:
            src_tokenizer, tgt_tokenizer = tokenizer
        else:
            raise ValueError("tokenizer must have length of two for "
                             "source and target")
    else:
        raise ValueError(
            "tokenizer must be an instance of tuple with length two "
            "or None")
    raw_datasets = raw.DATASETS[dataset_name](train_filenames=train_filenames,
                                              valid_filenames=valid_filenames,
                                              test_filenames=test_filenames,
                                              split=split, root=root)
    raw_data = {
        name: list(raw_dataset)
        for name, raw_dataset in zip(split, raw_datasets)
    }
    src_text_vocab_transform = sequential_transforms(src_tokenizer)
    tgt_text_vocab_transform = sequential_transforms(tgt_tokenizer)
    if src_vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building src Vocab based on train data')
        src_vocab = build_vocab(raw_data["train"], src_text_vocab_transform,
                                index=0)
    else:
        if not isinstance(src_vocab, Vocab):
            raise TypeError("Passed src vocabulary is not of type Vocab")
    logger_.info('src Vocab has %d entries', len(src_vocab))
    if tgt_vocab is None:
        if 'train' not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building tgt Vocab based on train data')
        tgt_vocab = build_vocab(raw_data["train"], tgt_text_vocab_transform,
                                index=1)
    else:
        if not isinstance(tgt_vocab, Vocab):
            raise TypeError("Passed tgt vocabulary is not of type Vocab")
    logger_.info('tgt Vocab has %d entries', len(tgt_vocab))
    logger_.info('Building datasets for {}'.format(split))
    datasets = []
    for key in split:
        src_text_transform = sequential_transforms(src_text_vocab_transform,
                                                   vocab_func(src_vocab),
                                                   totensor(dtype=torch.long))
        tgt_text_transform = sequential_transforms(tgt_text_vocab_transform,
                                                   vocab_func(tgt_vocab),
                                                   totensor(dtype=torch.long))
        datasets.append(
            TranslationDataset(raw_data[key], (src_vocab, tgt_vocab),
                               (src_text_transform, tgt_text_transform)))
    return wrap_datasets(tuple(datasets), split_)
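# Hedged sketch: passing an explicit (src, tgt) tokenizer pair instead of the
# default German/English spacy models. str.split as a tokenizer and the
# 'Multi30k' filenames below are illustrative assumptions.
my_tokenizers = (str.split, str.split)
train = _setup_datasets('Multi30k',
                        ('train.de', 'train.en'),
                        ('val.de', 'val.en'),
                        ('test.de', 'test.en'),
                        'train', '.data', (None, None), my_tokenizers)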
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split_, root, offset):
    split = check_default_set(split_, ('train', 'valid', 'test'), dataset_name)
    if not isinstance(train_filenames, tuple) or not isinstance(valid_filenames, tuple) \
            or not isinstance(test_filenames, tuple):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames
    extracted_files = []  # list of paths to the extracted files
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f, root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_dataset_tar = extract_archive(dataset_tar)
        if dataset_name == 'IWSLT':
            # IWSLT dataset's url downloads a multilingual tgz.
            # We need to take an extra step to pick out the specific
            # language pair from it.
            src_language = train_filenames[0].split(".")[-1]
            tgt_language = train_filenames[1].split(".")[-1]
            languages = "-".join([src_language, tgt_language])
            iwslt_tar = '.data/2016-01/texts/{}/{}/{}.tgz'
            iwslt_tar = iwslt_tar.format(src_language, tgt_language, languages)
            extracted_dataset_tar = extract_archive(iwslt_tar)
        extracted_files.extend(extracted_dataset_tar)
    else:
        raise ValueError(
            "URLS for {} must be a list or a string".format(dataset_name))
    # Clean the xml and tag files in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)
    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }
    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))
    datasets = []
    for key in split:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter),
                                   offset=offset))
    return wrap_datasets(tuple(datasets), split_)
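# Hedged sketch: for IWSLT the language pair is recovered from the filename
# suffixes ('.de', '.en' below), which drives the second extract_archive call
# above. The exact filenames here are illustrative assumptions.
train = _setup_datasets(
    'IWSLT',
    ('train.de-en.de', 'train.de-en.en'),
    ('IWSLT16.TED.tst2013.de-en.de', 'IWSLT16.TED.tst2013.de-en.en'),
    ('IWSLT16.TED.tst2014.de-en.de', 'IWSLT16.TED.tst2014.de-en.en'),
    'train', '.data', 0)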