def _setup_datasets(dataset_name, root='.data'):
    select_to_index = {'train': 0, 'dev': 1}
    extracted_files = [
        download_from_url(URLS[dataset_name][select_to_index[key]], root=root)
        for key in select_to_index.keys()
    ]
    train_iter = _create_data_from_json(extracted_files[0])
    dev_iter = _create_data_from_json(extracted_files[1])
    return (RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name], train_iter),
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name], dev_iter))
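# NOTE: `_create_data_from_json` is referenced above but not defined in this
# file. The following is a minimal sketch, assuming the SQuAD-style JSON
# schema ('data' -> 'paragraphs' -> 'qas'); it is not necessarily the exact
# library implementation.
import json

def _create_data_from_json(data_path):
    # Walk the SQuAD-style JSON tree and yield one
    # (context, question, answers, answer_starts) tuple per question.
    with open(data_path) as json_file:
        raw_data = json.load(json_file)['data']
    for article in raw_data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                answers = [a['text'] for a in qa['answers']]
                answer_starts = [a['answer_start'] for a in qa['answers']]
                if not answers:
                    # Unanswerable questions carry an empty answer list.
                    answers, answer_starts = [''], [-1]
                yield context, qa['question'], answers, answer_starts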
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'test'), dataset_name)
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item], root=root,
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_csv(csv_path[item]),
                                   offset=offset) for item in split), split_)
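# NOTE: `check_default_set` and `wrap_datasets` are shared helpers not shown
# in this file. A plausible sketch, assuming they only normalize/validate the
# requested splits and unwrap single-split results:
def check_default_set(split, target_select, dataset_name='dataset'):
    # Normalize a plain string to a one-element tuple, then validate that
    # the requested splits are a subset of what the dataset provides.
    if isinstance(split, str):
        split = (split,)
    if not set(split).issubset(set(target_select)):
        raise TypeError('Given split {} can only be a subset of {} for {}'.format(
            split, target_select, dataset_name))
    return split


def wrap_datasets(datasets, split):
    # If the caller asked for a single split as a plain string, return the
    # dataset itself rather than a one-element tuple.
    if isinstance(split, str):
        if len(datasets) != 1:
            raise ValueError("datasets should contain exactly one element")
        return datasets[0]
    return datasets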
def _setup_datasets(dataset_name, separator, root=".data"): extracted_files = [] if isinstance(URLS[dataset_name], list): for f in URLS[dataset_name]: dataset_tar = download_from_url(f, root=root) extracted_files.extend(extract_archive(dataset_tar)) elif isinstance(URLS[dataset_name], str): dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files.extend(extract_archive(dataset_tar)) else: raise ValueError( "URLS for {} has to be in a form or list or string".format( dataset_name)) data_filenames = { "train": _construct_filepath(extracted_files, "train.txt"), "valid": _construct_filepath(extracted_files, "dev.txt"), "test": _construct_filepath(extracted_files, "test.txt") } datasets = [] for key in data_filenames.keys(): if data_filenames[key] is not None: datasets.append( RawTextIterableDataset( dataset_name, NUM_LINES[dataset_name], _create_data_from_iob(data_filenames[key], separator))) else: datasets.append(None) return datasets
def _setup_datasets(dataset_name, separator, root, split, offset):
    extracted_files = []
    if isinstance(URLS[dataset_name], dict):
        for name, item in URLS[dataset_name].items():
            dataset_tar = download_from_url(item, root=root,
                                            hash_value=MD5[dataset_name][name],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in the form of a dictionary or string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return [
        RawTextIterableDataset(
            dataset_name, NUM_LINES[dataset_name][item],
            _create_data_from_iob(data_filenames[item], separator),
            offset=offset) if data_filenames[item] is not None else None
        for item in split
    ]
def _setup_datasets(dataset_name, root, split, year, language, offset):
    if dataset_name == 'PennTreebank':
        extracted_files = [download_from_url(URLS['PennTreebank'][key],
                                             root=root,
                                             hash_value=MD5['PennTreebank'][key],
                                             hash_type='md5') for key in split]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    if dataset_name == 'WMTNewsCrawl':
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]

    path = {}
    for item in split:
        for fname in extracted_files:
            if item in fname:
                path[item] = fname

    datasets = []
    for item in split:
        logging.info('Creating {} data'.format(item))
        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   iter(io.open(path[item], encoding="utf8")),
                                   offset=offset))
    return datasets
def IMDB(root='.data', split=('train', 'test'), offset=0): """ Defines raw IMDB datasets. Create supervised learning dataset: IMDB Separately returns the raw training and test dataset Args: root: Directory where the datasets are saved. Default: ".data" split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. offset: the number of the starting line. Default: 0 Examples: >>> train, test = torchtext.experimental.datasets.raw.IMDB() """ split_ = check_default_set(split, ('train', 'test'), 'IMDB') dataset_tar = download_from_url(URLS['IMDB'], root=root, hash_value=MD5['IMDB'], hash_type='md5') extracted_files = extract_archive(dataset_tar) return wrap_datasets( tuple( RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item], generate_imdb_data(item, extracted_files), offset=offset) for item in split_), split)
def _setup_datasets(dataset_name, root='.data'):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    train_iter = _create_data_from_csv(train_csv_path)
    test_iter = _create_data_from_csv(test_csv_path)
    return (RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name], train_iter),
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name], test_iter))
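# NOTE: `_create_data_from_csv` is also external to this file. A minimal
# sketch using the standard csv module (the real helper may rely on a
# unicode-aware CSV reader instead):
import csv
import io

def _create_data_from_csv(data_path):
    # Each row is (label, text...); yield the integer label and the
    # remaining columns joined into one text string.
    with io.open(data_path, encoding='utf8') as f:
        for row in csv.reader(f):
            yield int(row[0]), ' '.join(row[1:])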
def _setup_datasets(dataset_name, separator, root, data_select):
    data_select = check_default_set(data_select,
                                    target_select=('train', 'valid', 'test'))
    extracted_files = []
    if isinstance(URLS[dataset_name], list):
        for f in URLS[dataset_name]:
            dataset_tar = download_from_url(f, root=root)
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in the form of a list or string".format(
                dataset_name))

    data_filenames = {
        "train": _construct_filepath(extracted_files, "train.txt"),
        "valid": _construct_filepath(extracted_files, "dev.txt"),
        "test": _construct_filepath(extracted_files, "test.txt")
    }
    return tuple(
        RawTextIterableDataset(
            dataset_name, NUM_LINES[dataset_name],
            _create_data_from_iob(data_filenames[item], separator))
        if data_filenames[item] is not None else None
        for item in data_select)
def _setup_datasets(dataset_name, root, split, offset):
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item], root=root,
                              path=os.path.join(root, _PATHS[dataset_name][item]),
                              hash_value=MD5['AG_NEWS'][item],
                              hash_type='md5') for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        path=os.path.join(root, _PATHS[dataset_name]),
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return [
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               _create_data_from_csv(csv_path[item]),
                               offset=offset) for item in split
    ]
def IMDB(root='.data', split=('train', 'test'), offset=0): """ Examples: >>> train, test = torchtext.experimental.datasets.raw.IMDB() """ dataset_tar = download_from_url(URLS['IMDB'], root=root, hash_value=MD5['IMDB'], hash_type='md5') extracted_files = extract_archive(dataset_tar) return [RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item], generate_imdb_data(item, extracted_files), offset=offset) for item in split]
def IMDB(root='.data'): """ Defines IMDB datasets. Create supervised learning dataset: IMDB Separately returns the training and test dataset Arguments: root: Directory where the datasets are saved. Default: ".data" Examples: >>> train, test = torchtext.experimental.datasets.raw.IMDB() """ dataset_tar = download_from_url(URLS['IMDB'], root=root) extracted_files = extract_archive(dataset_tar) train_iter = generate_imdb_data('train', extracted_files) test_iter = generate_imdb_data('test', extracted_files) return (RawTextIterableDataset("IMDB", NUM_LINES["IMDB"], train_iter), RawTextIterableDataset("IMDB", NUM_LINES["IMDB"], test_iter))
def _setup_datasets(dataset_name, root, data_select):
    data_select = check_default_set(data_select, ('train', 'dev'))
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key], root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5') for key in data_select
    }
    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               _create_data_from_json(extracted_files[item]))
        for item in data_select)
def _setup_datasets(dataset_name, root, data_select, year, language):
    data_select = check_default_set(data_select, ('train', 'test', 'valid'))
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    if dataset_name == 'PennTreebank':
        select_to_index = {'train': 0, 'test': 1, 'valid': 2}
        extracted_files = [
            download_from_url(URLS['PennTreebank'][select_to_index[key]],
                              root=root,
                              hash_value=MD5['PennTreebank'][key],
                              hash_type='md5') for key in data_select
        ]
    elif dataset_name == 'WMTNewsCrawl':
        if not set(data_select).issubset({'train'}):
            raise ValueError("WMTNewsCrawl only creates a training dataset. "
                             "data_select should be 'train' "
                             "or ('train',), got {}.".format(data_select))
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5['WMTNewsCrawl'],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)
        file_name = 'news.{}.{}.shuffled'.format(year, language)
        extracted_files = [f for f in extracted_files if file_name in f]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files = extract_archive(dataset_tar)

    _path = {}
    for item in data_select:
        for fname in extracted_files:
            if item in fname:
                _path[item] = fname

    data = {}
    for item in _path.keys():
        logging.info('Creating {} data'.format(item))
        data[item] = iter(io.open(_path[item], encoding="utf8"))

    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               data[item]) for item in data_select)
def _setup_datasets(dataset_name, root, split, offset):
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key], root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5') for key in split
    }
    return [
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
                               _create_data_from_json(extracted_files[item]),
                               offset=offset) for item in split
    ]
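# NOTE: `RawTextIterableDataset` is the wrapper returned by every setup
# function above. A condensed sketch, assuming it is an IterableDataset that
# skips `offset` lines and reports the remaining line count as its length:
import torch

class RawTextIterableDataset(torch.utils.data.IterableDataset):
    # Wrap a raw line iterator with a dataset name, a known total line
    # count, and a starting offset.
    def __init__(self, name, full_num_lines, iterator, offset=0):
        super().__init__()
        if offset < 0:
            raise ValueError("Given offset must be non-negative")
        self.name = name
        self._iterator = iterator
        self.start = offset
        self.num_lines = full_num_lines - offset

    def __iter__(self):
        for i, item in enumerate(self._iterator):
            if i < self.start:
                continue  # skip lines before the requested offset
            if i >= self.start + self.num_lines:
                break
            yield item

    def __len__(self):
        return self.num_lines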
def _setup_datasets(dataset_name, root, split_, offset):
    split = check_default_set(split_, ('train', 'dev'), dataset_name)
    extracted_files = {
        key: download_from_url(URLS[dataset_name][key], root=root,
                               hash_value=MD5[dataset_name][key],
                               hash_type='md5') for key in split
    }
    return wrap_datasets(
        tuple(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][item],
                                   _create_data_from_json(extracted_files[item]),
                                   offset=offset) for item in split), split_)
def _setup_datasets(dataset_name, root, data_select):
    data_select = check_default_set(data_select, target_select=('train', 'test'))
    if dataset_name == 'AG_NEWS':
        extracted_files = [
            download_from_url(URLS[dataset_name][item], root=root)
            for item in ('train', 'test')
        ]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)
    csv_path = {}
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            csv_path['train'] = fname
        if fname.endswith('test.csv'):
            csv_path['test'] = fname
    return tuple(
        RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name],
                               _create_data_from_csv(csv_path[item]))
        for item in data_select)
def IMDB(root='.data', data_select=('train', 'test')):
    """ Defines raw IMDB datasets.

    Create supervised learning dataset: IMDB
    Separately returns the raw training and test dataset

    Arguments:
        root: Directory where the datasets are saved. Default: ".data"
        data_select: a string or tuple for the returned datasets.
            Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users
            could also choose any one or two of them, for example
            ('train', 'test') or just a string 'train'.

    Examples:
        >>> train, test = torchtext.experimental.datasets.raw.IMDB()
    """
    data_select = check_default_set(data_select, target_select=('train', 'test'))
    dataset_tar = download_from_url(URLS['IMDB'], root=root)
    extracted_files = extract_archive(dataset_tar)
    return tuple(
        RawTextIterableDataset("IMDB", NUM_LINES["IMDB"],
                               generate_imdb_data(item, extracted_files))
        for item in data_select)
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, split, root, offset):
    if not (isinstance(train_filenames, tuple) and isinstance(valid_filenames, tuple)
            and isinstance(test_filenames, tuple)):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames

    extracted_files = []  # list of paths to the extracted files
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f, root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_dataset_tar = extract_archive(dataset_tar)
        if dataset_name == 'IWSLT':
            # IWSLT dataset's url downloads a multilingual tgz.
            # We need to take an extra step to pick out the specific
            # language pair from it.
            src_language = train_filenames[0].split(".")[-1]
            tgt_language = train_filenames[1].split(".")[-1]
            languages = "-".join([src_language, tgt_language])
            iwslt_tar = '.data/2016-01/texts/{}/{}/{}.tgz'
            iwslt_tar = iwslt_tar.format(src_language, tgt_language, languages)
            extracted_dataset_tar = extract_archive(iwslt_tar)
        extracted_files.extend(extracted_dataset_tar)
    else:
        raise ValueError(
            "URLS for {} has to be in the form of a list or string".format(
                dataset_name))

    # Clean the xml and tag files in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)

    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }
    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))

    datasets = []
    for key in split:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter),
                                   offset=offset))
    return datasets
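# NOTE: the cleaning helpers `_clean_xml_file` and `_clean_tags_file` are
# referenced but not shown. A sketch under the assumption that the XML files
# carry one sentence per <seg> element and the .tags files only need their
# metadata lines dropped:
import codecs
import io
import os
import xml.etree.ElementTree as ET

def _clean_xml_file(f_xml):
    # Extract the text of every <seg> element into a plain-text sibling
    # file (same path without the .xml extension).
    f_txt = os.path.splitext(f_xml)[0]
    with codecs.open(f_txt, mode='w', encoding='utf-8') as fd_txt:
        root = ET.parse(f_xml).getroot()[0]
        for doc in root.findall('doc'):
            for seg in doc.findall('seg'):
                fd_txt.write(seg.text.strip() + '\n')


def _clean_tags_file(f_orig):
    # Drop metadata lines (<url>, <talkid>, ...) and write the remaining
    # sentences to a file without the '.tags' infix.
    xml_tags = ['<url', '<keywords', '<talkid', '<description',
                '<reviewer', '<translator', '<title', '<speaker']
    f_txt = f_orig.replace('.tags', '')
    with codecs.open(f_txt, mode='w', encoding='utf-8') as fd_txt, \
            io.open(f_orig, mode='r', encoding='utf-8') as fd_orig:
        for line in fd_orig:
            if not any(tag in line for tag in xml_tags):
                fd_txt.write(line.strip() + '\n')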
def _setup_datasets(dataset_name, train_filenames, valid_filenames,
                    test_filenames, data_select, root):
    data_select = check_default_set(data_select, ('train', 'valid', 'test'))
    if not (isinstance(train_filenames, tuple) and isinstance(valid_filenames, tuple)
            and isinstance(test_filenames, tuple)):
        raise ValueError("All filenames must be tuples")
    src_train, tgt_train = train_filenames
    src_eval, tgt_eval = valid_filenames
    src_test, tgt_test = test_filenames

    extracted_files = []
    if isinstance(URLS[dataset_name], list):
        for idx, f in enumerate(URLS[dataset_name]):
            dataset_tar = download_from_url(f, root=root,
                                            hash_value=MD5[dataset_name][idx],
                                            hash_type='md5')
            extracted_files.extend(extract_archive(dataset_tar))
    elif isinstance(URLS[dataset_name], str):
        dataset_tar = download_from_url(URLS[dataset_name], root=root,
                                        hash_value=MD5[dataset_name],
                                        hash_type='md5')
        extracted_files.extend(extract_archive(dataset_tar))
    else:
        raise ValueError(
            "URLS for {} has to be in the form of a list or string".format(
                dataset_name))

    # Clean the xml and tag files in the archives
    file_archives = []
    for fname in extracted_files:
        if 'xml' in fname:
            _clean_xml_file(fname)
            file_archives.append(os.path.splitext(fname)[0])
        elif "tags" in fname:
            _clean_tags_file(fname)
            file_archives.append(fname.replace('.tags', ''))
        else:
            file_archives.append(fname)

    data_filenames = {
        "train": _construct_filepaths(file_archives, src_train, tgt_train),
        "valid": _construct_filepaths(file_archives, src_eval, tgt_eval),
        "test": _construct_filepaths(file_archives, src_test, tgt_test)
    }
    for key in data_filenames.keys():
        if data_filenames[key] is None or len(data_filenames[key]) == 0:
            raise FileNotFoundError(
                "Files are not found for data type {}".format(key))

    datasets = []
    for key in data_select:
        src_data_iter = _read_text_iterator(data_filenames[key][0])
        tgt_data_iter = _read_text_iterator(data_filenames[key][1])

        def _iter(src_data_iter, tgt_data_iter):
            for item in zip(src_data_iter, tgt_data_iter):
                yield item

        datasets.append(
            RawTextIterableDataset(dataset_name,
                                   NUM_LINES[dataset_name][key],
                                   _iter(src_data_iter, tgt_data_iter)))
    return tuple(datasets)
def Multi30k(train_filenames=("train.de", "train.en"),
             valid_filenames=("val.de", "val.en"),
             test_filenames=("test_2016_flickr.de", "test_2016_flickr.en"),
             data_select=('train', 'valid', 'test'),
             root='.data'):
    """ Define translation datasets: Multi30k
    Separately returns train/valid/test datasets as a tuple

    The available datasets include:
        test_2016_flickr.cs
        test_2016_flickr.de
        test_2016_flickr.en
        test_2016_flickr.fr