def __init__(self, begin_line=0, num_lines=6348957, root='.data'):
    """Initiate EnWik9 dataset.

    Downloads and normalizes the enwik9 corpus on first use, then loads
    the requested slice of lines as a flat list of whitespace tokens.

    Arguments:
        begin_line: the number of the beginning line. Default: 0
        num_lines: the number of lines to be loaded. Default: 6348957
        root: Directory where the datasets are saved. Default: ".data"

    Examples:
        >>> from torchtext.datasets import EnWik9
        >>> enwik9 = EnWik9(num_lines=20000)
        >>> vocab = enwik9.get_vocab()
    """
    super(EnWik9, self).__init__()

    norm_path = os.path.join(root, 'norm_enwik9')
    if not os.path.exists(norm_path):
        # First run: fetch the archive, unpack it, and normalize the raw
        # dump into the cached 'norm_enwik9' file used on later runs.
        archive = download_from_url(
            'http://mattmahoney.net/dc/enwik9.zip',
            path=os.path.join(root, 'enwik9.zip'),
            root=root)
        raw_path = extract_archive(archive)[0]
        preprocess_raw_enwik9(raw_path, norm_path)

    # Per-line byte offsets let the reader seek to begin_line directly.
    line_offsets = generate_offsets(norm_path)
    selected_lines = read_lines_from_iterator(
        norm_path, line_offsets, begin_line, num_lines)

    # simple_space_split yields a token list per line; flatten them all
    # into a single token stream.
    self._data = []
    for tokens in simple_space_split(selected_lines):
        self._data.extend(tokens)

    self._vocab = None
def test_simple_space_split(self):
    # A single input line should be tokenized on spaces into its words.
    sample = ['test simple space split function']
    expected = ['test', 'simple', 'space', 'split', 'function']
    tokenized = list(simple_space_split(sample))
    self.assertEqual(tokenized[0], expected)