def test_serialization(self):
    """Check that a NestedField survives a torch.save / torch.load round trip.

    The reloaded field must compare equal to the original and must
    numericalize the same padded batch to an identical tensor.
    """
    def boxed(chars, pad=0):
        # One word row: <w> + characters + </w>, right-padded with <cpad>.
        return ["<w>"] + list(chars) + ["</w>"] + ["<cpad>"] * pad

    word_field = data.NestedField(data.Field(batch_first=True))
    schema = [("words", word_field)]
    corpus = data.Dataset(
        [
            data.Example.fromlist([text], schema)
            for text in ("john loves mary", "mary cries")
        ],
        schema,
    )
    word_field.build_vocab(corpus)

    # Two already-padded sentences; every word row is 7 symbols wide.
    padded_batch = [
        [
            boxed(["<s>"], pad=4),
            boxed("john", pad=1),
            boxed("loves"),
            boxed("mary", pad=1),
            boxed(["</s>"], pad=4),
        ],
        [
            boxed(["<s>"], pad=4),
            boxed("mary", pad=1),
            boxed("cries"),
            boxed(["</s>"], pad=4),
            ["<cpad>"] * 7,
        ],
    ]

    # Serialize the field into the test directory and read it back.
    pickle_path = os.path.join(self.test_dir, "char_field.pl")
    torch.save(word_field, pickle_path)
    restored_field = torch.load(pickle_path)

    assert restored_field == word_field

    before = word_field.numericalize(padded_batch)
    after = restored_field.numericalize(padded_batch)
    assert torch.all(torch.eq(before, after))
def gen_dataset(
    self,
    data: Iterable[Dict[str, Any]],
    include_label_fields: bool = True,
    shard_range: Tuple[int, int] = None,
) -> textdata.Dataset:
    """
    Build a torchtext Dataset from raw in-memory rows.

    Args:
        data: rows to convert; they are first passed through
            ``self.preprocess``.
        include_label_fields: when True the fields in ``self.labels`` are
            added; otherwise ``Target.TARGET_LABEL_FIELD`` is removed from
            the field set if it is present.
        shard_range: optional ``(first, last)`` pair, inclusive on both
            ends, selecting rows by their position in the preprocessed
            stream. A falsy value keeps every row.

    Returns:
        dataset (TorchText.Dataset)
    """
    selected = dict(self.features)
    selected.update(self.extra_fields)
    if include_label_fields:
        selected.update(self.labels)
    else:
        selected.pop(Target.TARGET_LABEL_FIELD, None)

    # Example.fromdict is given a {row key: (attribute name, field)} mapping.
    field_map = {}
    for name, field in selected.items():
        field_map[name] = (name, field)

    examples = []
    for position, row in enumerate(self.preprocess(data)):
        # Skip rows that fall outside the requested shard.
        if shard_range and not (shard_range[0] <= position <= shard_range[1]):
            continue
        examples.append(textdata.Example.fromdict(row, field_map))

    return textdata.Dataset(examples, selected)
def make_dataset(self, files, dset):
    """Build a Dataset from the records stored under ``files`` in ``dset``.

    Every record of every listed entry is converted with
    ``data.Example.fromdict`` using ``self.example_fields``. Once the
    dataset has been built, the consumed entries are deleted from ``dset``.

    Args:
        files: keys of ``dset`` whose records should be included.
        dset: mapping from key to an iterable of record dicts; it is
            mutated (the listed keys are removed).

    Returns:
        The ``data.Dataset`` built with ``self.dataset_fields``.
    """
    def to_example(record):
        return data.Example.fromdict(record, self.example_fields)

    # All per-file record collections are looked up before iteration starts.
    records = chain(*[dset[name] for name in files])
    # NOTE(review): tmap is presumably tqdm's progress-reporting map -- confirm
    # against the file's imports.
    examples = list(tmap(to_example, records, mininterval=0.5))
    dataset = data.Dataset(examples, self.dataset_fields)

    for name in files:
        del dset[name]
    return dataset
def sentencelist2iterator(self, sentences):
    """Turn ``sentences`` into an iterator stored on ``self.iterator``.

    Each sentence is converted with ``self.sent2example``; the examples are
    wrapped in a Dataset with the ``src`` and ``rsrc`` fields and served one
    example per batch, sorted by the length of ``src``. Nothing is returned.
    """
    def source_length(example):
        return len(example.src)

    examples = [self.sent2example(sentence) for sentence in sentences]
    dataset = data.Dataset(
        examples,
        fields=[('src', self.SRC), ('rsrc', self.rSRC)],
    )
    self.iterator = data.Iterator(
        dataset,
        batch_size=1,
        sort_key=source_length,
        sort=True,
        sort_within_batch=True,
        device=self.device,
    )
def filter_init(ex_val1, ex_val2, ex_val3):
    """Build a three-example dataset and a text field with its vocab built.

    Each ``ex_val*`` argument is a list of values for the ``text1``,
    ``text2`` and ``label`` fields, in that order. Both text columns share
    one sequential field; the label uses a non-sequential field.

    Returns:
        A ``(dataset, text_field)`` tuple.
    """
    text_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    fields = [
        ("text1", text_field),
        ("text2", text_field),
        ("label", label_field),
    ]

    examples = [
        data.Example.fromlist(values, fields)
        for values in (ex_val1, ex_val2, ex_val3)
    ]
    dataset = data.Dataset(examples, fields)

    # The vocabulary is built only for the text field.
    text_field.build_vocab(dataset)
    return dataset, text_field
def read_data(corpus_file, datafields):
    """Read a column-formatted corpus file into a Dataset.

    Each line is stripped. A line consisting only of ``*`` closes the
    current sentence and emits one Example holding the collected words and
    labels. Any other line is split on whitespace; its second column is
    taken as the word and its third as the label.

    Tokens that follow the last ``*`` line are not emitted.

    Args:
        corpus_file: path of the UTF-8 encoded corpus file.
        datafields: field list passed to ``Example.fromlist`` and ``Dataset``.

    Returns:
        A ``data.Dataset`` with one example per ``*``-terminated sentence.
    """
    sentences = []
    tokens, tags = [], []
    with open(corpus_file, encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped != '*':
                parts = stripped.split()
                tokens.append(parts[1])
                tags.append(parts[2])
                continue
            # Sentence terminator: emit what was collected and start over.
            sentences.append(data.Example.fromlist([tokens, tags], datafields))
            tokens, tags = [], []
    return data.Dataset(sentences, datafields)
def test_build_vocab_from_dataset(self):
    """Check the vocab a NestedField builds from a two-example dataset.

    With ``min_freq=2`` the vocab must contain exactly the expected symbols,
    and the outer and nesting vocabularies must report the same character
    frequencies.
    """
    char_field = data.Field(
        tokenize=list,
        unk_token="<cunk>",
        pad_token="<cpad>",
        init_token="<w>",
        eos_token="</w>",
    )
    CHARS = data.NestedField(char_field, init_token="<s>", eos_token="</s>")
    schema = [("chars", CHARS)]

    examples = [
        data.Example.fromlist([text], schema)
        for text in ("aaa bbb c", "bbb aaa")
    ]
    CHARS.build_vocab(data.Dataset(examples, schema), min_freq=2)

    expected_symbols = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
    assert len(CHARS.vocab) == len(expected_symbols)
    for symbol in expected_symbols:
        assert symbol in CHARS.vocab.stoi

    # Frequencies are compared in full, including the count for "c".
    expected_freqs = Counter({"a": 6, "b": 6, "c": 1})
    outer_freqs = CHARS.vocab.freqs
    inner_freqs = CHARS.nesting_field.vocab.freqs
    assert outer_freqs == inner_freqs
    assert inner_freqs == expected_freqs
def split(table,
          path,
          train_prefix,
          validation_prefix,
          test_prefix,
          split_ratio=[0.6, 0.2, 0.2],
          stratified=False,
          strata_field='label'):
    """Split a pandas dataframe or CSV file into train / validation / test CSVs.

    Args:
        table (pandas.DataFrame or string): the dataframe to split, or
            anything else, which is read with ``pandas.read_csv``.
        path (string): directory the three CSV files are written to.
        train_prefix (string): file name joined onto ``path`` for the
            training set.
        validation_prefix (string): file name joined onto ``path`` for the
            validation set.
        test_prefix (string): file name joined onto ``path`` for the test
            set.
        split_ratio (List of floats): a list of 3 numbers denoting the
            relative sizes of train, test and valid splits respectively.
            Default is [0.6, 0.2, 0.2].
        stratified (bool): whether the sampling should be stratified.
            Default is False.
        strata_field (str): name of the examples Field stratified over.
            Default is 'label' for the conventional label field.
    """
    assert len(split_ratio) == 3

    frame = table if isinstance(table, pd.DataFrame) else pd.read_csv(table)
    # A named index is turned back into an ordinary column so it is kept.
    if frame.index.name is not None:
        frame = frame.reset_index()

    rows = list(frame.itertuples(index=False))
    fields = [(column, None) for column in list(frame)]
    train, valid, test = data.Dataset(rows, fields).split(
        split_ratio, stratified, strata_field)

    # Materialize all three frames before anything is written to disk.
    parts = [pd.DataFrame(subset.examples) for subset in (train, valid, test)]
    file_names = (train_prefix, validation_prefix, test_prefix)

    for part, file_name in zip(parts, file_names):
        part.columns = frame.columns
        part.to_csv(os.path.join(path, file_name), index=False)
def test_numericalize(self):
    """Check NestedField.numericalize with and without ``include_lengths``.

    In both modes the result must be a 3-dimensional tensor whose first
    dimension matches the number of examples, and every example must pass
    ``verify_numericalized_example``. With lengths enabled, the returned
    sentence and word lengths must each have one entry per example.
    """
    def boxed(chars, pad=0):
        # One word row: <w> + characters + </w>, right-padded with <cpad>.
        return ["<w>"] + list(chars) + ["</w>"] + ["<cpad>"] * pad

    def make_batch():
        # Two already-padded sentences; every word row is 7 symbols wide.
        return [
            [
                boxed(["<s>"], pad=4),
                boxed("john", pad=1),
                boxed("loves"),
                boxed("mary", pad=1),
                boxed(["</s>"], pad=4),
            ],
            [
                boxed(["<s>"], pad=4),
                boxed("mary", pad=1),
                boxed("cries"),
                boxed(["</s>"], pad=4),
                ["<cpad>"] * 7,
            ],
        ]

    def make_field(**nested_kwargs):
        # Fresh NestedField with its vocab built from the two sentences.
        nested = data.NestedField(
            data.Field(batch_first=True), **nested_kwargs)
        schema = [("words", nested)]
        examples = [
            data.Example.fromlist([text], schema)
            for text in ("john loves mary", "mary cries")
        ]
        nested.build_vocab(data.Dataset(examples, schema))
        return nested

    def verify_each(nested, batch, tensor):
        for example, numericalized_example in zip(batch, tensor):
            verify_numericalized_example(
                nested, example, numericalized_example, batch_first=True)

    # Plain numericalization.
    field = make_field()
    examples_data = make_batch()
    numericalized = field.numericalize(examples_data)

    assert numericalized.dim() == 3
    assert numericalized.size(0) == len(examples_data)
    verify_each(field, examples_data, numericalized)

    # test include_lengths
    field = make_field(include_lengths=True)
    examples_data = make_batch()
    sentence_lengths = [5, 4]
    word_lengths = [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]
    numericalized, seq_len, word_len = field.numericalize(
        (examples_data, sentence_lengths, word_lengths))

    assert numericalized.dim() == 3
    assert len(seq_len) == 2
    assert len(word_len) == 2
    assert numericalized.size(0) == len(examples_data)
    verify_each(field, examples_data, numericalized)