def get_dataset():
    data = [
        {"Name": "Mark Dark", "Score": 5},
        {"Name": "Stephen Smith", "Score": 10},
        {"Name": "Ann Mann", "Score": 15},
    ]

    name_field = Field("Name", numericalizer=Vocab(), keep_raw=True, tokenizer="split")
    score_field = Field(
        "Score", numericalizer=int, keep_raw=True, tokenizer=None, is_target=True
    )
    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
def dataset_with_upper_field(fields):
    upper_name_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    # `None` ignores the first column, so this dataset shares no fields
    # with the `dataset` fixture and concatenation must fail.
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
def predict_raw(self, raw_example: Any, **kwargs) -> np.ndarray:
    """
    Computes the model's prediction for a single example. The example must
    be in the format provided in the constructor as the `example_format`
    parameter.

    Parameters
    ----------
    raw_example : Any
        Example to compute the prediction for.
    kwargs
        Keyword arguments passed to the model's `predict` method.

    Returns
    -------
    ndarray
        Tensor containing the prediction for the example.
    """
    processed_example = self.prediction_example_factory.from_format(
        raw_example, self.example_format
    )
    ds = Dataset([processed_example], self.feature_fields)
    prediction = self.predict(ds, **kwargs)

    # Indexed with 0 to extract the single prediction from the prediction batch
    prediction = prediction[0]

    if self.output_transform_fn is not None:
        return self.output_transform_fn(prediction)
    else:
        return prediction
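# A minimal usage sketch for `predict_raw` above. `experiment` stands in for
# an already-fitted instance of the class this method belongs to, constructed
# with a dict `example_format`; the instance name and the example layout are
# assumptions for illustration, not taken from the snippet itself.
def _predict_raw_usage_sketch(experiment):
    prediction = experiment.predict_raw({"Name": "Mark Dark"})
    # The returned ndarray holds the prediction for this single example; if an
    # `output_transform_fn` was configured, it has already been applied.
    return prediction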
def __getitem__(self, i):
    raw_examples = self.dataset[i]  # Index or slice

    if isinstance(i, int):
        return self._example_factory.from_dict(raw_examples)
    else:
        # A slice of an hf.datasets.Dataset is a dictionary mapping each
        # column name to a list of values. To turn this into a list of
        # Podium Examples, first unpack the dict-of-lists into a list of
        # dicts, creating one dict per value tuple.
        raw_examples = [
            {k: v for k, v in zip(raw_examples, values)}
            for values in zip(*raw_examples.values())
        ]

        # Map each raw example to a Podium example
        examples = [
            self._example_factory.from_dict(raw_example)
            for raw_example in raw_examples
        ]

        # Cast to a dataset
        return Dataset(examples, self.fields, sort_key=None)
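# The dict-of-lists to list-of-dicts unpacking above, shown in isolation as a
# runnable sketch; the column names here are made up for illustration.
def _unpack_columns_sketch():
    raw_examples = {"name": ["Mark", "Ann"], "score": [5, 15]}
    rows = [
        {k: v for k, v in zip(raw_examples, values)}
        for values in zip(*raw_examples.values())
    ]
    # Each value tuple produced by zip(*columns) is paired back with the
    # dict's keys, yielding one dict per example.
    assert rows == [{"name": "Mark", "score": 5}, {"name": "Ann", "score": 15}]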
def fit_raw(
    self,
    examples: Iterable[Union[Dict, List]],
    model_kwargs: Dict = None,
    trainer_kwargs: Dict = None,
    feature_transformer: FeatureTransformer = None,
    trainer: AbstractTrainer = None,
):
    """
    Fits the model to the provided examples. During fitting, the provided
    Iterator and Trainer are used. Each example must be in the format
    provided in the constructor as the `example_format` parameter.

    Parameters
    ----------
    examples : Iterable[Union[Dict, List]]
        Examples that will be used in fitting.
    model_kwargs : dict
        Dict containing model arguments. Arguments passed to the model are
        the default arguments defined with `set_default_model_args`
        updated/overridden by `model_kwargs`.
    trainer_kwargs : dict
        Dict containing trainer arguments. Arguments passed to the trainer
        are the default arguments defined with `set_default_trainer_args`
        updated/overridden by `trainer_kwargs`.
    feature_transformer : FeatureTransformer, optional
        FeatureTransformer that transforms the input part of the batch
        returned by the iterator into features that can be fed into the
        model. Will also be fitted during Experiment fitting. If None, the
        default FeatureTransformer provided in the constructor will be
        used. Otherwise, this will overwrite the default feature
        transformer.
    trainer : AbstractTrainer, optional
        Trainer used to fit the model. If None, the trainer provided in
        the constructor will be used.
    """
    processed_examples = [
        self.training_example_factory.from_format(ex, self.example_format)
        for ex in examples
    ]
    ds = Dataset(processed_examples, self.all_fields)
    self.fit(
        ds,
        model_kwargs=model_kwargs,
        trainer_kwargs=trainer_kwargs,
        feature_transformer=feature_transformer,
        trainer=trainer,
    )
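# Hedged usage sketch for `fit_raw`. As above, `experiment` is a hypothetical
# instance with a dict `example_format`; the raw examples and the trainer
# keyword argument are illustrative assumptions, not an API guarantee.
def _fit_raw_usage_sketch(experiment):
    experiment.fit_raw(
        [{"Name": "Mark Dark", "Score": 5}, {"Name": "Ann Mann", "Score": 15}],
        trainer_kwargs={"max_epoch": 10},  # assumed trainer argument
    )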
def as_dataset(self) -> Dataset:
    """
    Convert the original HuggingFace dataset to a podium.Dataset.

    Returns
    -------
    podium.Dataset
        podium.Dataset instance.
    """
    return Dataset(list(self), self.fields)
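# Conversion sketch for `as_dataset`: given `hf_wrapped`, a hypothetical
# instance of the HuggingFace wrapper class this method belongs to, the whole
# wrapped dataset is materialized as an in-memory podium.Dataset in one call.
def _as_dataset_usage_sketch(hf_wrapped):
    podium_ds = hf_wrapped.as_dataset()
    return podium_ds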
def test_from_dataset(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)

    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
        assert ds_ex.number == arrow_ex.number
        assert ds_ex.tokens == arrow_ex.tokens

    pyarrow_dataset.delete_cache()
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field(
        "name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView(
        [dataset, other_dataset], field_overrides={"name": new_field}
    )

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    assert set(concat_vocab.itos) == set(dataset_vocab.itos) | set(other_vocab.itos)
def test_slice_view_to_dataset(dataset, tmp_path):
    start, stop, step = 3, 8, 2
    slc = slice(start, stop, step)
    dataset_view = DatasetSlicedView(dataset, s=slc)

    # Cast to Dataset
    ds = Dataset.from_dataset(dataset_view)
    assert isinstance(ds, Dataset)
    assert len(ds) == len(dataset_view)
    for ex_view, ex_dataset in zip(dataset_view, ds):
        for f in dataset.fields:
            assert ex_view[f.name] == ex_dataset[f.name]

    # Cast to DiskBackedDataset
    ds = DiskBackedDataset.from_dataset(dataset_view, cache_path=tmp_path)
    assert isinstance(ds, DiskBackedDataset)
    assert len(ds) == len(dataset_view)
    for ex_view, ex_dataset in zip(dataset_view, ds):
        for f in dataset.fields:
            assert ex_view[f.name] == ex_dataset[f.name]
def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])
        if i == 1:
            # With three examples and batch_size=2, the second (last) batch
            # holds the single remaining example. The original check on
            # `i == 2` was dead code, since only batches 0 and 1 exist.
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
def partial_fit_raw(
    self,
    examples: Iterable[Union[Dict, List]],
    trainer_kwargs: Dict = None,
    trainer: AbstractTrainer = None,
):
    """
    Fits the model to the data without resetting the model. Each example
    must be in the format provided in the constructor as the
    `example_format` parameter.

    Parameters
    ----------
    examples : Iterable[Union[Dict, List]]
        Iterable of examples in raw state.
    trainer_kwargs : dict
        Dict containing trainer arguments. Arguments passed to the trainer
        are the default arguments defined with `set_default_trainer_args`
        updated/overridden by `trainer_kwargs`.
    trainer : AbstractTrainer, optional
        Trainer used to fit the model. If None, the trainer provided in
        the constructor will be used.
    """
    processed_examples = [
        self.training_example_factory.from_format(ex, self.example_format)
        for ex in examples
    ]
    ds = Dataset(processed_examples, self.all_fields)
    self.partial_fit(dataset=ds, trainer_kwargs=trainer_kwargs, trainer=trainer)
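# Sketch of incremental training with `partial_fit_raw`. `experiment` and
# `example_stream` are assumed names; unlike `fit_raw`, model state is kept
# between calls, so chunks of raw examples can be fed in as they arrive.
def _partial_fit_raw_usage_sketch(experiment, example_stream):
    for chunk in example_stream:
        experiment.partial_fit_raw(chunk)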
def dataset(fields) -> DatasetBase:
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds