def test_multiple_output_for_input_list(expected_values):
    """Check that one input column can be mapped to a tuple of Fields.

    The first column is routed to three Fields (the original one plus a
    lower-casing and an upper-casing variant); each must expose its own
    raw/tokenized pair on the resulting Example.
    """
    lowercase_field = Field("Lowercase_name", keep_raw=True)
    lowercase_field.add_pretokenize_hook(str.lower)

    uppercase_field = Field("Uppercase_name", keep_raw=True)
    uppercase_field.add_pretokenize_hook(str.upper)

    # Work on a copy so the shared `field_list` itself is not mutated.
    fields = list(field_list)
    fields[0] = (fields[0], lowercase_field, uppercase_field)

    example = ExampleFactory(fields).from_list(expected_values)

    name = expected_values[0]
    expected_by_field = (
        ("Name", name),
        ("Lowercase_name", name.lower()),
        ("Uppercase_name", name.upper()),
    )
    for field_name, expected_raw in expected_by_field:
        raw, tokenized = example[field_name]
        assert raw == expected_raw
        assert tokenized == expected_raw.split()

    score_raw, _ = example["Score"]
    assert score_raw == expected_values[1]

    food_raw, _ = example["Favorite_food"]
    assert food_raw == expected_values[2]
def get_dataset():
    """Build a three-example Dataset with "Name" and "Score" fields.

    Returns
    -------
    Dataset
        Dataset built from dict examples, with `finalize_fields` already
        called on it.
    """
    rows = [
        {"Name": "Mark Dark", "Score": 5},
        {"Name": "Stephen Smith", "Score": 10},
        {"Name": "Ann Mann", "Score": 15},
    ]

    fields = {
        "Name": Field(
            "Name", numericalizer=Vocab(), keep_raw=True, tokenizer="split"
        ),
        "Score": Field(
            "Score",
            numericalizer=int,
            keep_raw=True,
            tokenizer=None,
            is_target=True,
        ),
    }

    factory = ExampleFactory(fields)
    dataset = Dataset([factory.from_dict(row) for row in rows], fields)
    dataset.finalize_fields()
    return dataset
def test_cache_data_field_from_dict(expected_values):
    """Every Field of `field_dict` must be reachable on a dict-built Example.

    Each field name is checked both with `in` and as an attribute.
    """
    example = ExampleFactory(field_dict).from_dict(expected_values)

    for name in (field.name for field in field_dict.values()):
        assert name in example
        assert hasattr(example, name)
def test_ignore_values_dict(expected_values):
    """Build an Example from a dict with only the "Name" field configured.

    The "Name" field must be present on the Example and keep its raw value.
    """
    factory = ExampleFactory({"Name": name_field})
    example = factory.from_dict(expected_values)

    assert "Name" in example
    assert hasattr(example, "Name")

    raw_name, _tokenized = example["Name"]
    assert raw_name == expected_values["Name"]
def test_ignore_values_list(expected_values):
    """Build an Example from a list whose first two columns map to None.

    Only the third column has a Field; it must be present on the Example
    and keep its raw value.
    """
    factory = ExampleFactory([None, None, favorite_food_field])
    example = factory.from_list(expected_values)

    assert "Favorite_food" in example
    assert hasattr(example, "Favorite_food")

    raw_food, _tokenized = example["Favorite_food"]
    assert raw_food == expected_values[2]
def dataset_with_upper_field(fields):
    """Return a finalized Dataset over TEST_DATA with an upper-casing field.

    Parameters
    ----------
    fields
        Indexable collection of Fields; only `fields[0]` is used. It is
        paired with a new "upper_name" Field that applies `str.upper` as a
        pretokenize hook.
    """
    upper_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    dataset_fields = [fields[0], upper_field]

    factory = ExampleFactory(dataset_fields)
    dataset = Dataset(
        [factory.from_list(row) for row in TEST_DATA], dataset_fields
    )
    dataset.finalize_fields()
    return dataset
def test_from_dataset(data, fields):
    """DiskBackedDataset.from_dataset must reproduce the source Dataset.

    Builds an in-memory Dataset from `data`, converts it, and compares the
    `number` and `tokens` attributes of the examples pairwise.
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    try:
        for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
            assert ds_ex.number == arrow_ex.number
            assert ds_ex.tokens == arrow_ex.tokens
    finally:
        # Clean up the cache even when an assertion above fails, so a
        # failing run does not leave it behind.
        pyarrow_dataset.delete_cache()
def test_concat_view_fail_no_field_intersection(dataset):
    """DatasetConcatView must raise ValueError for these two datasets.

    The second dataset is built with a single "upper_name" field (the first
    column is mapped to None).
    """
    upper_field = Field(
        "upper_name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [None, upper_field]

    factory = ExampleFactory(other_fields)
    other_dataset = Dataset(
        [factory.from_list(row) for row in TEST_DATA], other_fields
    )
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
def test_create_from_list(expected_values):
    """An Example built with `from_list` exposes the expected column values."""
    example = ExampleFactory(field_list).from_list(expected_values)

    _, name_tokens = example["Name"]
    assert name_tokens == expected_values[0].split()

    score_raw, _ = example["Score"]
    assert score_raw == expected_values[1]

    food_raw, _ = example["Favorite_food"]
    assert food_raw == expected_values[2]
def test_create_from_tsv(expected_values, example_tsv_string):
    """An Example parsed with `from_csv` and a tab delimiter matches the inputs."""
    factory = ExampleFactory(field_list)
    example = factory.from_csv(example_tsv_string, delimiter="\t")

    name_raw, name_tokens = example["Name"]
    assert name_raw == expected_values[0]
    assert name_tokens == expected_values[0].split()

    # The raw score is converted with int() before comparison.
    score_raw, _ = example["Score"]
    assert int(score_raw) == expected_values[1]

    food_raw, _ = example["Favorite_food"]
    assert food_raw == expected_values[2]
def test_create_from_json_string(expected_values, example_json_string):
    """An Example parsed with `from_json` matches the expected dict values."""
    example = ExampleFactory(field_dict).from_json(example_json_string)

    name_raw, name_tokens = example["Name"]
    expected_name = expected_values["Name"]
    assert name_raw == expected_name
    assert name_tokens == expected_name.split()

    for key in ("Score", "Favorite_food"):
        raw_value, _ = example[key]
        assert raw_value == expected_values[key]
def test_from_examples(data, fields):
    """DiskBackedDataset.from_examples must expose the original column values.

    Checks the `number` and `tokens` columns of the created dataset against
    the `(number, tokens)` pairs in `data`.
    """
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(ex) for ex in data]
    ad = DiskBackedDataset.from_examples(fields, examples)

    try:
        for (raw, tokenized), (num, _) in zip(ad.number, data):
            assert raw == num
            assert tokenized is num

        for (raw, tokenized), (_, tok) in zip(ad.tokens, data):
            assert raw == tok
            assert tokenized == tok.split(" ")
    finally:
        # Clean up the cache even when an assertion above fails, so a
        # failing run does not leave it behind.
        ad.delete_cache()
def create_dataset():
    """Build a Dataset with "text" and "source" fields from the tabular constants.

    Rows are formed by zipping TABULAR_TEXT with TABULAR_SOURCES. The
    returned Dataset is not finalized here.
    """
    text_field = Field("text", numericalizer=Vocab())
    source_field = Field("source", numericalizer=Vocab(), tokenizer=list)
    fields = (text_field, source_field)

    factory = ExampleFactory(fields)
    rows = zip(TABULAR_TEXT, TABULAR_SOURCES)
    return Dataset([factory.from_list(row) for row in rows], fields)
def test_text_clean_up(kwargs, data, expected_output):
    """TextCleanUp used as a pretokenize hook must yield `expected_output`.

    Skipped when the `cleantext` package is not importable.
    """
    pytest.importorskip("cleantext")

    cleanup_hook = TextCleanUp(**kwargs)
    data_field = Field(name="data", tokenizer=None, keep_raw=True)
    data_field.add_pretokenize_hook(cleanup_hook)

    example = ExampleFactory([data_field]).from_list([data])
    processed = example["data"][1]
    assert expected_output == processed
def test_remove_stopwords():
    """The English stopword hook must drop "you" and "a" from the tokens."""
    sentence = "I'll tell you a joke"

    data_field = Field(name="data")
    data_field.add_posttokenize_hook(remove_stopwords("en"))

    example = ExampleFactory([data_field]).from_list([sentence])
    for stopword in ("you", "a"):
        assert stopword not in example["data"][1]
def test_regex_replace():
    """RegexReplace must substitute digit runs and whitespace runs."""
    replacements = [(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")]

    data_field = Field(name="data", tokenizer=None, keep_raw=True)
    data_field.add_pretokenize_hook(RegexReplace(replacements))

    example = ExampleFactory([data_field]).from_list(["This item costs 100$."])

    expected_raw = "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."
    assert expected_raw == example["data"][1]
def test_truecase():
    """The truecase pretokenize hook must capitalize the sentence start.

    Skipped when the `truecase` package is not importable.
    """
    pytest.importorskip("truecase")

    data_field = Field(name="data", tokenizer=None, keep_raw=True)
    data_field.add_pretokenize_hook(truecase())

    example = ExampleFactory([data_field]).from_list(["hey how are you"])
    raw_value = example["data"][0]
    assert "Hey how are you" == raw_value
def test_keyword_extractor(alg, alg_pkg_name):
    """Every word of every extracted keyword must occur in the source text.

    Skipped when the package named by `alg_pkg_name` is not importable.
    """
    pytest.importorskip(alg_pkg_name)

    data_field = Field(name="data", tokenizer=None, keep_raw=True)
    data_field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([data_field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    lowered_text = TEXT.lower()
    keyword_words = (
        word for keyword in example["data"][1] for word in keyword.lower().split()
    )
    assert all(word in lowered_text for word in keyword_words)
def test_delete_cache(data, fields):
    """`delete_cache` must remove the directory passed as `cache_path`."""
    cache_dir = tempfile.mkdtemp()

    factory = ExampleFactory(fields)
    lazy_examples = map(factory.from_list, data)
    disk_dataset = DiskBackedDataset.from_examples(
        fields, lazy_examples, cache_path=cache_dir
    )

    assert os.path.exists(cache_dir)
    disk_dataset.delete_cache()
    assert not os.path.exists(cache_dir)
def test_moses_normalizer():
    """MosesNormalizer as a pretokenize hook must yield the expected text.

    Skipped when the `sacremoses` package is not importable.
    """
    pytest.importorskip("sacremoses")

    data_field = Field(name="data", tokenizer=None, keep_raw=True)
    data_field.add_pretokenize_hook(MosesNormalizer())

    example = ExampleFactory([data_field]).from_list(["What's up!"])
    normalized = example["data"][1]
    assert "What's up!" == normalized
def test_concat_view_override_fields_eager(dataset, fields):
    """An eager-Vocab field override must be finalized and cover both vocabs.

    The "name" field of two datasets is overridden in a DatasetConcatView;
    the override's vocab must equal the union of both source vocabs.
    """
    upper_field = Field(
        "name", pretokenize_hooks=(str.upper,), numericalizer=Vocab()
    )
    other_fields = [fields[0], upper_field]

    factory = ExampleFactory(other_fields)
    other_dataset = Dataset(
        [factory.from_list(row) for row in TEST_DATA], other_fields
    )
    other_dataset.finalize_fields()

    override = Field("override_name_field", numericalizer=Vocab(eager=True))
    concat_view = DatasetConcatView(
        [dataset, other_dataset], field_overrides={"name": override}
    )

    assert concat_view.field_dict["override_name_field"].is_finalized

    merged_tokens = set(concat_view.field_dict["override_name_field"].vocab.itos)
    first_tokens = set(dataset.field_dict["name"].vocab.itos)
    second_tokens = set(other_dataset.field_dict["name"].vocab.itos)
    assert merged_tokens == first_tokens | second_tokens
def test_iterator_batch_as_list():
    """With `disable_batch_matrix=True` a batch field must be a plain list.

    Three examples are iterated with `batch_size=2` and no shuffling; the
    contents of the full first batch and of the trailing one-element batch
    are checked.
    """
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])

        # enumerate() counts from 0, so with three examples and batch_size=2
        # the trailing single-example batch has index 1. Checking index 2
        # would never execute these assertions.
        if i == 1:
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
def test_missing_datatype_exception(data, fields, tmpdir):
    """`from_examples` must raise RuntimeError for an all-None column.

    Every example gets an extra None value for "null_field", and no
    `data_types` argument is passed.
    """
    rows_with_null = [(*row, None) for row in data]

    null_field = Field(
        "null_field",
        keep_raw=True,
        allow_missing_data=True,
        numericalizer=Vocab(),
    )
    extended_fields = [*fields, null_field]

    factory = ExampleFactory(extended_fields)
    lazy_examples = map(factory.from_list, rows_with_null)

    with pytest.raises(RuntimeError):
        DiskBackedDataset.from_examples(
            extended_fields, lazy_examples, cache_path=tmpdir
        )
def test_lemmatization_and_stemming(hook):
    """A lemmatizer/stemmer posttokenize hook must change the input somehow."""
    # we need this to postpone initialization
    # in pytest.mark.parametrize
    posttokenize_hook = hook() if inspect.isfunction(hook) else hook

    text = "stemming playing books"
    data_field = Field(name="data")
    data_field.add_posttokenize_hook(posttokenize_hook)

    example = ExampleFactory([data_field]).from_list([text])

    # we don't check the exact results,
    # instead we expect some modifications
    processed = example["data"][1]
    assert text != processed
def test_hook_conversion():
    """A pretokenize hook is rejected as posttokenize until it is converted.

    After `as_posttokenize_hook`, the hook is accepted and applied to the
    tokens.
    """
    data_field = Field(name="data", tokenizer="split", keep_raw=True)

    pretokenize_hook = TextCleanUp(replace_url="<URL>")
    assert pretokenize_hook.__hook_type__ == HookType.PRETOKENIZE

    # Registering the unconverted hook as posttokenize must fail.
    with pytest.raises(ValueError):
        data_field.add_posttokenize_hook(pretokenize_hook)

    converted_hook = as_posttokenize_hook(pretokenize_hook)
    assert converted_hook.__hook_type__ == HookType.POSTTOKENIZE
    data_field.add_posttokenize_hook(converted_hook)

    example = ExampleFactory([data_field]).from_list(
        ["url to github is https://github.com"]
    )
    tokens = example["data"][1]
    assert tokens == ["url", "to", "github", "is", "<URL>"]
def test_datatype_definition(data, fields):
    """`from_examples` must accept explicit `data_types` for an all-None column.

    Every example gets an extra None value for "null_field", and the pyarrow
    types for that field are passed through `data_types`. The remaining
    columns must still match `data`.
    """
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    datatypes = {"null_field": (pa.string(), pa.list_(pa.string()))}
    dataset = DiskBackedDataset.from_examples(
        fields_null, examples, data_types=datatypes
    )

    try:
        for ex, d in zip(dataset, data_null):
            assert int(ex["number"][0]) == d[0]
            assert ex["tokens"][0] == d[1]
    finally:
        # Clean up the cache even when an assertion above fails, so a
        # failing run does not leave it behind.
        dataset.delete_cache()
def test_from_format():
    """`from_format` must build Examples for both LIST and DICT formats."""
    field_names = ("Name", "Score", "Favorite_food")

    # LIST format: values are matched to fields by position.
    list_data = ["Mark Dark", 5, "Hawaiian pizza"]
    example = ExampleFactory(field_list).from_format(list_data, ExampleFormat.LIST)
    for position, field_name in enumerate(field_names):
        assert example[field_name][0] == list_data[position]

    # DICT format: values are matched to fields by key.
    dict_data = {
        "Name": "Mark Dark",
        "Score": 5,
        "Favorite_food": "Hawaiian pizza",
    }
    example = ExampleFactory(field_dict).from_format(dict_data, ExampleFormat.DICT)
    for field_name in field_names:
        assert example[field_name][0] == dict_data[field_name]
def dataset(fields) -> DatasetBase:
    """Return a finalized Dataset built from TEST_DATA with the given fields."""
    factory = ExampleFactory(fields)
    built = Dataset([factory.from_list(row) for row in TEST_DATA], fields)
    built.finalize_fields()
    return built
def pyarrow_dataset(data, fields):
    """Return a DiskBackedDataset built from `data` with the given fields.

    Examples are produced lazily with `map` and consumed by `from_examples`.
    """
    factory = ExampleFactory(fields)
    lazy_examples = map(factory.from_list, data)
    return DiskBackedDataset.from_examples(fields, lazy_examples)
def __init__(
    self,
    fields: Union[Dict, List],
    example_format: Union[ExampleFormat, str],
    model: Union[AbstractSupervisedModel, Type[AbstractSupervisedModel]],
    trainer: AbstractTrainer = None,
    feature_transformer: Union[
        FeatureTransformer, Callable[[NamedTuple], np.array]
    ] = None,
    label_transform_fn: Callable[[NamedTuple], np.ndarray] = None,
    output_transform_fn: Callable[[np.ndarray], Any] = None,
):
    """
    Creates a new pipeline instance.

    Parameters
    ----------
    fields : dict or list of fields
        Fields used to process raw data. Can be either a dict mapping column
        names to Fields (or tuples of Fields), or a list of Fields (or tuples
        of Fields). A Field value of None means the corresponding column will
        be ignored.

    example_format : ExampleFormat or str
        Format of expected raw examples. An ExampleFormat member is replaced
        by its `value` before being stored.

    model : class or model instance
        Class of the model to be fitted or a pre-trained model.
        If pre-trained model is passed and `fit` is called a new model
        instance will be created. For fine-tuning of the passed model
        instance call `partial_fit`.
        Must be a subclass of Podium's `AbstractSupervisedModel`

    trainer : AbstractTrainer, Optional
        Trainer used to fit the model. If provided, this trainer instance
        will be stored in the pipeline and used as the default trainer if no
        trainer is provided in the `fit` and `partial_fit` methods.

    feature_transformer : Union[FeatureTransformer, Callable[[NamedTuple], np.array]]
        FeatureTransformer that transforms the input part of the batch
        returned by the iterator (the podium "batch" format) into numpy
        array features that can be fed into the model. Will be fitted along
        with the model to the provided data (i.e. during Experiment
        fitting). A callable taking an input batch and returning a numpy
        array of features can also be passed.
        If None, a default feature transformer that returns a single feature
        from the batch will be used. In this case the Dataset used in
        training must contain a single input field.

    label_transform_fn : Callable[[NamedTuple], np.ndarray]
        Callable that transforms the target part of the batch returned by
        the iterator into the same format the model prediction is. For a
        hypothetical perfect model the prediction result of the model for
        some examples must be identical to the result of this callable for
        those same examples.
        If None, a default label transformer that returns a single feature
        from the batch will be used. In this case the Dataset used in
        training must contain a single target field.

    output_transform_fn : Callable[[np.ndarray], Any]
        Callable used to transform the prediction result of the model.

    Raises
    ------
    TypeError
        If `example_format` is LIST, CSV or NLTK and `fields` is not either
        a list or tuple.
        If `example_format` is DICT, XML or JSON and `fields` is not a dict.
    """
    if isinstance(example_format, ExampleFormat):
        example_format = example_format.value

    # Positional formats need a sequence of fields; every other format
    # needs a mapping from column name to field.
    if example_format in (
        ExampleFormat.LIST.value,
        ExampleFormat.CSV.value,
        ExampleFormat.NLTK.value,
    ):
        if not isinstance(fields, (list, tuple)):
            raise TypeError(
                "If `example format` is LIST, CSV or NLTK, `fields` "
                "must be either a list or tuple. "
                f"Type of `fields`: {type(fields)}"
            )
    elif not isinstance(fields, dict):
        raise TypeError(
            "If `example format` is DICT, XML or JSON, `fields` "
            "must be a dict. "
            f"Type of `fields`: {type(fields)}"
        )

    if isinstance(fields, (list, tuple)):
        # `_filter_feature_fields` is given a dict here, so key the fields
        # by position and keep only the surviving values afterwards.
        feature_field_dict = _filter_feature_fields(dict(enumerate(fields)))
        self.feature_fields = list(feature_field_dict.values())
    else:
        self.feature_fields = _filter_feature_fields(fields)

    self.all_fields = fields
    self.example_format = example_format

    # Training examples use every field; prediction examples use only the
    # filtered feature fields.
    self.training_example_factory = ExampleFactory(self.all_fields)
    self.prediction_example_factory = ExampleFactory(self.feature_fields)

    self.output_transform_fn = output_transform_fn

    super().__init__(
        model,
        feature_transformer=feature_transformer,
        trainer=trainer,
        label_transform_fn=label_transform_fn,
    )