Example #1
    def test_read_parquet(self):
        file_path = os.path.join(FILES_PATH, "test.parquet")
        ds = DataSource(format="parquet", source=file_path)

        df = ds.to_dataframe().compute()
        self.assertTrue("reviewerID" in df.columns)
        self.assertTrue("path" in df.columns)
    def test_no_mapping(self):
        ds = DataSource(
            format="json", source=os.path.join(FILES_PATH, "dataset_source.jsonl")
        )
        with pytest.raises(ValueError):
            ds.to_mapped_dataframe()
    def test_flatten_json(self):
        file_path = os.path.join(FILES_PATH, "to-be-flattened.jsonl")
        ds = DataSource(format="json", flatten=True, source=file_path)
        df = ds.to_dataframe().compute()

        for c in ["persons.*.lastName", "persons.*.name"]:
            self.assertIn(c, df.columns, f"Expected {c} as column name")
    def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
        ds = DataSource(
            format="csv",
            source=os.path.join(FILES_PATH, "trailing_coma_in_headers.csv"),
            sep=";",
        )
        df = ds.to_dataframe().compute()
        self.assertIn("name", df.columns)
    def test_read_csv(self):
        file_path = os.path.join(FILES_PATH, "dataset_source.csv")

        datasource = DataSource(format="csv", source=file_path)
        data_frame = datasource.to_dataframe().compute()

        assert len(data_frame) > 0
        self.assertTrue("path" in data_frame.columns)
    def test_flatten_nested_list(self):
        file_path = os.path.join(FILES_PATH, "nested-list.jsonl")

        ds = DataSource(format="json", flatten=True, source=file_path)
        df = ds.to_dataframe().compute()

        for c in [
                "classification.*.origin.*.key",
                "classification.*.origin.*.source"
        ]:
            self.assertIn(c, df.columns, f"Expected {c} as data column")
    def test_add_mock_format(self):
        def ds_parser(*args, **kwargs):
            from dask import dataframe as ddf
            import pandas as pd

            return ddf.from_pandas(
                pd.DataFrame([i for i in range(0, 100)]), npartitions=1
            )

        DataSource.add_supported_format("new-format", ds_parser)
        self.assertIsNotNone(
            DataSource(source="source", format="new-format").to_dataframe().columns
        )
def training_data_source(tmp_path) -> DataSource:
    data_file = tmp_path / "train.json"
    df = pd.DataFrame({
        "text": [
            "This is a simple NER test",
            "This is a simple NER test with misaligned spans",
            "No NER here",
        ],
        "labels": [
            [{"start": 17, "end": 20, "label": "NER"}],
            [{"start": 17, "end": 22, "label": "NER"}],
            [],
        ],
    })
    df.to_json(data_file, lines=True, orient="records")

    return DataSource(source=str(data_file),
                      flatten=False,
                      lines=True,
                      orient="records")
    def test_load_multiple_formats(self):
        files = [
            os.path.join(FILES_PATH, "dataset_source.jsonl"),
            os.path.join(FILES_PATH, "dataset_source.csv"),
        ]
        with pytest.raises(TypeError):
            DataSource(source=files)
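
For reference, a minimal usage sketch of the `DataSource` API exercised by the tests above; the file name and the columns used in the mapping are assumptions, not actual test resources:

ds = DataSource(
    source="reviews.csv",  # hypothetical CSV file
    format="csv",
    sep=";",
    mapping={"label": "overall", "text": "summary"},  # new column name -> source column
)
df = ds.to_dataframe().compute()      # raw columns as found in the file
mapped_df = ds.to_mapped_dataframe()  # dask DataFrame with columns renamed via `mapping`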
Example #10
def test_load_pipeline_with_custom_head():
    config = PipelineConfiguration(
        "test-pipeline",
        head=TaskHeadConfiguration(
            type=MyCustomHead,
            labels=[
                "blue-collar",
                "technician",
                "management",
                "services",
                "retired",
                "admin.",
            ],
        ),
        features=FeaturesConfiguration(),
    )

    pipeline = Pipeline.from_config(config)
    assert isinstance(pipeline.head, MyCustomHead)

    train = DataSource(
        source=os.path.join(TEST_RESOURCES,
                            "resources/data/dataset_source.csv"),
        mapping={
            "label": "job",
            "text": ["education", "marital"]
        },
    )
    output = mkdtemp()
    pipeline.create_vocabulary(VocabularyConfiguration(sources=[train]))
    pipeline.train(output=output, training=train)

    trained_pl = Pipeline.from_pretrained(os.path.join(output, "model.tar.gz"))
    trained_pl.predict("Oh yeah")
    assert isinstance(trained_pl.head, MyCustomHead)
Example #11
    def explore(
        self,
        data_source: DataSource,
        explore_id: Optional[str] = None,
        es_host: Optional[str] = None,
        batch_size: int = 50,
        prediction_cache_size: int = 0,
        explain: bool = False,
        force_delete: bool = True,
        **metadata,
    ) -> dd.DataFrame:
        """Launches the Explore UI for a given data source

        Running this method inside an `IPython` notebook will try to render the UI directly in the notebook.

        Running this outside a notebook will try to launch the standalone web application.

        Parameters
        ----------
        data_source: `DataSource`
            The data source or its yaml file path
        explore_id: `Optional[str]`
            A name or id for this explore run, useful for running and keeping track of several explorations
        es_host: `Optional[str]`
            The URL to the Elasticsearch host for indexing predictions (default is `localhost:9200`)
        batch_size: `int`
            The batch size for indexing predictions (default is `50`)
        prediction_cache_size: `int`
            The size of the cache for caching predictions (default is `0`)
        explain: `bool`
            Whether to extract and return explanations of token importance (default is `False`)
        force_delete: `bool`
            Deletes an exploration with the same `explore_id` before indexing the new explore items (default is `True`)

        Returns
        -------
        explore_df: `dd.DataFrame`
            A dask DataFrame with the predictions used for the exploration
        """
        from ._helpers import _explore, _show_explore

        config = ExploreConfiguration(
            batch_size=batch_size,
            prediction_cache_size=prediction_cache_size,
            explain=explain,
            force_delete=force_delete,
            **metadata,
        )

        es_config = ElasticsearchExplore(
            es_index=explore_id or str(uuid.uuid1()),
            es_host=es_host or constants.DEFAULT_ES_HOST,
        )

        if not data_source.mapping:
            data_source.mapping = self._model._default_ds_mapping
        explore_df = _explore(self, data_source, config, es_config)
        _show_explore(es_config)

        return explore_df
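
A minimal sketch of how `explore` might be called on a loaded pipeline; the model archive and the data source YAML paths are assumptions:

pipeline = Pipeline.from_pretrained("model.tar.gz")  # hypothetical model archive
explore_df = pipeline.explore(
    data_source=DataSource.from_yaml("datasource.yml"),  # hypothetical DataSource definition
    explore_id="my-exploration",
    batch_size=50,
    explain=False,
)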
def train_data_source() -> DataSource:
    source = (Path(__file__).parent.parent / "resources" / "data" /
              "emotions_with_transformers.txt")
    training_ds = DataSource(source=str(source),
                             format="csv",
                             sep=";",
                             names=["text", "label"])

    return training_ds
    def test_to_mapped(self):
        the_mapping = {"label": "overall", "tokens": "summary"}

        for ds in [
            DataSource(
                format="json",
                mapping=the_mapping,
                source=os.path.join(FILES_PATH, "dataset_source.jsonl"),
            ),
            DataSource(
                source=os.path.join(FILES_PATH, "dataset_source.jsonl"),
                mapping=the_mapping,
            ),
        ]:
            df = ds.to_mapped_dataframe()

            self.assertIn("label", df.columns)
            self.assertIn("tokens", df.columns)
Example #14
    def create_dataset(self,
                       datasource: DataSource,
                       lazy: bool = False) -> InstancesDataset:
        """
        Creates an instances torch Dataset from a data source

        Parameters
        ----------
        datasource:
            The source of data
        lazy:
            If enabled, the returned dataset is a subclass of `torch.utils.data.IterableDataset`

        Returns
        -------
        A torch Dataset containing the instances collection
        """
        mapping = {k: k for k in self.inputs + [self.output] if k}
        mapping.update(datasource.mapping)

        datasource.mapping = mapping
        ddf = datasource.to_mapped_dataframe()
        instances_series: "dask.dataframe.core.Series" = ddf.map_partitions(
            lambda df: df.apply(
                lambda row: self.head.featurize(**row.to_dict()), axis=1),
            meta=object,
        ).persist()
        # We remove the non-featurizable examples from the dataset. The head should log a warning for them, though!
        instances_series = instances_series.dropna()

        def build_instance_generator(instances: DataFrame):
            """Configures an instance generator from DataFrame"""
            def instance_generator(path: str) -> Iterable[Instance]:
                yield from instances

            return instance_generator

        if lazy:
            return AllennlpLazyDataset(
                instance_generator=build_instance_generator(instances_series),
                file_path="dummy",
            )
        return AllennlpDataset(list(instances_series.compute()))
Example #15
def test_lazy_dataset_creation(pipeline_test: Pipeline, datasource_test: DataSource):
    df = datasource_test.to_dataframe()
    dataset = pipeline_test.create_dataset(datasource_test, lazy=True)
    assert isinstance(dataset, AllennlpLazyDataset)
    assert len([x for x in dataset]) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
Example #16
def datasource_test(tmp_path) -> DataSource:
    data_file = tmp_path / "classifier.parquet"
    df = pd.DataFrame(
        {
            "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
            "label": ["one", "zero", "zero"],
        }
    )
    df.to_parquet(data_file)

    return DataSource(source=str(data_file))
Example #17
def test_dataset_creation_with_partial_mapping(
    datasource_with_partial_mapping: DataSource, pipeline_test: Pipeline
):
    df = datasource_with_partial_mapping.to_mapped_dataframe()
    dataset = pipeline_test.create_dataset(datasource_with_partial_mapping)
    assert isinstance(dataset, AllennlpDataset)
    assert len(dataset) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
Example #18
def training_data_source(tmp_path) -> DataSource:
    data_file = tmp_path / "relations.json"
    df = pd.DataFrame([
        {
            "text": "The most common audits were about waste and recycling.",
            "entities": [
                {"start": 34, "end": 39, "label": "PN", "text": "waste"},
                {"start": 16, "end": 22, "label": "QTY", "text": "audits"},
            ],
            "label": "Message-Topic(e1,e2)",
        },
        {
            "text": "The company fabricates plastic chairs.",
            "entities": [
                {"start": 4, "end": 11, "label": "OBJECT", "text": "company"},
                {"start": 31, "end": 37, "label": "SUBJECT", "text": "chairs"},
            ],
            "label": "Product-Producer(e2,e1)",
        },
    ])
    df.to_json(data_file, lines=True, orient="records")

    return DataSource(source=str(data_file),
                      flatten=False,
                      lines=True,
                      orient="records")
Example #19
def datasource_with_partial_mapping(tmp_path) -> DataSource:
    data_file = tmp_path / "classifier.parquet"
    df = pd.DataFrame(
        {
            "another_text": [
                "A common text",
                "This is why you get",
                "Seriosly?, I'm not sure",
            ],
            "label": ["one", "zero", "zero"],
        }
    )
    df.to_parquet(data_file)

    return DataSource(source=str(data_file), mapping={"text": "another_text"})
def training_data_source(tmp_path) -> DataSource:
    data_file = tmp_path / "record_pairs.json"
    df = pd.DataFrame({
        "text": [
            "this is a text",
            "my name is dani",
            "this is a table",
            "my name is paco",
        ],
    })
    df.to_json(data_file, lines=True, orient="records")

    return DataSource(source=str(data_file),
                      flatten=False,
                      lines=True,
                      orient="records")
def training_data_source(tmp_path) -> DataSource:
    data_file = tmp_path / "record_pairs.json"
    df = pd.DataFrame({
        "record1": [
            {"@fist_name": "Hans", "@last_name": "Peter"},
            {"@fist_name": "Heinrich", "@last_name": "Meier"},
            {"@fist_name": "Hans", "@last_name": "Peter"},
        ],
        "record2": [
            {"@fist_name": "Hans", "@last_name": "Petre"},
            {"@fist_name": "Heinz", "@last_name": "Meier"},
            {"@fist_name": "Hansel", "@last_name": "Peter"},
        ],
        "label": ["duplicate", "not_duplicate", "duplicate"],
    })
    df.to_json(data_file, lines=True, orient="records")

    return DataSource(source=str(data_file),
                      flatten=False,
                      lines=True,
                      orient="records")
Example #22
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a data source and registers the results in an Elasticsearch index

    Parameters
    ----------
    pipeline
        The pipeline used to generate the predictions
    data_source
        The data source providing the input examples
    config
        The exploration configuration
    elasticsearch
        The Elasticsearch configuration used for indexing the results

    Returns
    -------
    A dask DataFrame with the annotated data
    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(
        npartitions=n_partitions).persist()
    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch,
                                                         meta=(None, object))

    ddf_source = (data_source.to_dataframe().repartition(
        npartitions=n_partitions).persist())
    # Keep only the unused columns as metadata
    ddf_source = ddf_source[[
        c for c in ddf_source.columns if c not in ddf_mapped.columns
    ]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records"))))

    ddf = DaskElasticClient(
        host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True
    ).save(ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()
Example #23
def train_data_source() -> DataSource:
    resources_path = Path(__file__).parent.parent / "resources" / "data"
    training_ds = DataSource(source=str(resources_path /
                                        "business.cat.2k.train.csv"))

    return training_ds
Example #24
def train_valid_data_source() -> Tuple[DataSource, DataSource]:
    resources_path = Path(__file__).parent.parent / "resources" / "data"
    training_ds = DataSource(source=str(resources_path / "business.cat.2k.train.csv"))
    validation_ds = DataSource(source=str(resources_path / "business.cat.2k.valid.csv"))

    return training_ds, validation_ds
Example #25
def explore(pipeline_path: str, data_source: str, explain: bool,
            es_host: str) -> None:
    Pipeline.from_pretrained(pipeline_path).explore(
        data_source=DataSource.from_yaml(data_source),
        es_host=es_host,
        explain=explain)
    def test_override_format(self):
        with pytest.raises(TypeError):
            DataSource(source=os.path.join(FILES_PATH, "*.jsonl"), format="not-found")

    def test_wrong_format(self):
        with pytest.raises(MissingArgumentError):
            DataSource(format="not-found")
        # The format inferred from the source name is not supported either
        with pytest.raises(TypeError):
            DataSource(source="not-found")