def test_flatten_json(self):
    file_path = os.path.join(FILES_PATH, "to-be-flattened.jsonl")
    ds = DataSource(format="json", flatten=True, source=file_path)
    df = ds.to_dataframe().compute()
    for c in ["persons.*.lastName", "persons.*.name"]:
        self.assertIn(c, df.columns, f"Expected {c} as column name")
def test_read_parquet(self):
    file_path = os.path.join(FILES_PATH, "test.parquet")
    ds = DataSource(format="parquet", source=file_path)
    df = ds.to_dataframe().compute()
    self.assertIn("reviewerID", df.columns)
    self.assertIn("path", df.columns)
def test_read_csv(self):
    file_path = os.path.join(FILES_PATH, "dataset_source.csv")
    datasource = DataSource(format="csv", source=file_path)
    data_frame = datasource.to_dataframe().compute()
    self.assertGreater(len(data_frame), 0)
    self.assertIn("path", data_frame.columns)
def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
    ds = DataSource(
        format="csv",
        source=os.path.join(FILES_PATH, "trailing_coma_in_headers.csv"),
        sep=";",
    )
    df = ds.to_dataframe().compute()
    self.assertIn("name", df.columns)
def test_lazy_dataset_creation(pipeline_test: Pipeline, datasource_test: DataSource):
    df = datasource_test.to_dataframe()
    dataset = pipeline_test.create_dataset(datasource_test, lazy=True)

    assert isinstance(dataset, AllennlpLazyDataset)
    assert len([x for x in dataset]) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
def test_flatten_nested_list(self):
    file_path = os.path.join(FILES_PATH, "nested-list.jsonl")
    ds = DataSource(format="json", flatten=True, source=file_path)
    df = ds.to_dataframe().compute()
    for c in ["classification.*.origin.*.key", "classification.*.origin.*.source"]:
        self.assertIn(c, df.columns, f"Expected {c} as data column")
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """Executes a pipeline prediction over a data source and registers the results in an Elasticsearch index.

    Parameters
    ----------
    pipeline
        The pipeline used to make the predictions
    data_source
        The data source over which the predictions are made
    config
        The exploration configuration
    elasticsearch
        The Elasticsearch wrapper where the exploration data is registered

    Returns
    -------
    ddf
        A persisted dask DataFrame including the annotations and metadata
    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long, ...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(npartitions=n_partitions).persist()

    ddf_mapped["annotation"] = ddf_mapped.map_partitions(
        annotate_batch, meta=(None, object)
    )

    ddf_source = (
        data_source.to_dataframe().repartition(npartitions=n_partitions).persist()
    )
    # Keep only the columns that were not used as pipeline inputs; they are stored as metadata
    ddf_source = ddf_source[
        [c for c in ddf_source.columns if c not in ddf_mapped.columns]
    ]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records")))
    )

    ddf = DaskElasticClient(
        host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True
    ).save(ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record(
        {
            **(config.metadata or {}),
            "datasource": data_source.source,
            # TODO: This should change when ui is normalized (action detail and action link naming)
            "explore_name": elasticsearch.es_index,
            "model": pipeline.name,
            "columns": ddf.columns.values.tolist(),
            "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
            "pipeline": pipeline.type_name,
            "output": pipeline.output,
            "inputs": pipeline.inputs,
            # backward compatibility
            "signature": pipeline.inputs + [pipeline.output],
            "predict_signature": pipeline.inputs,
            "labels": pipeline.head.labels,
            "task": pipeline.head.task_name().as_string(),
        }
    )

    return ddf.persist()