def test_flatten_json(self):
    """Check that loading the fixture with ``flatten=True`` yields the ``persons.*`` columns."""
    source = DataSource(
        format="json",
        flatten=True,
        path=os.path.join(FILES_PATH, "to-be-flattened.jsonl"),
    )
    columns = source.to_dataframe().compute().columns
    for expected in ("persons.*.lastName", "persons.*.name"):
        self.assertIn(expected, columns, f"Expected {expected} as column name")
def test_read_parquet(self):
    """Check that the parquet fixture loads with ``reviewerID`` and ``path`` columns."""
    file_path = os.path.join(FILES_PATH, "test.parquet")
    ds = DataSource(format="parquet", path=file_path)
    df = ds.to_dataframe().compute()
    # assertIn names the missing column and the container on failure,
    # whereas assertTrue(x in y) only reports "False is not true".
    self.assertIn("reviewerID", df.columns)
    self.assertIn("path", df.columns)
def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
    """Check that the ``;``-separated CSV fixture is loaded with a ``name`` column."""
    csv_path = os.path.join(TEST_RESOURCES, "trailing_coma_in_headers.csv")
    reader_options = {"sep": ";"}
    source = DataSource(format="csv", source=csv_path, attributes=reader_options)
    loaded = source.to_dataframe().compute()
    self.assertIn("name", loaded.columns)
def test_read_csv(self):
    """Check that the CSV fixture loads as a non-empty frame with a ``path`` column."""
    file_path = os.path.join(TEST_RESOURCES, "dataset_source.csv")
    datasource = DataSource(format="csv", path=file_path)
    data_frame = datasource.to_dataframe().compute()
    # Use TestCase assertions rather than a bare ``assert``: they are not
    # stripped under ``python -O`` and they report the compared values.
    self.assertGreater(len(data_frame), 0)
    self.assertIn("path", data_frame.columns)
def test_read_json(self):
    """Check that the JSON-lines fixture loads as a non-empty frame with a ``path`` column."""
    file_path = os.path.join(FILES_PATH, "dataset_source.jsonl")
    datasource = DataSource(format="json", path=file_path)
    data_frame = datasource.to_dataframe().compute()
    # Use TestCase assertions rather than a bare ``assert``: they are not
    # stripped under ``python -O`` and they report the compared values.
    self.assertGreater(len(data_frame), 0)
    self.assertIn("path", data_frame.columns)
def test_read_excel(self):
    """Check that the ``.xlsx`` fixture loads as a non-empty frame with a ``path`` column."""
    file_path = os.path.join(FILES_PATH, "test.xlsx")
    datasource = DataSource(format="xlsx", path=file_path)
    data_frame = datasource.to_dataframe().compute()
    # Use TestCase assertions rather than a bare ``assert``: they are not
    # stripped under ``python -O`` and they report the compared values.
    self.assertGreater(len(data_frame), 0)
    self.assertIn("path", data_frame.columns)
def test_flatten_nested_list(self):
    """Check that ``flatten=True`` on the nested-list fixture yields the nested ``origin`` columns."""
    source = DataSource(
        format="json",
        flatten=True,
        path=os.path.join(FILES_PATH, "nested-list.jsonl"),
    )
    columns = source.to_dataframe().compute().columns
    expected_columns = (
        "classification.*.origin.*.key",
        "classification.*.origin.*.source",
    )
    for expected in expected_columns:
        self.assertIn(expected, columns, f"Expected {expected} as data column")
def test_add_mock_format(self):
    """Register a ``new-format`` parser and load through it by ``format`` and by ``source``."""

    def ds_parser(*args, **kwargs):
        # Stub parser: ignores its arguments and returns a single-partition
        # dask dataframe holding the integers 0..99.
        from dask import dataframe as ddf
        import pandas as pd

        return ddf.from_pandas(pd.DataFrame(list(range(100))), npartitions=1)

    DataSource.add_supported_format("new-format", ds_parser)
    for ds in [DataSource(format="new-format"), DataSource(source="new-format")]:
        # assertIsNotNone reports the offending value on failure, unlike
        # assertFalse(x is None).
        self.assertIsNotNone(ds.to_dataframe().columns)
def test_load_multiple_formats(self):
    """Check that a source list mixing ``.jsonl`` and ``.csv`` files raises ``TypeError``."""
    mixed_sources = [
        os.path.join(FILES_PATH, file_name)
        for file_name in ("dataset_source.jsonl", "dataset_source.csv")
    ]
    with pytest.raises(TypeError):
        DataSource(source=mixed_sources)
def test_to_mapped(self):
    """Check that the mapped dataframe and mapped bag expose ``label`` and ``tokens``.

    The same mapping is applied to a source built with an explicit ``format``
    and to one built from ``source`` alone.
    """
    jsonl_path = os.path.join(FILES_PATH, "dataset_source.jsonl")
    column_mapping = {"label": "overall", "tokens": "summary"}
    by_format = DataSource(format="json", mapping=column_mapping, path=jsonl_path)
    by_source = DataSource(source=jsonl_path, mapping=column_mapping)

    for source in (by_format, by_source):
        mapped_frame = source.to_mapped_dataframe()
        self.assertIn("label", mapped_frame.columns)
        self.assertIn("tokens", mapped_frame.columns)

        first_record = source.to_mapped_bag().take(1)[0]
        self.assertIn("label", first_record)
        self.assertIn("tokens", first_record)
def test_override_format(self):
    """Check that an unknown explicit ``format`` raises ``TypeError`` for a ``*.jsonl`` source."""
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        DataSource(
            source=os.path.join(FILES_PATH, "*.jsonl"),
            format="not-found",
        )
def test_no_mapping(self):
    """Check that ``to_mapped_dataframe`` raises ``ValueError`` when no mapping was given."""
    jsonl_path = os.path.join(FILES_PATH, "dataset_source.jsonl")
    unmapped_source = DataSource(format="json", path=jsonl_path)
    with pytest.raises(ValueError):
        unmapped_source.to_mapped_dataframe()
def test_wrong_format(self):
    """Check that ``not-found`` raises ``TypeError`` both as ``format`` and as ``source``."""
    # First the explicit ``format`` keyword, then the same value passed as
    # ``source`` (the "new format" call style).
    for bad_kwargs in ({"format": "not-found"}, {"source": "not-found"}):
        with pytest.raises(TypeError):
            DataSource(**bad_kwargs)