def test_dataset_bad_encoding():
    """Exporting a dataset with known-bad encoding should complete without raising.

    Fix: the original wrote the export into the repository tree
    (tests/data_io/temp_bad_encoding.csv) and deleted it only on success, so a
    failure inside ``csv.from_dataset`` leaked the file. A TemporaryDirectory
    (as used by the other tests in this module) guarantees cleanup.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    bad_encoding_dataset_id = CONFIG["datasets"]["bad_encoding"]
    dataset = client.datasets.by_resource_id(bad_encoding_dataset_id)
    with tempfile.TemporaryDirectory() as tempdir:
        filepath = Path(tempdir) / "temp_bad_encoding.csv"
        # Success criterion is simply "no exception raised" during export.
        csv.from_dataset(dataset, filepath)
def test_dataset_not_streamable(): client = utils.client.create(**CONFIG["toolbox_test_instance"]) sm_dataset_id = CONFIG["datasets"]["broken_schema_mapping_unified_dataset"] dataset = client.datasets.by_resource_id(sm_dataset_id) timestamp = datetime.now().strftime("%Y%m%d%H%M%S") with tempfile.TemporaryDirectory() as tempdir: filepath = Path( tempdir) / f"test_dataset_not_streamable_{timestamp}.csv" # RuntimeError raised: the dataset is not streamable and allow_dataset_refresh=False forbids refreshing it first (the old comment about ValueError/duplicate columns did not match the asserted exception) with pytest.raises(RuntimeError): csv.from_dataset(dataset, filepath, allow_dataset_refresh=False)
def test_dataset_export_csv_delim_error():
    """Using the same character for the CSV delimiter and the flatten
    delimiter is ambiguous and must raise ``ValueError``."""
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset_id = CONFIG["datasets"]["minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(dataset_id)
    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / "test_taxonomy_invalid_delim.csv"
        with pytest.raises(ValueError):
            csv.from_dataset(
                dataset,
                target,
                csv_delimiter="|",
                flatten_delimiter="|",
            )
def test_export_csv_empty_dataset(
    buffer_size: Optional[int],
    nrows: Optional[int],
    csv_delimiter: str,
    flatten_delimiter: str,
):
    """Exporting an empty dataset writes only the header row and the exported
    file round-trips to an empty DataFrame equal to the reference data.

    Fix: the original wrote to ``tempfile.gettempdir()`` and never deleted the
    file, leaking one file per run (and always on assertion failure). A
    TemporaryDirectory — as the sibling tests use — cleans up unconditionally.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    empty_dataset_id = CONFIG["datasets"]["people_0_records"]
    dataset = client.datasets.by_resource_id(empty_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            csv_delimiter=csv_delimiter,
            flatten_delimiter=flatten_delimiter,
        )
        # Load raw export data and sort for comparison.
        compare_to_df = pd.read_csv(
            io.StringIO(EMPTY_TEST_DATA), dtype="object", index_col="id"
        ).sort_index()
        test_df = pd.read_csv(
            filename,
            dtype="object",
            delimiter=csv_delimiter,
            index_col="id",
            quotechar='"',
        ).sort_index()
        assert test_df.equals(compare_to_df)
        # Empty dataset: zero data records should have been written.
        assert records_written == len(compare_to_df)
def test_dataset_export_csv_empty_dataset( buffer_size: Optional[int], nrows: Optional[int], columns: Optional[List[str]], ): client = utils.client.create(**CONFIG["toolbox_test_instance"]) empty_dataset_id = CONFIG["datasets"]["people_0_records"] dataset = client.datasets.by_resource_id(empty_dataset_id) timestamp = datetime.now().strftime("%Y%m%d%H%M%S") with tempfile.TemporaryDirectory() as tempdir: filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv" records_written = csv.from_dataset( dataset, filename, overwrite=True, buffer_size=buffer_size, nrows=nrows, columns=columns, ) # NOTE(review): the export call below is an exact duplicate of the one above — presumably it exercises overwrite=True onto an existing file, but it may be a copy-paste duplicate; confirm intent records_written = csv.from_dataset( dataset, filename, overwrite=True, buffer_size=buffer_size, nrows=nrows, columns=columns, ) # Expected file content: full reference header when no column subset was requested, otherwise just the quoted subset header header_string = (EMPTY_TEST_DATA if columns is None else ",".join( f'"{col}"' for col in columns)) # Load raw export data and sort for comparison. compare_to_df = pd.read_csv(io.StringIO(header_string), dtype="object", index_col="id").sort_index() test_df = pd.read_csv(filename, dtype="object", delimiter=",", index_col="id", quotechar='"').sort_index() assert test_df.equals(compare_to_df) assert records_written == len(compare_to_df)
def test_dataset_overwrite_file(overwrite: bool):
    """Exporting onto a pre-existing file raises ``FileExistsError``.

    Fix: the original filename string lacked the ``f`` prefix, so the literal
    text ``{overwrite}`` appeared in the filename instead of the parameter
    value. Also replaces raw open/write/close with ``Path.write_text``.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"]["minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    with tempfile.TemporaryDirectory() as tempdir:
        filepath = Path(tempdir) / f"test_dataset_overwrite_{overwrite}.csv"
        # Create the file that the export is expected to collide with.
        filepath.write_text("Temporary file")
        # NOTE(review): FileExistsError is asserted unconditionally; presumably
        # the parametrization only supplies overwrite=False — confirm.
        with pytest.raises(FileExistsError):
            csv.from_dataset(
                dataset,
                filepath,
                csv_delimiter=",",
                flatten_delimiter="|",
                overwrite=overwrite,
            )
def test_dataset_invalid_renaming_map(
    columns=["tamr_id", "all_names", "ssn", "last_name", "first_name"],
    column_name_dict={"all_names": "last_name"},
):
    """A rename map that would collide with an existing column name
    (all_names -> last_name, while last_name is also exported) must raise
    ``ValueError``. Defaults are never mutated, so sharing them is safe."""
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset_id = CONFIG["datasets"]["minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(dataset_id)
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    target = Path(tempfile.gettempdir()) / f"test_export_csv_{stamp}.csv"
    # ValueError raised by renaming that would yield duplicate columns
    with pytest.raises(ValueError):
        csv.from_dataset(
            dataset,
            target,
            overwrite=True,
            columns=columns,
            column_name_dict=column_name_dict,
        )
def test_dataset_renaming_csv_columns(
    columns: List[str],
    column_name_dict: Dict[str, str],
):
    """Exported CSV columns are renamed the same way pandas ``rename`` would.

    Fix: the original called ``columns.remove("tamr_id")``, mutating the
    caller-supplied (parametrized) list so the change leaked into subsequent
    test cases sharing that parameter. We now operate on a copy.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"]["minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            columns=columns,
            column_name_dict=column_name_dict,
        )
        # Load raw export data and sort for comparison.
        # Clean up default handling of multi-values where a value is empty
        # (e.g. "|Tuck") since the toolbox does not export in this way
        # (e.g. "Tuck")
        compare_to_df = pd.read_csv(
            io.StringIO(TEST_DATA), dtype="object", index_col="tamr_id"
        ).sort_index()
        compare_to_df["all_names"] = (
            compare_to_df["all_names"].str.strip("|").str.split("|")
        )
        # Sort columns of test data
        if columns is not None:
            # tamr_id was promoted to the index; drop it via a copy rather
            # than mutating the parametrized list in place.
            expected_columns = [c for c in columns if c != "tamr_id"]
            compare_to_df = compare_to_df[expected_columns]
        # CSV renaming should behave the same way as pandas dataframe renaming
        compare_to_df.rename(columns=column_name_dict, inplace=True)
        test_df = pd.read_csv(
            filename,
            dtype="object",
            delimiter=",",
            index_col="tamr_id",
            quotechar='"',
        ).sort_index()
        test_df["all_names"] = test_df["all_names"].str.split("|")
        assert records_written == len(compare_to_df)
        assert test_df.fillna("").isin(compare_to_df.fillna("")).all(axis=0).all()
def test_dataset_export_csv(
    buffer_size: Optional[int],
    nrows: Optional[int],
    csv_delimiter: str,
    flatten_delimiter: str,
    columns: List[str],
):
    """End-to-end export: the written CSV matches the reference TEST_DATA for
    the requested delimiters, row limit, and column subset.

    Fix: the original called ``columns.remove("tamr_id")``, mutating the
    caller-supplied (parametrized) list so the change leaked into subsequent
    test cases sharing that parameter. We now operate on a copy.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"]["minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            csv_delimiter=csv_delimiter,
            flatten_delimiter=flatten_delimiter,
            columns=columns,
        )
        # Load raw export data and sort for comparison.
        # Clean up default handling of multi-values where a value is empty
        # (e.g. "|Tuck") since the toolbox does not export in this way
        # (e.g. "Tuck")
        compare_to_df = pd.read_csv(
            io.StringIO(TEST_DATA), dtype="object", index_col="tamr_id"
        ).sort_index()
        compare_to_df["all_names"] = (
            compare_to_df["all_names"].str.strip("|").str.split("|")
        )
        test_df = pd.read_csv(
            filename,
            dtype="object",
            delimiter=csv_delimiter,
            index_col="tamr_id",
            quotechar='"',
        ).sort_index()
        test_df["all_names"] = test_df["all_names"].str.split(flatten_delimiter)
        # Sort columns of test data
        if columns is not None:
            # Drop tamr_id as it has been promoted to index — via a copy,
            # not an in-place .remove() on the shared parametrized list.
            expected_columns = [c for c in columns if c != "tamr_id"]
            compare_to_df = compare_to_df[expected_columns]
        if nrows is None:
            assert test_df.equals(compare_to_df)
            assert records_written == len(compare_to_df)
        else:
            assert records_written == min(nrows, len(compare_to_df))
            assert len(test_df) == nrows
            # Check that subset of rows are present in comparison dataframe.
            # Must convert Nulls to empty strings since None != None.
            # Perform check first on the rows (axis=0) and then on the series
            # (later pandas versions accept axis=None to condense both axes to
            # a single boolean, but doing it this way supports earlier
            # versions as well).
            assert (
                test_df.fillna("").isin(compare_to_df.fillna("")).all(axis=0).all()
            )