# Example 1
def test_dataset_bad_encoding():
    """Export a dataset whose records contain badly-encoded values.

    Passes if ``csv.from_dataset`` completes without raising. The export is
    written into a temporary directory so the file is always cleaned up —
    the previous version wrote into the repo tree and leaked the file
    whenever the export raised before ``os.remove`` ran.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    bad_encoding_dataset_id = CONFIG["datasets"]["bad_encoding"]
    dataset = client.datasets.by_resource_id(bad_encoding_dataset_id)

    with tempfile.TemporaryDirectory() as tempdir:
        filepath = Path(tempdir) / "temp_bad_encoding.csv"
        csv.from_dataset(dataset, filepath)
# Example 2
def test_dataset_not_streamable():
    """A non-streamable dataset cannot be exported when refresh is disallowed.

    ``csv.from_dataset`` should raise rather than refreshing the dataset,
    since ``allow_dataset_refresh=False`` is passed.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"]["broken_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    with tempfile.TemporaryDirectory() as tempdir:
        filepath = Path(
            tempdir) / f"test_dataset_not_streamable_{timestamp}.csv"

        # RuntimeError raised because the dataset is not streamable and
        # allow_dataset_refresh=False forbids refreshing it to a streamable
        # state first.
        with pytest.raises(RuntimeError):
            csv.from_dataset(dataset, filepath, allow_dataset_refresh=False)
# Example 3
def test_dataset_export_csv_delim_error():
    """Using the same delimiter for CSV fields and for flattening multi-values
    is ambiguous and must raise ValueError."""
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"][
        "minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)

    with tempfile.TemporaryDirectory() as tempdir:
        # Filename now matches this test (the old "test_taxonomy_..." name
        # was a copy-paste leftover from a taxonomy test).
        filepath = Path(tempdir) / "test_dataset_invalid_delim.csv"

        with pytest.raises(ValueError):
            csv.from_dataset(dataset,
                             filepath,
                             csv_delimiter="|",
                             flatten_delimiter="|")
# Example 4
def test_export_csv_empty_dataset(
    buffer_size: Optional[int],
    nrows: Optional[int],
    csv_delimiter: str,
    flatten_delimiter: str,
):
    """Exporting an empty dataset writes a header-only CSV and reports 0 rows.

    Parameters are supplied by parametrization (decorator not visible in this
    chunk). The export now goes into a TemporaryDirectory — the previous
    version wrote into ``tempfile.gettempdir()`` and leaked the file.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    empty_dataset_id = CONFIG["datasets"]["people_0_records"]
    dataset = client.datasets.by_resource_id(empty_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            csv_delimiter=csv_delimiter,
            flatten_delimiter=flatten_delimiter,
        )

        # Load raw export data and sort for comparison.
        compare_to_df = pd.read_csv(io.StringIO(EMPTY_TEST_DATA),
                                    dtype="object",
                                    index_col="id").sort_index()

        test_df = pd.read_csv(filename,
                              dtype="object",
                              delimiter=csv_delimiter,
                              index_col="id",
                              quotechar='"').sort_index()

    assert test_df.equals(compare_to_df)
    assert records_written == len(compare_to_df)
# Example 5
def test_dataset_export_csv_empty_dataset(
    buffer_size: Optional[int],
    nrows: Optional[int],
    columns: Optional[List[str]],
):
    """Export an empty dataset and verify a header-only CSV results.

    When ``columns`` is given, the exported header must contain exactly those
    columns; otherwise the full ``EMPTY_TEST_DATA`` header is expected.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    empty_dataset_id = CONFIG["datasets"]["people_0_records"]
    dataset = client.datasets.by_resource_id(empty_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            columns=columns,
        )
        # NOTE(review): this call repeats the one above with identical
        # arguments — presumably to confirm overwrite=True permits
        # re-exporting onto an existing file; confirm the duplication
        # is intentional.
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            columns=columns,
        )

        # Expected header: the raw empty-dataset fixture, or the quoted,
        # comma-joined requested columns.
        header_string = (EMPTY_TEST_DATA if columns is None else ",".join(
            f'"{col}"' for col in columns))

        # Load raw export data and sort for comparison.
        compare_to_df = pd.read_csv(io.StringIO(header_string),
                                    dtype="object",
                                    index_col="id").sort_index()

        test_df = pd.read_csv(filename,
                              dtype="object",
                              delimiter=",",
                              index_col="id",
                              quotechar='"').sort_index()

    assert test_df.equals(compare_to_df)
    assert records_written == len(compare_to_df)
# Example 6
def test_dataset_overwrite_file(overwrite: bool):
    """Exporting onto an existing file must raise FileExistsError.

    A placeholder file is created first; ``csv.from_dataset`` is then
    expected to refuse to write over it. NOTE(review): this presumably only
    holds when the parametrization supplies ``overwrite=False`` — confirm
    against the (not visible) parametrize decorator.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"][
        "minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)

    with tempfile.TemporaryDirectory() as tempdir:
        # Bug fix: the original literal was missing the f-prefix, so the
        # filename contained the literal text "{overwrite}".
        filepath = Path(tempdir) / f"test_dataset_overwrite_{overwrite}.csv"

        # Pre-create the file so the export has something to collide with.
        with open(filepath, "w") as f:
            f.write("Temporary file")

        with pytest.raises(FileExistsError):
            csv.from_dataset(dataset,
                             filepath,
                             csv_delimiter=",",
                             flatten_delimiter="|",
                             overwrite=overwrite)
# Example 7
def test_dataset_invalid_renaming_map(
    columns=None,
    column_name_dict=None,
):
    """Renaming that would yield duplicate output columns must raise ValueError.

    The default mapping renames "all_names" to "last_name" while "last_name"
    is also among the exported columns, producing a duplicate.
    """
    # Avoid mutable default arguments (shared across calls); the effective
    # defaults are identical to the original literals.
    if columns is None:
        columns = ["tamr_id", "all_names", "ssn", "last_name", "first_name"]
    if column_name_dict is None:
        column_name_dict = {"all_names": "last_name"}

    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"][
        "minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = Path(tempfile.gettempdir()) / f"test_export_csv_{timestamp}.csv"

    # ValueError raised by renaming that would yield duplicate columns
    with pytest.raises(ValueError):
        csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            columns=columns,
            column_name_dict=column_name_dict,
        )
# Example 8
def test_dataset_renaming_csv_columns(
    columns: List[str],
    column_name_dict: Dict[str, str],
):
    """Column selection plus renaming in the CSV export should match pandas'
    ``DataFrame.rename`` semantics on the reference data.

    Parameters are supplied by parametrization (decorator not visible here).
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"][
        "minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            columns=columns,
            column_name_dict=column_name_dict,
        )

        # Load raw export data and sort for comparison.
        # Clean up default handling of multi-values where a value is empty
        # (e.g. "|Tuck") since the toolbox does not export in this way
        # (e.g. "Tuck")
        compare_to_df = pd.read_csv(io.StringIO(TEST_DATA),
                                    dtype="object",
                                    index_col="tamr_id").sort_index()
        compare_to_df["all_names"] = compare_to_df["all_names"].str.strip(
            "|").str.split("|")

        # Sort columns of test data. Filter rather than list.remove() so the
        # parametrize-supplied list is not mutated in place (tamr_id is
        # dropped because it was promoted to the index).
        if columns is not None:
            compare_columns = [c for c in columns if c != "tamr_id"]
            compare_to_df = compare_to_df[compare_columns]
        # CSV renaming should behave the same way as pandas dataframe renaming
        compare_to_df.rename(columns=column_name_dict, inplace=True)

        test_df = pd.read_csv(filename,
                              dtype="object",
                              delimiter=",",
                              index_col="tamr_id",
                              quotechar='"').sort_index()
    test_df["all_names"] = test_df["all_names"].str.split("|")

    assert records_written == len(compare_to_df)
    assert test_df.fillna("").isin(compare_to_df.fillna("")).all(axis=0).all()
# Example 9
def test_dataset_export_csv(
    buffer_size: Optional[int],
    nrows: Optional[int],
    csv_delimiter: str,
    flatten_delimiter: str,
    columns: List[str],
):
    """Round-trip a dataset export to CSV and compare against reference data.

    Covers delimiter options, row limits (``nrows``), and column selection.
    Parameters are supplied by parametrization (decorator not visible here).
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    sm_dataset_id = CONFIG["datasets"][
        "minimal_schema_mapping_unified_dataset"]
    dataset = client.datasets.by_resource_id(sm_dataset_id)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    with tempfile.TemporaryDirectory() as tempdir:
        filename = Path(tempdir) / f"test_export_csv_{timestamp}.csv"
        records_written = csv.from_dataset(
            dataset,
            filename,
            overwrite=True,
            buffer_size=buffer_size,
            nrows=nrows,
            csv_delimiter=csv_delimiter,
            flatten_delimiter=flatten_delimiter,
            columns=columns,
        )

        # Load raw export data and sort for comparison.
        # Clean up default handling of multi-values where a value is empty
        # (e.g. "|Tuck") since the toolbox does not export in this way
        # (e.g. "Tuck")
        compare_to_df = pd.read_csv(io.StringIO(TEST_DATA),
                                    dtype="object",
                                    index_col="tamr_id").sort_index()
        compare_to_df["all_names"] = compare_to_df["all_names"].str.strip(
            "|").str.split("|")

        test_df = pd.read_csv(filename,
                              dtype="object",
                              delimiter=csv_delimiter,
                              index_col="tamr_id",
                              quotechar='"').sort_index()
    test_df["all_names"] = test_df["all_names"].str.split(flatten_delimiter)

    # Sort columns of test data. Filter rather than list.remove() so the
    # parametrize-supplied list is not mutated in place (tamr_id is dropped
    # because it has been promoted to the index).
    if columns is not None:
        compare_columns = [c for c in columns if c != "tamr_id"]
        compare_to_df = compare_to_df[compare_columns]

    if nrows is None:
        assert test_df.equals(compare_to_df)
        assert records_written == len(compare_to_df)
    else:
        assert records_written == min(nrows, len(compare_to_df))
        assert len(test_df) == nrows
        # Check that subset of rows are present in comparison dataframe
        # Must convert Nulls to empty strings since None!=None
        # perform check first on the rows (axis=0) and then on the series
        # (note in later versions checking of pandas checking both axes
        # separately to condense to a single boolean can be done by passing
        # simply axis=None, but performing in this way supports earlier
        # version as well)
        assert test_df.fillna("").isin(
            compare_to_df.fillna("")).all(axis=0).all()