# Shared imports for the example snippets below. The pyarrow / pandas / pytest
# lines are standard; the mojap_metadata import paths are assumed from the
# package these examples exercise. Project-specific helpers used further down
# (assert_meta_col_conversion, validate_and_enrich_metadata,
# cast_arrow_table_to_schema, arrow_to_pandas, pa_read_csv_to_pandas,
# pa_read_json_to_pandas) come from the projects the snippets were taken from
# and are not reproduced here.
from typing import Union

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from pyarrow import csv, fs, json

from mojap_metadata import Metadata  # assumed import path
from mojap_metadata.converters import BaseConverterOptions  # assumed import path
from mojap_metadata.converters.arrow_converter import ArrowConverter  # assumed


def test_generate_to_meta():

    struct = pa.struct([
        ("x", pa.timestamp("s")),
        (
            "y",
            pa.struct([
                ("f1", pa.int32()),
                ("f2", pa.string()),
                ("f3", pa.decimal128(3, 5)),
            ]),
        ),
    ])

    example_schema = pa.schema([
        pa.field("a", pa.int64()),
        pa.field("b", pa.string()),
        pa.field("c", struct),
        pa.field("d", pa.list_(pa.int64())),
    ])

    expected_name_type = (
        ("a", "int64"),
        ("b", "string"),
        ("c", "struct<"),
        ("d", "list<"),
    )

    ac = ArrowConverter()
    meta1 = ac.generate_to_meta(arrow_schema=example_schema)

    assert isinstance(meta1, Metadata)

    checks = [
        c["name"] == e[0] and c["type"].startswith(e[1])
        for c, e in zip(meta1.columns, expected_name_type)
    ]
    assert all(checks)

    meta2 = ac.generate_to_meta(
        arrow_schema=example_schema,
        meta_init_dict={
            "name": "test",
            "file_format": "parquet"
        },
    )
    assert isinstance(meta2, Metadata)
    assert meta2.name == "test"
    assert meta2.file_format == "parquet"
    assert meta1.columns == meta2.columns

    # Check warning is raised on columns being overwritten
    with pytest.warns(UserWarning):
        _ = ac.generate_to_meta(
            arrow_schema=example_schema,
            meta_init_dict={"columns": [{
                "name": "stuff",
                "type": "string"
            }]},
        )
def test_converter_accepts_type(meta_type):
    """
    Check that ArrowConverter.convert_col_type accepts every valid meta type.

    If a new type is added to tests.valid_types then it may fail this test.

    Args:
        meta_type (str): name of the metadata type to convert
    """
    ac = ArrowConverter()
    _ = ac.convert_col_type(meta_type)
def _get_arrow_schema(schema: Union[pa.Schema, Metadata, dict]) -> pa.Schema:
    """Coerce a Metadata object or metadata dict to a pyarrow Schema."""
    ac = ArrowConverter()
    if isinstance(schema, Metadata):
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, dict):
        schema = Metadata.from_dict(schema)
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, pa.Schema):
        pass
    else:
        raise TypeError(f"schema type not allowed: {type(schema)}")

    return schema
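# A minimal usage sketch for _get_arrow_schema, assuming the imports above.
# The function name and metadata dict here are illustrative, not part of the
# original snippets.
def _example_get_arrow_schema_usage():
    # A pyarrow Schema passes straight through.
    s1 = _get_arrow_schema(pa.schema([("a", pa.int64())]))

    # A plain dict goes through Metadata.from_dict, then
    # ArrowConverter.generate_from_meta.
    s2 = _get_arrow_schema({
        "name": "example",
        "file_format": "parquet",
        "columns": [{"name": "a", "type": "int64"}],
    })

    assert s1.field("a").type == s2.field("a").type == pa.int64()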
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {"name": "my_double", "type": "float64", "nullable": True},
            {"name": "my_date", "type": "date64"},
            {"name": "my_decimal", "type": "decimal128(10,2)"},
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = ("my_int: int64 not null\nmy_double: double\n"
                   "my_date: date64[ms]\nmy_decimal: decimal(10, 2)")
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"
    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2
    def read(
        self, input_path: str, metadata: Metadata = None, **kwargs
    ) -> pd.DataFrame:
        """
        Reads a Parquet file and returns a Pandas DataFrame
        input_path: File to read either local or S3.
        metadata: A metadata object or dict
        **kwargs (optional): Additional kwargs are passed to the arrow reader
            arrow.parquet.read_table
        """

        arrow_tab = pq.read_table(input_path, **kwargs)

        if metadata:
            meta = validate_and_enrich_metadata(metadata)
            schema = ArrowConverter().generate_from_meta(meta)
            arrow_tab = cast_arrow_table_to_schema(
                arrow_tab,
                schema=schema,
                expect_full_schema=self.expect_full_schema,
            )

        df = arrow_to_pandas(
            arrow_tab,
            pd_boolean=self.pd_boolean,
            pd_integer=self.pd_integer,
            pd_string=self.pd_string,
            pd_date_type=self.pd_date_type,
            pd_timestamp_type=self.pd_timestamp_type,
        )

        return df
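# A hedged usage sketch for the read method above. "reader" stands in for an
# instance of whatever class defines read() (the class is not shown in the
# snippet); the file path and metadata dict are illustrative.
def _example_read_usage(reader):
    # Without metadata: plain pq.read_table followed by arrow_to_pandas.
    df_raw = reader.read("my_table.parquet")

    # With metadata: the arrow table is first cast to the schema generated
    # from the validated and enriched metadata.
    metadata = {
        "name": "my_table",
        "file_format": "parquet",
        "columns": [{"name": "a", "type": "int64"}],
    }
    df_cast = reader.read("my_table.parquet", metadata=metadata)
    return df_raw, df_cast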
def test_meta_to_arrow_type(meta_type, arrow_type):
    assert_meta_col_conversion(
        ArrowConverter, meta_type, arrow_type, expect_raises=None
    )
    # Test round trip
    ac = ArrowConverter()
    round_trip_meta_type = ac.reverse_convert_col_type(arrow_type)
    # reverse always returns non-underscored aliases for bool and list
    meta_type = meta_type.replace("bool_", "bool")
    meta_type = meta_type.replace("list_", "list")

    # utf8 and string are the same
    # pa.string().equals(pa.utf8()) # True
    # So reverse conversion sets pa.utf8() to "string"
    meta_type = meta_type.replace("utf8", "string")

    # finally remove any whitespace
    meta_type = "".join(meta_type.split())
    round_trip_meta_type = "".join(round_trip_meta_type.split())

    assert meta_type == round_trip_meta_type
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {"name": "my_double", "type": "float64", "nullable": True},
            {"name": "my_date", "type": "date64"},
            {"name": "my_decimal", "type": "decimal128(10,2)"},
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these columns as
    # dates but will run validation against strings, expecting values to
    # match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
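# A hedged usage sketch for _parse_data_to_pandas. The path, metadata and
# table_params values are illustrative; the table_params keys mirror the ones
# the function reads ("expect-header", "row-limit", "headers-ignore-case",
# "only-test-cols-in-metadata").
def _example_parse_usage():
    metadata = {
        "name": "example",
        "file_format": "csv",
        "columns": [
            {"name": "id", "type": "int64"},
            {"name": "created_at", "type": "timestamp(s)"},
        ],
    }
    table_params = {"expect-header": True, "headers-ignore-case": True}
    return _parse_data_to_pandas("data/example.csv", table_params, metadata)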