def test_generate_to_meta():
    struct = pa.struct(
        [
            ("x", pa.timestamp("s")),
            (
                "y",
                pa.struct(
                    [
                        ("f1", pa.int32()),
                        ("f2", pa.string()),
                        ("f3", pa.decimal128(3, 5)),
                    ]
                ),
            ),
        ]
    )
    example_schema = pa.schema(
        [
            pa.field("a", pa.int64()),
            pa.field("b", pa.string()),
            pa.field("c", struct),
            pa.field("d", pa.list_(pa.int64())),
        ]
    )
    expected_name_type = (
        ("a", "int64"),
        ("b", "string"),
        ("c", "struct<"),
        ("d", "list<"),
    )

    ac = ArrowConverter()
    meta1 = ac.generate_to_meta(arrow_schema=example_schema)
    assert isinstance(meta1, Metadata)

    checks = [
        c["name"] == e[0] and c["type"].startswith(e[1])
        for c, e in zip(meta1.columns, expected_name_type)
    ]
    assert all(checks)

    meta2 = ac.generate_to_meta(
        arrow_schema=example_schema,
        meta_init_dict={"name": "test", "file_format": "parquet"},
    )
    assert isinstance(meta2, Metadata)
    assert meta2.name == "test"
    assert meta2.file_format == "parquet"
    assert meta1.columns == meta2.columns

    # Check warning is raised on columns being overwritten
    with pytest.warns(UserWarning):
        _ = ac.generate_to_meta(
            arrow_schema=example_schema,
            meta_init_dict={"columns": [{"name": "stuff", "type": "string"}]},
        )
def test_converter_accepts_type(meta_type):
    """
    If a new type is added to tests.valid_types then it may fail this test.

    Args:
        meta_type (str): metadata type name to convert
    """
    ac = ArrowConverter()
    _ = ac.convert_col_type(meta_type)
def _get_arrow_schema(schema: Union[pa.Schema, Metadata, dict]) -> pa.Schema:
    """Resolves the given schema (pyarrow Schema, Metadata or metadata dict) to a pyarrow Schema."""
    ac = ArrowConverter()
    if isinstance(schema, Metadata):
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, dict):
        schema = Metadata.from_dict(schema)
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, pa.Schema):
        pass
    else:
        raise TypeError(f"schema type not allowed: {type(schema)}")
    return schema
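# Illustrative sketch (not part of the original module): _get_arrow_schema accepts
# a pyarrow Schema, a Metadata object, or a plain metadata dict and resolves all
# three to a pyarrow Schema. The dict below is a minimal hypothetical metadata dict.
def _example_get_arrow_schema():
    md_dict = {
        "name": "example",
        "columns": [{"name": "a", "type": "int64"}],
    }
    s1 = _get_arrow_schema(pa.schema([("a", pa.int64())]))
    s2 = _get_arrow_schema(Metadata.from_dict(md_dict))
    s3 = _get_arrow_schema(md_dict)
    # All three should describe a single nullable int64 column named "a"
    return s1, s2, s3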
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = (
        "my_int: int64 not null\nmy_double: double\n"
        "my_date: date64[ms]\nmy_decimal: decimal(10, 2)"
    )
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"

    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2
def read(
    self, input_path: str, metadata: Metadata = None, **kwargs
) -> pd.DataFrame:
    """
    Reads a Parquet file and returns a Pandas DataFrame.

    input_path: File to read, either local or S3.
    metadata: A Metadata object or dict; if given, the table is cast to the
        schema derived from it.
    **kwargs (optional): Additional kwargs are passed to the arrow reader
        pyarrow.parquet.read_table.
    """
    arrow_tab = pq.read_table(input_path, **kwargs)
    if metadata:
        meta = validate_and_enrich_metadata(metadata)
        schema = ArrowConverter().generate_from_meta(meta)
        arrow_tab = cast_arrow_table_to_schema(
            arrow_tab,
            schema=schema,
            expect_full_schema=self.expect_full_schema,
        )

    df = arrow_to_pandas(
        arrow_tab,
        pd_boolean=self.pd_boolean,
        pd_integer=self.pd_integer,
        pd_string=self.pd_string,
        pd_date_type=self.pd_date_type,
        pd_timestamp_type=self.pd_timestamp_type,
    )
    return df
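# Usage sketch (illustrative; the class exposing `read` is not shown in this
# excerpt, so `reader` below stands for an already-constructed instance of it):
#
#     df = reader.read(
#         "s3://my-bucket/data/table.parquet",
#         metadata=my_metadata,          # optional: casts to the metadata schema
#         columns=["id", "created_at"],  # extra kwargs go to pq.read_table
#     )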
def test_meta_to_arrow_type(meta_type, arrow_type):
    assert_meta_col_conversion(
        ArrowConverter, meta_type, arrow_type, expect_raises=None
    )

    # Test round trip
    ac = ArrowConverter()
    round_trip_meta_type = ac.reverse_convert_col_type(arrow_type)

    # reverse always returns non-underscored aliases for bool and list
    meta_type = meta_type.replace("bool_", "bool")
    meta_type = meta_type.replace("list_", "list")

    # utf8 and string are the same type in Arrow:
    # pa.string().equals(pa.utf8())  # True
    # so the reverse conversion maps pa.utf8() to "string"
    meta_type = meta_type.replace("utf8", "string")

    # finally remove any whitespace
    meta_type = "".join(meta_type.split())
    round_trip_meta_type = "".join(round_trip_meta_type.split())

    assert meta_type == round_trip_meta_type
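# Illustrative sketch of the aliasing behaviour described in the comments above:
# the reverse conversion maps pa.utf8() / pa.string() back to the "string"
# metadata name and pa.bool_() back to "bool" (expected values shown inline).
def _example_reverse_aliases():
    ac = ArrowConverter()
    return (
        ac.reverse_convert_col_type(pa.utf8()),   # expected: "string"
        ac.reverse_convert_col_type(pa.bool_()),  # expected: "bool"
    )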
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2),
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names
    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """

    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators still treat these columns as dates but
    # run the validation against strings, expecting values to match a
    # timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(
                    newlines_in_values=True, column_names=meta_col_names
                )

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
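# Illustrative, standalone sketch (not part of the original module) of the
# "read timestamps as strings" trick used above: pyarrow's CSV reader is given
# explicit string column types via ConvertOptions, so date-like values stay as
# raw strings for validation instead of being parsed into timestamps.
def _example_force_string_read():
    import io

    import pyarrow as pa
    from pyarrow import csv as pa_csv

    data = io.BytesIO(b"id,created_at\n1,2021-01-01 10:00:00\n")
    co = pa_csv.ConvertOptions(
        column_types=pa.schema([("created_at", pa.string())])
    )
    tab = pa_csv.read_csv(data, convert_options=co)
    # tab.schema -> id: int64, created_at: string
    return tab.schema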