def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = (
        "my_int: int64 not null\nmy_double: double\n"
        "my_date: date64[ms]\nmy_decimal: decimal(10, 2)"
    )
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"
    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2

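# Illustration only (not part of the original test): the schema asserted via
# schema_str1 above could equally be built explicitly with pyarrow, which makes
# the nullable=False -> "not null" mapping visible without relying on the
# to_string() rendering. Only the pyarrow API and the column names from the
# test's metadata are used here.
import pyarrow as pa

expected_schema1 = pa.schema(
    [
        pa.field("my_int", pa.int64(), nullable=False),
        pa.field("my_double", pa.float64()),
        pa.field("my_date", pa.date64()),
        pa.field("my_decimal", pa.decimal128(10, 2)),
    ]
)
# Schema.to_string() renders each field as "name: type", appending "not null"
# for non-nullable fields, which is what schema_str1 asserts against.
assert expected_schema1.field("my_int").nullable is False
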
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2),
    ]
    assert schema1.names == expected_names
    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))
    assert schema2.names == expected_names
    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"

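# Rough sketch of the behaviour both tests exercise: a converter that maps the
# metadata type strings to pyarrow types and, by default, leaves partition
# columns out of the generated schema (generate_from_meta(md) drops them,
# generate_from_meta(md, False) keeps them). This is an illustration assuming a
# simple lookup table plus regex parsing; it is NOT the ArrowConverter
# implementation, and the drop_partitions parameter name is a placeholder for
# the positional flag used in the tests.
import re
import pyarrow as pa


def _sketch_generate_schema(meta: dict, drop_partitions: bool = True) -> pa.Schema:
    simple_types = {"int64": pa.int64(), "float64": pa.float64(), "date64": pa.date64()}
    partitions = meta.get("partitions", [])
    fields = []
    for col in meta["columns"]:
        if drop_partitions and col["name"] in partitions:
            continue
        t = col["type"]
        if t in simple_types:
            pa_type = simple_types[t]
        elif m := re.fullmatch(r"decimal128\((\d+),\s*(\d+)\)", t):
            pa_type = pa.decimal128(int(m.group(1)), int(m.group(2)))
        elif m := re.fullmatch(r"timestamp\((\w+)\)", t):
            pa_type = pa.timestamp(m.group(1))
        else:
            raise ValueError(f"Unhandled type string: {t}")
        fields.append(pa.field(col["name"], pa_type, nullable=col.get("nullable", True)))
    return pa.schema(fields)
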
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """

    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these as dates but will
    # run validation against strings for cols expecting values to match a
    # timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load with newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                # NOTE: in pyarrow, column_names belongs to csv.ReadOptions rather
                # than csv.ParseOptions, so this branch may need a ReadOptions
                # object instead.
                po = csv.ParseOptions(
                    newlines_in_values=True, column_names=meta_col_names
                )

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df

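# Hedged usage sketch for _parse_data_to_pandas. The file path, table name, and
# column values below are hypothetical and assume the metadata dict is valid for
# Metadata.from_dict; the dictionary keys shown (file_format, columns,
# partitions, expect-header, row-limit, headers-ignore-case,
# only-test-cols-in-metadata) are the ones the function above actually reads.
if __name__ == "__main__":
    example_metadata = {
        "name": "example_table",  # hypothetical table
        "file_format": "csv",
        "columns": [
            {"name": "id", "type": "int64", "type_category": "integer"},
            {"name": "created_at", "type": "timestamp(s)", "type_category": "timestamp"},
        ],
        "partitions": [],
    }
    example_table_params = {
        "expect-header": True,
        "headers-ignore-case": True,
        "only-test-cols-in-metadata": True,
    }
    df = _parse_data_to_pandas(
        "data/example_table.csv",  # hypothetical local path; s3:// paths also work
        example_table_params,
        example_metadata,
    )
    print(df.dtypes)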