def test_concat_timestamp():
    """Concatenating tables with us- and ns-resolution timestamps should
    upcast the combined column to ``timestamp[ns]`` while preserving values."""
    table_us = pa.Table.from_arrays(
        [pa.array(['2020-01-31', '2020-01-31']).cast('timestamp[us]')],
        names=['ts'])
    table_ns = pa.Table.from_arrays(
        [pa.array(['2020-12-31', '2020-12-31']).cast('timestamp[ns]')],
        names=['ts'])
    combined = vaex.concat(
        [vaex.from_arrow_table(table_us), vaex.from_arrow_table(table_ns)])
    expected = table_us['ts'].to_pylist() + table_ns['ts'].to_pylist()
    assert combined.ts.tolist() == expected
    # the coarser us resolution must have been promoted to ns
    assert combined.ts.dtype.internal == pa.timestamp('ns')
def read_file(path, convert=True, **kwargs):
    """Reads a generic spatial file.

    Parameters:
        path (string): The spatial file full path.
        convert (bool|string): Exports to arrow file when convert is a path.
            If True, ``arrow_path = path+'.arrow'``.
        **kwargs: Extra keyword arguments.

    Returns:
        (object) A GeoDataFrame object.
    """
    if not convert:
        # No conversion requested: read everything into an in-memory table.
        table = pa.concat_tables(
            geovaex.io.to_arrow_table(path, **kwargs), promote=False)
        if table.schema.metadata is not None and b'geovaex version' in table.schema.metadata.keys():
            df = from_arrow_spatial_table(table)
            # Only treat the file as spatial when at least one geometry
            # entry is non-null.
            has_geometry = df.geometry.get_raw_geometry().null_count != len(
                df.geometry)
            if has_geometry:
                return df
            table = table.drop(['geometry'])
        warnings.warn('Not a spatial file. Returning a Vaex DataFrame.')
        df = from_arrow_table(table).copy()
        return df
    # BUG FIX: ``convert`` may be a string holding the target arrow path.
    # Previously any truthy value (including a user-supplied path) took the
    # derived-name branch, silently ignoring the custom path; the ``else``
    # arm was unreachable because falsy ``convert`` returns above.
    arrow_file = os.path.splitext(path)[0] + '.arrow' if convert is True else convert
    to_arrow(path, arrow_file, **kwargs)
    return open(arrow_file)
def test_partitioning_write_hdf5():
    """Partitioned hdf5 export using a custom directory/file template."""
    # BUG FIX: clean the directory this test actually writes to; previously
    # 'parquet_dataset_partitioned_vaex' was removed instead, so stale files
    # from earlier runs could break the glob counts below.
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_vaex_my_choice',
                  ignore_errors=True)
    df = vaex.from_arrow_table(table)
    df.export_partitioned(
        data_path /
        'parquet_dataset_partitioned_vaex_my_choice/{subdir}/{i}.hdf5',
        ['country'])
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex_my_choice/*/*.hdf5'))
    ) == 3  # 3 unique values
    # exactly one chunk file (index 0, 1 or 2) per country partition
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex_my_choice/country=US/[012].hdf5'
                ))) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex_my_choice/country=NL/[012].hdf5'
                ))) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex_my_choice/country=FR/[012].hdf5'
                ))) == 1
def test_partitioning_write_parquet():
    """Hive-style partitioned parquet export and round-trip via vaex.open."""
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_vaex',
                  ignore_errors=True)
    df = vaex.from_arrow_table(table)
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_vaex',
                          ['country', 'year'])
    df = vaex.open(data_path / 'parquet_dataset_partitioned_vaex',
                   partitioning="hive")
    assert len(
        glob.glob(
            str(data_path / 'parquet_dataset_partitioned_vaex/*/*/*.parquet'))
    ) == 5  # 5 unique values
    # one parquet file per (country, year) combination
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex/country=US/year=2020/*.parquet'
                ))) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_vaex/country=NL/year=2020/*.parquet'
                ))) == 1
    # partition columns are reconstructed from the directory names, so only
    # set equality is meaningful (row order is not preserved)
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)
def open(path):
    """Opens an arrow spatial file.

    Parameters:
        path (string): The file's full path.

    Returns:
        (object) A GeoDataFrame object.
    """
    source = pa.memory_map(path)
    try:
        # Prefer the streaming IPC format first.
        stream_reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # Not a stream: fall back to the file format. That reader is not
        # iterable, so collect its record batches by index.
        file_reader = pa.ipc.open_file(source)
        batches = [
            file_reader.get_batch(i)
            for i in range(file_reader.num_record_batches)
        ]
    else:
        # A stream reader iterates over its batches directly.
        batches = stream_reader
    table = pa.Table.from_batches(batches)
    metadata = table.schema.metadata
    if metadata is not None and b'geovaex version' in metadata.keys():
        print(f"Opened file {os.path.basename(path)}, "
              f"created by geovaex v{metadata[b'geovaex version'].decode()} "
              f"using {metadata[b'driver'].decode()} driver.")
        df = from_arrow_spatial_table(table)
        # Spatial only if at least one geometry entry is non-null.
        has_geometry = df.geometry.get_raw_geometry().null_count != len(
            df.geometry)
        if has_geometry:
            return df
        table = table.drop(['geometry'])
    warnings.warn('Not a spatial arrow file. Returning a Vaex DataFrame.')
    df = from_arrow_table(table).copy()
    return df
def test_arrow_write_table(tmpdir, as_stream):
    """Round-trip an arrow table through export_arrow and vaex.open."""
    target = str(tmpdir.join('test.arrow'))
    vaex.from_arrow_table(table).export_arrow(target, as_stream=as_stream)
    reloaded = vaex.open(target)
    assert 'col1' in reloaded
def test_partitioning_write_directory():
    """Directory-style (non-hive) partitioned export with one and two keys."""
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_directory1',
                  ignore_errors=True)
    shutil.rmtree(data_path / 'parquet_dataset_partitioned_directory2',
                  ignore_errors=True)
    # NOTE: removed an unused single-key ``partitioning`` assignment that was
    # overwritten by the two-key schema below before ever being used.
    df = vaex.from_arrow_table(table)
    # Single key: directories are bare partition values ('{value}').
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_directory1',
                          ['country'],
                          directory_format='{value}')
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory1/*/*.parquet'))
    ) == 3  # 3 unique values
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory1/US/*.parquet'))) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory1/NL/*.parquet'))) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory1/FR/*.parquet'))) == 1
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)
    # now with 2 keys
    partitioning = pa.dataset.partitioning(
        pa.schema([("year", pa.int64()), ("country", pa.string())]))
    df.export_partitioned(data_path / 'parquet_dataset_partitioned_directory2',
                          ['year', 'country'],
                          directory_format='{value}')
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory2/*/*/*.parquet'))
    ) == 5  # 5 unique values
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory2/2020/US/*.parquet'))
    ) == 1
    assert len(
        glob.glob(
            str(data_path /
                'parquet_dataset_partitioned_directory2/2020/NL/*.parquet'))
    ) == 1
    # Re-open with an explicit partitioning schema to recover the key columns.
    df = vaex.open(data_path / 'parquet_dataset_partitioned_directory2',
                   partitioning=partitioning)
    assert set(df.value.tolist()) == set(values)
    assert set(df.year.tolist()) == set(years)
    assert set(df.country.tolist()) == set(countries)