def test_passes_two_datasets_different_expressions():
    x = np.array([2.])
    y = x**2
    dataset = vaex.dataset.DatasetArrays(x=x, y=y)
    df1 = vaex.from_dataset(dataset)
    df2 = vaex.from_dataset(dataset)
    df1['a'] = 'x * y'
    df2['b'] = 'x + y'
    executor = df1.executor
    executor.passes = 0
    s1 = df1.sum('a', delay=True)
    s2 = df2.sum('b', delay=True)
    df1.execute()
    assert executor.passes == 1
    assert s1.get() == 2 * 4
    assert s2.get() == 2 + 4
async def get_df(name):
    if name not in datasets:
        raise HTTPException(status_code=404, detail=f"dataset {name!r} not found")
    # for now we only allow 1 request to execute at a time
    async with global_lock:
        yield vaex.from_dataset(datasets[name])
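# A minimal sketch of how the get_df dependency above could be wired into a
# FastAPI route; the route path, app object, and response shape here are
# illustrative assumptions, not part of the original module.
from fastapi import Depends, FastAPI

app = FastAPI()

@app.get("/dataset/{name}/preview")  # hypothetical route
async def preview(df=Depends(get_df)):
    # FastAPI resolves `name` from the path and drives the get_df generator;
    # global_lock stays held while this handler runs
    head = df.head(5)
    return {column: head[column].tolist() for column in head.get_column_names()}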
def process(args):
    counts, arrays = args
    # take out the edges
    arrays = {key: vaex.utils.extract_central_part(value) for key, value in arrays.items()}
    counts = vaex.utils.extract_central_part(counts)

    # make sure we respect the sorting
    def sort(ar):
        for i, by in list(enumerate(self.by))[::-1]:
            sort_indices = by.sort_indices
            if sort_indices is not None:
                # if sort_indices comes from arrow, it will be uint64,
                # which np.take does not like
                sort_indices = vaex.array_types.to_numpy(sort_indices)
                if sort_indices.dtype == np.dtype("uint64"):
                    sort_indices = sort_indices.astype("int64")
                ar = np.take(ar, sort_indices, axis=i)
        return ar
    arrays = {key: sort(value) for key, value in arrays.items()}

    if self.combine and self.expand and isinstance(self.by[0], GrouperCombined):
        assert len(self.by) == 1
        values = self.by[0].bin_values
        columns = {field.name: ar for field, ar in zip(values.type, values.flatten())}
        for key, value in arrays.items():
            assert value.ndim == 1
            columns[key] = value
    else:
        counts = sort(counts)
        mask = counts > 0
        columns = {}
        for by, indices in zip(self.by, np.where(mask)):
            columns[by.label] = by.bin_values.take(indices)
        if mask.sum() == mask.size:
            # if we want all, just take it all; should be faster
            for key, value in arrays.items():
                columns[key] = value.ravel()
        else:
            for key, value in arrays.items():
                columns[key] = value[mask]
    dataset_arrays = vaex.dataset.DatasetArrays(columns)
    dataset = DatasetGroupby(dataset_arrays, self.df, self.by_original, actions,
                             combine=self.combine, expand=self.expand, sort=self.sort)
    df_grouped = vaex.from_dataset(dataset)
    return df_grouped
def add_graphql():
    import vaex.graphql
    import graphene
    from starlette.graphql import GraphQLApp
    dfs = {name: vaex.from_dataset(ds) for name, ds in datasets.items()}
    Query = vaex.graphql.create_query(dfs)
    schema = graphene.Schema(query=Query)
    app.add_route("/graphql", GraphQLApp(schema=schema))
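# A hedged sketch of calling the mounted endpoint above; the server address is
# an assumption, and the introspection query is used deliberately so we do not
# have to guess the exact schema layout that vaex.graphql.create_query generates.
import requests

query = "{ __schema { queryType { fields { name } } } }"
response = requests.post("http://localhost:8000/graphql", json={"query": query})
print(response.json())  # lists the root fields generated for the registered dataframes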
def cached_output(*args, **kwargs):
    ds = vaex.dataset.open(path_input, fs_options=fs_options_input, *args, **kwargs)
    if ds is not None:
        df = vaex.from_dataset(ds)
        df.export(path_output)
def test_passes_two_datasets_different_vars():
    x = np.array([2.])
    y = x**2
    dataset = vaex.dataset.DatasetArrays(x=x, y=y)
    df1 = vaex.from_dataset(dataset)
    df2 = vaex.from_dataset(dataset)
    df1.variables['a'] = 1
    df2.variables['a'] = 2
    df1['z'] = 'x + y * a'
    df2['z'] = 'x + y * a'
    executor = df1.executor
    executor.passes = 0
    s1 = df1.sum('z', delay=True)
    s2 = df2.sum('z', delay=True)
    df1.execute()
    assert executor.passes == 1
    assert s1.get() == 2 + 4 * 1
    assert s2.get() == 2 + 4 * 2
def agg(self, actions):
    # TODO: this basically forms a cartesian product, we can do better, use a
    # 'multistage' hashmap
    arrays = super(GroupBy, self)._agg(actions)
    # we don't want non-existing pairs (e.g. Amsterdam in France does not exist)
    has_non_existing_pairs = len(self.by) > 1
    # nobody wanted to know count*, but we need it if we included non-existing pairs
    counts = self.counts
    if has_non_existing_pairs and counts is None:
        # TODO: it seems this path is never tested
        count_agg = vaex.agg.count(edges=True)
        counts = self.df._agg(count_agg, self.binners, delay=_USE_DELAY)
    self.df.execute()
    if _USE_DELAY:
        arrays = {key: value.get() for key, value in arrays.items()}
        if has_non_existing_pairs:
            counts = counts.get()
    # take out the edges
    arrays = {key: vaex.utils.extract_central_part(value) for key, value in arrays.items()}
    if has_non_existing_pairs:
        counts = vaex.utils.extract_central_part(counts)
    # make sure we respect the sorting
    sorting = tuple(by.sort_indices if by.sort_indices is not None else slice(None) for by in self.by)
    arrays = {key: value[sorting] for key, value in arrays.items()}
    if self.combine and self.expand and isinstance(self.by[0], GrouperCombined):
        assert len(self.by) == 1
        values = self.by[0].bin_values
        columns = {field.name: ar for field, ar in zip(values.type, values.flatten())}
        for key, value in arrays.items():
            assert value.ndim == 1
            columns[key] = value
    else:
        if has_non_existing_pairs:
            counts = counts[sorting]
            mask = counts > 0
            coords = [coord[mask] for coord in np.meshgrid(*self._coords1d, indexing='ij')]
            columns = {by.label: coord for by, coord in zip(self.by, coords)}
            for key, value in arrays.items():
                columns[key] = value[mask]
        else:
            columns = {by.label: coord for by, coord in zip(self.by, self._coords1d)}
            for key, value in arrays.items():
                assert value.ndim == 1
                columns[key] = value
    dataset_arrays = vaex.dataset.DatasetArrays(columns)
    dataset = DatasetGroupby(dataset_arrays, self.df, self.by_original, actions,
                             combine=self.combine, expand=self.expand, sort=self.sort)
    df_grouped = vaex.from_dataset(dataset)
    return df_grouped
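# A small illustration (with made-up data) of the non-existing-pairs behaviour
# the agg method above guards against: grouping by two columns conceptually
# spans the full grid of bin combinations, and the counts are used to drop
# combinations that never occur.
import vaex

df = vaex.from_arrays(
    city=['Amsterdam', 'Amsterdam', 'Paris'],
    country=['NL', 'NL', 'FR'],
)
# the full grid also contains (Amsterdam, FR) and (Paris, NL), but those
# cells have count 0 and are filtered out of the result
grouped = df.groupby(['city', 'country'], agg=vaex.agg.count())
print(grouped)  # rows for (Amsterdam, NL) and (Paris, FR) only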
def update_service(dfs=None):
    global service_threaded
    import vaex.server.service
    if dfs is None:
        dfs = {name: vaex.from_dataset(dataset) for name, dataset in datasets.items()}
    service_bare = vaex.server.service.Service(dfs)
    server_thread_count = 1
    threads_per_job = 32
    service_threaded = vaex.server.service.AsyncThreadedService(service_bare, server_thread_count, threads_per_job)
def test_concat_chunk_iterator(l1, l2):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df_original = df = vaex.from_dataset(ds)
    df1 = df[i1:i2]
    df2 = df[i2:i3]
    df3 = df[i3:]
    df = vaex.concat([df1, df2, df3])
    ds_full = ds = df.dataset

    # very similar to the arrow/dataset_test.py parquet test
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # the dataset is sliced by one row, so compare against the shifted data
        assert chunks['x'].tolist() == x[i1 + 1:i2 + 1].tolist()
        assert chunks['y'].tolist() == y[i1 + 1:i2 + 1].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        # sliced twice by one row, so the offset is now two
        assert chunks['x'].tolist() == x[i1 + 2:i2 + 2].tolist()
        assert chunks['y'].tolist() == y[i1 + 2:i2 + 2].tolist()

    # no columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    # again, but here we skip a total of chunk_size rows at the end
    ds = ds_full[:8]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].tolist() == x[i1:i2].tolist()
        assert chunks['y'].tolist() == y[i1:i2].tolist()

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].tolist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped
def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``.
        The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:

        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`

        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

    * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
    * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

    * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
    * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
    * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote servers and clusters only support a single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith("http://") or path.startswith("ws://") or \
               path.startswith("vaex+wss://") or path.startswith("wss://") or \
               path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything: {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)
        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logger.exception("error opening %r" % path)
        raise
async def get_df(name):
    if name not in datasets:
        raise HTTPException(status_code=404, detail=f"dataset {name!r} not found")
    yield vaex.from_dataset(datasets[name])
def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access.

    The following url parameters control S3 options:

    * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
    * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
    * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:

    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access.

    The following url parameters control GCS options:

    * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
    * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
    * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything: {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path, fs_options=fs_options, convert=convert, **kwargs)
                if convert:
                    path_output = convert if isinstance(convert, str) else filename_hdf5
                    vaex.convert.convert(
                        path_input=path, fs_options_input=fs_options,
                        path_output=path_output, fs_options_output=fs_options,
                        *args, **kwargs
                    )
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
                if os.path.exists(filename_hdf5) and convert:  # also check mtime
                    df = vaex.open(filename_hdf5)
                else:
                    dfs = []
                    for filename in filenames:
                        dfs.append(vaex.open(filename, convert=bool(convert), shuffle=shuffle, **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)
        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
def process(args):
    counts, arrays = args
    logger.info("aggregated on grid, constructing dataframe...")
    if counts is not None:
        for name, array in arrays.items():
            if array.shape != counts.shape:
                raise RuntimeError(f'{array} {name} has shape {array.shape} while we expected {counts.shape}')
    arrays = {key: self._extract_center(value) for key, value in arrays.items()}
    if not self.dense:
        counts = self._extract_center(counts)

    # make sure we respect the sorting
    def sort(ar):
        for i, by in list(enumerate(self.by))[::-1]:
            sort_indices = by.sort_indices
            if sort_indices is not None:
                # if sort_indices comes from arrow, it will be uint64,
                # which np.take does not like
                sort_indices = vaex.array_types.to_numpy(sort_indices)
                if sort_indices.dtype == np.dtype("uint64"):
                    sort_indices = sort_indices.astype("int64")
                ar = np.take(ar, sort_indices, axis=i)
        return ar
    arrays = {key: sort(value) for key, value in arrays.items()}

    if self.combine and self.expand and isinstance(self.by[0], GrouperCombined):
        assert len(self.by) == 1
        values = self.by[0].bin_values
        columns = {field.name: ar for field, ar in zip(values.type, values.flatten())}
        for key, value in arrays.items():
            assert value.ndim == 1
            columns[key] = value
    else:
        columns = {}
        if self.dense:
            if len(self.by) == 1:
                for by in self.by:
                    columns[by.label] = by.bin_values
            else:
                array0 = arrays[list(arrays)[0]]
                # similar to the np.where case below, this creates indices like
                # [0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]
                indices = [k.ravel() for k in np.mgrid[[slice(0, n) for n in array0.shape]]]
                for by, index in zip(self.by, indices):
                    columns[by.label] = vaex.array_types.take(by.bin_values, index)
        else:
            counts = sort(counts)
            mask = counts > 0
            for by, indices in zip(self.by, np.where(mask)):
                columns[by.label] = by.bin_values.take(indices)
        if self.dense or mask.sum() == mask.size:
            # if we want all, just take it all; should be faster
            for key, value in arrays.items():
                columns[key] = value.ravel()
        else:
            for key, value in arrays.items():
                columns[key] = value[mask]
    logger.info("constructed dataframe")
    dataset_arrays = vaex.dataset.DatasetArrays(columns)
    dataset = DatasetGroupby(dataset_arrays, self.df, self.by_original, actions,
                             combine=self.combine, expand=self.expand, sort=self.sort)
    df_grouped = vaex.from_dataset(dataset)
    return df_grouped
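# A standalone sketch of the np.mgrid trick used in the dense branch above to
# expand a grid into flattened per-axis indices; the shape is illustrative.
import numpy as np

shape = (3, 2)
indices = [k.ravel() for k in np.mgrid[[slice(0, n) for n in shape]]]
print(indices[0].tolist())  # [0, 0, 1, 1, 2, 2]  (axis-0 index of every cell)
print(indices[1].tolist())  # [0, 1, 0, 1, 0, 1]  (axis-1 index of every cell)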
def test_parquet(l1, l2, rebuild_dataset):
    i1 = 0
    i2 = i1 + l1
    i3 = i2 + l2
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df = vaex.from_dataset(ds)
    path1 = HERE.parent / 'data' / 'parquet' / 'test1.parquet'
    path2 = HERE.parent / 'data' / 'parquet' / 'test2.parquet'
    path3 = HERE.parent / 'data' / 'parquet' / 'test3.parquet'
    path1.parent.mkdir(exist_ok=True)
    pyarrow.parquet.write_table(df[i1:i2].to_arrow_table(), str(path1), row_group_size=2)
    pyarrow.parquet.write_table(df[i2:i3].to_arrow_table(), str(path2), row_group_size=2)
    pyarrow.parquet.write_table(df[i3:].to_arrow_table(), str(path3), row_group_size=2)

    ds = vaex.arrow.dataset.open_parquet([str(path1), str(path2), str(path3)])
    # TODO: future PR will require this:
    df = vaex.from_dataset(ds)
    ds_full = ds = df.dataset

    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # the dataset is sliced by one row, so compare against the shifted data
        assert chunks['x'].to_pylist() == x[i1 + 1:i2 + 1].tolist()
        assert chunks['y'].to_pylist() == y[i1 + 1:i2 + 1].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        # sliced twice by one row, so the offset is now two
        assert chunks['x'].to_pylist() == x[i1 + 2:i2 + 2].tolist()
        assert chunks['y'].to_pylist() == y[i1 + 2:i2 + 2].tolist()

    # empty columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].to_pylist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped

    assert rebuild_dataset(ds).hashed() == ds.hashed()
def test_parquet():
    x = np.arange(10)
    y = x**2
    g = x // 3
    ds = vaex.dataset.DatasetArrays(x=x, y=y, g=g)
    df = vaex.from_dataset(ds)
    path = HERE.parent / 'data' / 'parquet' / 'test.parquet'
    path.parent.mkdir(exist_ok=True)
    pyarrow.parquet.write_table(df.to_arrow_table(), str(path), row_group_size=2)

    df = vaex.open(str(path))
    ds_full = ds = df.dataset

    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        assert chunks['x'].to_pylist() == x[i1:i2].tolist()
        assert chunks['y'].to_pylist() == y[i1:i2].tolist()

    ds = ds[1:10]
    assert 'x' in ds
    assert ds.row_count == 9
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(5):
        i1, i2, chunks = next(iter)
        if i == 4:
            assert i1 == 8
            assert i2 == 9
        else:
            assert i1 == i * 2
            assert i2 == (i + 1) * 2
        # the dataset is sliced by one row, so compare against the shifted data
        assert chunks['x'].to_pylist() == x[i1 + 1:i2 + 1].tolist()
        assert chunks['y'].to_pylist() == y[i1 + 1:i2 + 1].tolist()

    ds = ds[1:9]
    assert ds.row_count == 8
    iter = ds.chunk_iterator(['x', 'y'], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2
        # sliced twice by one row, so the offset is now two
        assert chunks['x'].to_pylist() == x[i1 + 2:i2 + 2].tolist()
        assert chunks['y'].to_pylist() == y[i1 + 2:i2 + 2].tolist()

    # empty columns
    iter = ds.chunk_iterator([], chunk_size=2)
    for i in range(4):
        i1, i2, chunks = next(iter)
        assert i1 == i * 2
        assert i2 == (i + 1) * 2

    for i in range(9):
        for j in range(i + 1, 10):
            ds = ds_full.slice(i, j)
            values = []
            for i1, i2, chunks in ds.chunk_iterator(['x']):
                values.extend(chunks['x'].to_pylist())
            assert x[i:j].tolist() == values

    assert df.x.tolist() == x.tolist()
    assert df.g.tolist() == g.tolist()

    ds_dropped = ds.dropped('x')
    assert 'x' not in ds_dropped