def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a Parquet file.

    If compatible, the new key name will be the name stored under the
    attribute `index_storage_key`. If this attribute is None, a new key will
    be generated in the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp has nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store:
        Key-value store the index file is written to.
    dataset_uuid:
        UUID of the dataset; used to build the storage key.
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()

    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
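
# Illustrative sketch (not library code): what the generated key looks like.
# The dataset name, column, and timestamp are hypothetical, and
# naming.EXTERNAL_INDEX_SUFFIX is assumed to equal the ".by-dataset-index.parquet"
# suffix named in the docstring above.
def _example_storage_key() -> str:
    from datetime import datetime
    from urllib.parse import quote

    # quote() percent-encodes the ":" characters of the ISO timestamp.
    return "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
        dataset_uuid="my_dataset",
        column=quote("country"),
        timestamp=quote(datetime(2020, 1, 1, 12, 0).isoformat()),
        suffix=".by-dataset-index.parquet",
    )
    # -> "my_dataset/indices/country/2020-01-01T12%3A00%3A00.by-dataset-index.parquet"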
def store(self, store: StoreInput, dataset_uuid: str) -> str:
    """
    Store the index as a Parquet file.

    If compatible, the new key name will be the name stored under the
    attribute `index_storage_key`. If this attribute is None, a new key will
    be generated in the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp has nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store:
        Input that can be resolved to a key-value store via `ensure_store`.
    dataset_uuid:
        UUID of the dataset; used to build the storage key.
    """
    storage_key = None

    store = ensure_store(store)

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    # The Arrow representation of index_dct requires a large amount of memory
    # because strings are duplicated and flattened into the buffer. To avoid a
    # high peak memory usage, split index_dct into chunks and convert only one
    # chunk at a time to Arrow.
    parts_iter = partition_all(10_000, self.index_dct.items())

    # Fetch the first table explicitly because its schema is required to
    # construct the ParquetWriter.
    try:
        table = _index_dct_to_table(dict(next(parts_iter)), self.column, self.dtype)
    except StopIteration:
        # index_dct was empty; convert it in one piece.
        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)

    buf = pa.BufferOutputStream()
    with pq.ParquetWriter(buf, schema=table.schema) as writer:
        writer.write_table(table)
        del table

        for part in parts_iter:
            writer.write_table(
                _index_dct_to_table(dict(part), self.column, self.dtype)
            )

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
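
# Hypothetical, self-contained sketch of the chunked-write pattern used in
# store() above: slices of a large index dict are converted to Arrow one at a
# time and streamed through a single ParquetWriter, so peak memory stays
# proportional to one chunk rather than the whole index. The data, chunk size,
# and the _to_table helper (a stand-in for _index_dct_to_table) are assumptions
# for illustration only.
def _example_chunked_index_write() -> bytes:
    import pyarrow as pa
    import pyarrow.parquet as pq
    from toolz import partition_all

    # Toy index: each value maps to a list of partition labels.
    index_dct = {i: ["partition_{}".format(i % 3)] for i in range(25_000)}

    def _to_table(part: dict) -> pa.Table:
        # Stand-in for _index_dct_to_table: one row per index value.
        return pa.table(
            {"key": list(part.keys()), "partitions": list(part.values())}
        )

    parts_iter = partition_all(10_000, index_dct.items())
    # Convert the first chunk eagerly; its schema seeds the ParquetWriter.
    first = _to_table(dict(next(parts_iter)))

    buf = pa.BufferOutputStream()
    with pq.ParquetWriter(buf, schema=first.schema) as writer:
        writer.write_table(first)
        for part in parts_iter:
            writer.write_table(_to_table(dict(part)))

    # The same bytes that store.put() would receive.
    return buf.getvalue().to_pybytes()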