def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the Versicolor rows and swap sepal length for a normalized copy.

    Rows whose ``variety`` column equals ``"Versicolor"`` are kept. A
    ``normalized.sepal.length`` column is appended, holding ``sepal.length``
    divided by the maximum ``sepal.length`` of the kept rows, and the
    original ``sepal.length`` column is then dropped.

    Args:
        batch: Table with ``variety`` and ``sepal.length`` columns.

    Returns:
        The filtered table with ``normalized.sepal.length`` appended and
        ``sepal.length`` removed.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    versicolor = batch.filter(is_versicolor)

    # The maximum is computed after filtering, i.e. over Versicolor rows only.
    sepal_length = versicolor["sepal.length"]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = versicolor.append_column(
        "normalized.sepal.length", normalized
    )
    return with_normalized.drop(["sepal.length"])
def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
    """Drops the feature columns whose values are not supported.

    Two kinds of columns are removed: columns of features the schema reports
    as multivalent (these are not supported by sk-learn), and columns whose
    arrow type maps to the STRUCT feature type.

    Args:
        examples_table: Arrow table containing a batch of examples.
        schema: The schema for the data.

    Returns:
        Arrow table without the unsupported columns.
    """
    # A multivalent feature is dropped through the first step of its path.
    columns_to_drop = {
        feature_path.steps()[0]
        for feature_path in schema_util.get_multivalent_features(schema)
    }

    struct_type = statistics_pb2.FeatureNameStatistics.STRUCT
    named_columns = zip(examples_table.schema.names,
                        examples_table.itercolumns())
    for name, column in named_columns:
        feature_type = stats_util.get_feature_type_from_arrow_type(
            types.FeaturePath([name]), column.type)
        if feature_type == struct_type:
            columns_to_drop.add(name)

    return examples_table.drop(columns_to_drop)
def _query_table(pa_table: pa.Table, key: Union[int, slice, range, str, Iterable]) -> pa.Table:
    """
    Extract from a pyarrow Table the subtable selected by ``key``.

    - ``int``: a one-row table; the index is taken modulo the number of rows.
    - ``slice``: converted to a ``range`` over the table's rows first.
    - ``range``: a single slice when it is contiguous and starts at a
      non-negative index, otherwise handled like any other iterable.
    - ``str``: the table reduced to the column with that name.
    - other iterables: the concatenation of one-row slices, one per index.

    Any other key type is handed to ``_raise_bad_key_type``.
    """
    if isinstance(key, int):
        return pa_table.slice(key % pa_table.num_rows, 1)

    if isinstance(key, slice):
        key = range(*key.indices(pa_table.num_rows))

    if isinstance(key, range) and _is_range_contiguous(key) and key.start >= 0:
        return pa_table.slice(key.start, key.stop - key.start)
    # Any other range falls through and is treated as an iterable below.

    if isinstance(key, str):
        other_columns = (name for name in pa_table.column_names if name != key)
        return pa_table.drop(other_columns)

    if isinstance(key, Iterable):
        if len(key) == 0:
            return pa_table.slice(0, 0)
        # don't use pyarrow.Table.take even for pyarrow >=1.0
        # (see https://issues.apache.org/jira/browse/ARROW-9773)
        single_row_tables = (
            pa_table.slice(int(i) % pa_table.num_rows, 1) for i in key
        )
        return pa.concat_tables(single_row_tables)

    _raise_bad_key_type(key)
def group_by_pk_hash_bucket(table: pa.Table, num_buckets: int, primary_keys: List[str]) -> np.ndarray:
    """Split a table into per-hash-bucket tables keyed on its primary keys.

    A primary key digest column is appended to the table, the primary key
    columns are dropped, and every record is assigned to a bucket derived
    from its digest.

    Args:
        table: Table containing all columns named in ``primary_keys``.
        num_buckets: Number of hash buckets, i.e. length of the result.
        primary_keys: Names of the primary key columns.

    Returns:
        An object array of length ``num_buckets``. Each bucket that received
        at least one record holds the table of those records as returned by
        ``sc.append_record_idx_col``; the other entries are left unset.
    """
    # generate the primary key digest column
    # casting a primary key column to numpy also ensures no nulls exist
    pk_columns = [table[pk_name].to_numpy() for pk_name in primary_keys]
    table = sc.append_pk_hash_column(table, hash_pk_bytes_generator(pk_columns))

    # drop primary key columns to free up memory
    table = table.drop(primary_keys)

    # group hash bucket record indices
    bucket_to_indices = np.empty([num_buckets], dtype="object")
    for row_idx, digest in enumerate(sc.pk_hash_column_np(table)):
        bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
        if bucket_to_indices[bucket] is None:
            bucket_to_indices[bucket] = []
        bucket_to_indices[bucket].append(row_idx)

    # generate the ordered record number column
    bucket_to_table = np.empty([num_buckets], dtype="object")
    for bucket, indices in enumerate(bucket_to_indices):
        if indices:
            bucket_to_table[bucket] = sc.append_record_idx_col(
                table.take(indices),
                indices,
            )
    return bucket_to_table
def add_page_pings_enabled_col(table: pa.Table) -> pa.Table:
    """Append a ``page_pings_enabled`` column derived from ``contexts``.

    Page views with page pings enabled have the 'heartbeat' context added.
    The context also tells us how many seconds there are between each page
    ping. For now, we just hard code that value to 30s but it can be
    extracted from the heartbeat context if needed.

    Args:
        table: Table with a string ``contexts`` column.

    Returns:
        The table with a ``page_pings_enabled`` column appended, set where
        ``contexts`` contains the heartbeat schema URI.
    """
    # regex=False: the schema URI is a literal substring. As a regular
    # expression its "." would match any character.
    page_pings_enabled = table.column('contexts').to_pandas().str.contains(
        'iglu:dk.jyllands-posten/heartbeat/jsonschema/', regex=False)

    # noinspection PyCallByClass,PyTypeChecker
    return table.append_column(
        pa.Column.from_array('page_pings_enabled', page_pings_enabled))
def format_table(
    pa_table: pa.Table,
    key: Union[int, slice, range, str, Iterable],
    formatter: Formatter,
    format_columns: Optional[list] = None,
    output_all_columns=False,
):
    """
    Format a pyarrow Table depending on the key that was used and a Formatter object.

    Args:
        pa_table (``pyarrow.Table``): The input pyarrow Table to format
        key (``Union[int, slice, range, str, Iterable]``): Depending on the key that was used, the formatter formats
            the table as either a row, a column or a batch.
        formatter (``datasets.formatting.formatting.Formatter``): Any subclass of a Formatter such as
            PythonFormatter, NumpyFormatter, etc.
        format_columns (Optional ``List[str]``): if not None, it defines the columns that will be formatted using the
            given formatter. Other columns are discarded (unless ``output_all_columns`` is True)
        output_all_columns (``bool``, defaults to False). If True, the formatted output is completed using the columns
            that are not in the ``format_columns`` list. For these columns, the PythonFormatter is used.

    Returns:
        A row, column or batch formatted object defined by the Formatter:
        - the PythonFormatter returns a dictionary for a row or a batch, and a list for a column.
        - the NumpyFormatter returns a dictionary for a row or a batch, and a np.array for a column.
        - the PandasFormatter returns a pd.DataFrame for a row or a batch, and a pd.Series for a column.
        - the TorchFormatter returns a dictionary for a row or a batch, and a torch.Tensor for a column.
        - the TFFormatter returns a dictionary for a row or a batch, and a tf.Tensor for a column.

    Raises:
        TypeError: if ``output_all_columns`` is True and the formatter output for a row or a batch is not a Mapping.
    """
    query_type = key_to_query_type(key)
    python_formatter = PythonFormatter()

    # No column restriction: the whole table goes through the formatter.
    if format_columns is None:
        return formatter(pa_table, query_type=query_type)

    # Single column: it is formatted only if it is one of the format columns.
    if query_type == "column":
        if key in format_columns:
            return formatter(pa_table, query_type)
        return python_formatter(pa_table, query_type=query_type)

    # Row or batch: format the selected columns only.
    columns_to_discard = (col for col in pa_table.column_names if col not in format_columns)
    formatted_output = formatter(pa_table.drop(columns_to_discard), query_type=query_type)
    if not output_all_columns:
        return formatted_output

    if not isinstance(formatted_output, Mapping):
        raise TypeError(
            f"Custom formatting function must return a dict to work with output_all_columns=True, but got {formatted_output}"
        )

    # Complete the output with the remaining columns, formatted as python objects.
    remaining_table = pa_table.drop(col for col in pa_table.column_names if col in format_columns)
    formatted_output.update(python_formatter(remaining_table, query_type=query_type))
    return formatted_output
def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    """Split a table into one table per distinct value of its REAL column.

    Args:
        table: Table containing a ``REAL`` column.

    Returns:
        Mapping from each distinct ``REAL`` value to the rows having that
        value, with the ``REAL`` column removed.
    """
    distinct_reals = table.column("REAL").unique().to_pylist()
    # pylint: disable=no-member
    return {
        real: table.filter(
            pc.is_in(table["REAL"], value_set=pa.array([real]))
        ).drop(["REAL"])
        for real in distinct_reals
    }
def render_arrow_v1(table: pa.Table, params, **kwargs):
    """Replace the selected columns by their formatted version.

    Args:
        table: Input table.
        params: Mapping whose ``"colnames"`` entry lists the columns to
            format. Names that are not columns of ``table`` have no effect.
        **kwargs: Accepted but not used.

    Returns:
        An ``ArrowRenderResult`` wrapping the table in which every selected
        column was replaced, in place and under the same name, by the output
        of ``format_chunked_array``.
    """
    selected = frozenset(params["colnames"])
    for position, name in enumerate(table.column_names):
        if name in selected:
            formatted = format_chunked_array(
                table.column(position), table.schema.field(position)
            )
            table = table.set_column(position, name, formatted)
    return ArrowRenderResult(table)
def _convert_arrow_to_proto(
    table: pyarrow.Table, feature_view: FeatureView
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]:
    """Convert every row of an arrow table into a writable proto tuple.

    Args:
        table: Table holding the entity columns, the feature columns and the
            timestamp column(s) referenced by ``feature_view``.
        feature_view: Supplies the entity names, the features, and the names
            of the event and (optional) created timestamp columns.

    Returns:
        One ``(entity_key, feature_dict, event_timestamp, created_timestamp)``
        tuple per row. ``created_timestamp`` is None when the feature view
        has no created timestamp column.
    """
    column_names = table.column_names

    def cell(row, column_name):
        # Rows are plain tuples in column order, so values are found by position.
        return row[column_names.index(column_name)]

    rows_to_write = []
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for entity_name in feature_view.entities:
            entity_key.entity_names.append(entity_name)
            entity_key.entity_values.append(
                python_value_to_proto_value(cell(row, entity_name)))

        feature_dict = {}
        for feature in feature_view.features:
            feature_dict[feature.name] = python_value_to_proto_value(
                cell(row, feature.name))

        event_timestamp = cell(row, feature_view.input.event_timestamp_column)

        created_timestamp = None
        if feature_view.input.created_timestamp_column is not None:
            created_timestamp = cell(
                row, feature_view.input.created_timestamp_column)

        rows_to_write.append(
            (entity_key, feature_dict, event_timestamp, created_timestamp))
    return rows_to_write
def find_nonnull_table_mask(table: pa.Table) -> pa.Array:
    """Build a boolean mask of the rows that have no null in any column.

    Args:
        table: Table to inspect.

    Returns:
        A boolean array with one entry per row, true where every column
        holds a valid (non-null) value in that row.
    """
    mask = pa.array(np.ones(table.num_rows, dtype=bool), pa.bool_())
    for column in table.itercolumns():
        chunks = column.chunks
        if not chunks:
            # A column without chunks holds no rows, so it constrains nothing.
            continue
        # Validity has to cover every chunk: looking at the first chunk only
        # yields an array shorter than the mask for multi-chunk columns.
        validity = pa.concat_arrays([chunk.is_valid() for chunk in chunks])
        mask = pa.compute.and_(mask, validity)
    return mask
def _send_data(self, to: ActorVirtualIdentity, data_payload: DataPayload) -> None:
    """
    Send data payload to the given target actor. This method is to be used internally only.

    An OutputDataFrame is sent as an arrow table with a header marked
    ``is_end=False``; an EndOfUpstream is sent as a header marked
    ``is_end=True`` without a table.

    :param to: The target actor's ActorVirtualIdentity
    :param data_payload: The data payload to be sent, can be either DataFrame or EndOfUpstream
    :raises TypeError: if the payload is of neither type
    """
    if isinstance(data_payload, OutputDataFrame):
        # converting from a column-based dictionary is the fastest known method
        # https://stackoverflow.com/questions/57939092/fastest-way-to-construct-pyarrow-table-row-by-row
        schema = data_payload.schema
        columns = {
            name: [row[name] for row in data_payload.frame]
            for name in schema.names
        }
        body = Table.from_pydict(columns, schema=schema)
        header = PythonDataHeader(tag=to, is_end=False)
    elif isinstance(data_payload, EndOfUpstream):
        body = None
        header = PythonDataHeader(tag=to, is_end=True)
    else:
        raise TypeError(f"Unexpected payload {data_payload}")

    self._proxy_client.send_data(bytes(header), body)
def arrow_to_pydf(data: pa.Table,
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.

    Parameters
    ----------
    data
        Source Arrow table.
    columns
        Optional replacement column names, applied before conversion.
    rechunk
        Whether to rechunk the resulting PyDataFrame.

    Raises
    ------
    ValueError
        If Arrow rejects ``columns`` as new names for ``data``.
    """
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    coerced = {}
    for position, column in enumerate(data):
        # extract the name before casting
        name = f"column_{position}" if column._name is None else column._name
        coerced[name] = coerce_arrow(column)

    pydf = PyDataFrame.from_arrow_record_batches(pa.table(coerced).to_batches())
    return pydf.rechunk() if rechunk else pydf
def test_by_time_period_from_7670_to_8400():
    """Filtering the test dataset on the period 7670-8400 yields the expected rows."""
    unit_ids = [
        1000000002, 1000000004, 1000000003, 1000000001, 1000000001,
        1000000003, 1000000003, 1000000001, 1000000002
    ]
    values = ["8", "2", "12", "3", "16", "2", "12", "3", "8"]
    start_epoch_days = [
        1461, 3287, 4018, 5479, 7851, 7701, 7957, 8126, 8066
    ]
    # The last three expected rows have no stop date.
    stop_epoch_days = [
        8065, 7710, 7700, 7850, 8125, 7956, np.nan, np.nan, np.nan
    ]
    expected = Table.from_pydict({
        'unit_id': unit_ids,
        'value': values,
        'start_epoch_days': start_epoch_days,
        'stop_epoch_days': stop_epoch_days
    })
    print_expected(expected)

    actual = filter_by_time_period(
        TEST_BOSTED_PARQUET_DIR, 7670, 8400, None, True
    )
    print_actual(actual)

    expected_df = expected.to_pandas()
    actual_df = actual.to_pandas()
    assert_frame_equal(expected_df, actual_df, check_dtype=False)
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert an Arrow table to a pandas DataFrame, restoring pandas metadata.

    The table is converted with its embedded pandas metadata ignored, the
    partitions are applied, and the frame is made mutable. When the schema
    carries a ``pandas`` metadata entry, the index and timezones are then
    applied from it.

    Args:
        table: Arrow table to convert.
        categories: Forwarded to ``to_pandas`` as ``categories``.
        safe: Forwarded to ``to_pandas`` as ``safe``.
        use_threads: Forwarded to ``to_pandas`` as ``use_threads``.
        dataset: Forwarded to ``_apply_partitions``.
        path: Forwarded to ``_apply_partitions``.
        path_root: Forwarded to ``_apply_partitions``.

    Returns:
        The resulting DataFrame.
    """
    # Read the metadata first: the conversion below runs with self_destruct=True.
    schema_metadata = table.schema.metadata
    has_pandas_metadata = schema_metadata is not None and b"pandas" in schema_metadata
    metadata: Dict[str, Any] = (
        json.loads(table.schema.metadata[b"pandas"]) if has_pandas_metadata else {}
    )

    converted = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        strings_to_categorical=False,
        safe=safe,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    df: pd.DataFrame = _apply_partitions(
        df=converted, dataset=dataset, path=path, path_root=path_root
    )
    df = _utils.ensure_df_is_mutable(df=df)

    if metadata:
        _logger.debug("metadata: %s", metadata)
        df = _apply_index(df=df, metadata=metadata)
        df = _apply_timezone(df=df, metadata=metadata)
    return df
def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
    """Write a pyarrow Table to file, batch by batch.

    If no writer exists yet, one is built with the schema inferred from
    ``pa_table``. The table is cast to the writer schema, split into record
    batches, and the byte and example counters are updated before the
    batches are written.

    Args:
        pa_table: the Table to write.
        writer_batch_size: maximum number of rows per written batch;
            defaults to ``self.writer_batch_size`` when None.
    """
    max_chunksize = self.writer_batch_size if writer_batch_size is None else writer_batch_size
    if self.pa_writer is None:
        self._build_writer(inferred_schema=pa_table.schema)

    casted_table = pa_table.cast(self._schema)
    batches: List[pa.RecordBatch] = casted_table.to_batches(max_chunksize=max_chunksize)

    self._num_bytes += sum(batch.nbytes for batch in batches)
    self._num_examples += casted_table.num_rows
    for batch in batches:
        self.pa_writer.write_batch(batch)
def _geopandas_to_arrow(df, index=None):
    """
    Helper function with main, shared logic for to_parquet/to_feather.

    Warns that the file format support is an initial implementation,
    validates the data frame, WKB-encodes it, converts it to an Arrow table
    and stores the encoded geo metadata under the ``b"geo"`` schema key.

    Parameters
    ----------
    df : the data frame to convert
    index : passed to ``Table.from_pandas`` as ``preserve_index``

    Returns
    -------
    pyarrow.Table carrying the additional ``b"geo"`` schema metadata
    """
    from pyarrow import Table

    message = (
        "this is an initial implementation of Parquet/Feather file support and "
        "associated metadata.  This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises.  "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet/Feather files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')"
    )
    warnings.warn(message, UserWarning, stacklevel=4)

    _validate_dataframe(df)

    # create geo metadata before altering incoming data frame
    geo_metadata = _create_metadata(df)

    table = Table.from_pandas(_encode_wkb(df), preserve_index=index)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    schema_metadata = table.schema.metadata
    schema_metadata[b"geo"] = _encode_metadata(geo_metadata)
    return table.replace_schema_metadata(schema_metadata)
def cast_arrow_table_to_schema(
    tab: pa.Table,
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
):
    """Casts an arrow table to a new or partial schema

    Args:
        tab (pa.Table): An arrow table
        schema (Union[pa.Schema, None], optional): The schema to cast the
            table to. When None there is nothing to cast to and the table is
            returned unchanged. Defaults to None.
        expect_full_schema (bool, optional): if True, the schema is used as
            is, so it must have a field for every column of the table. If
            False, the table's own schema is first updated with the given
            schema via ``update_existing_schema``, so columns that are not
            listed in the schema keep their current type.

    Returns:
        pa.Table: The table cast to the resulting schema.
    """
    # The default schema=None would otherwise end up in tab.cast(None).
    if schema is None:
        return tab

    if expect_full_schema:
        update_schema = schema
    else:
        update_schema = update_existing_schema(tab.schema, schema)

    return tab.cast(update_schema)
def _arrow_table_to_pandas(
    cls, arrow_table: pa.Table, categories, **kwargs
) -> pd.DataFrame:
    """Convert an Arrow table to a pandas DataFrame.

    Args:
        arrow_table: Table to convert.
        categories: Forwarded to ``to_pandas`` as ``categories``.
        **kwargs: Only the ``arrow_to_pandas`` entry is read: a dict of extra
            keyword arguments for ``to_pandas``.

    Returns:
        The converted DataFrame. ``use_threads=False`` and
        ``ignore_metadata=False`` always override the caller's options.
    """
    # Work on a copy: updating the dict in place would leak the forced
    # options into the dict owned by the caller.
    to_pandas_kwargs = dict(kwargs.get("arrow_to_pandas") or {})
    to_pandas_kwargs.update({"use_threads": False, "ignore_metadata": False})
    return arrow_table.to_pandas(categories=categories, **to_pandas_kwargs)
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert an Arrow table to a mutable pandas DataFrame with partitions applied.

    Args:
        table: Arrow table to convert.
        categories: Forwarded to ``to_pandas`` as ``categories``.
        safe: Forwarded to ``to_pandas`` as ``safe``.
        use_threads: Forwarded to ``to_pandas`` as ``use_threads``.
        dataset: Forwarded to ``_apply_partitions``.
        path: Forwarded to ``_apply_partitions``.
        path_root: Forwarded to ``_apply_partitions``.

    Returns:
        The DataFrame returned by ``_utils.ensure_df_is_mutable``.
    """
    # The pandas metadata embedded in the table is ignored during conversion.
    converted = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        categories=categories,
        safe=safe,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    partitioned: pd.DataFrame = _apply_partitions(
        df=converted, dataset=dataset, path=path, path_root=path_root
    )
    return _utils.ensure_df_is_mutable(df=partitioned)
def register_file_by_path(self, path):
    """Load a delimited text file and register it under its base file name.

    The file is parsed with each of the separators ``,``, ``|`` and tab, in
    that order. The first attempt that yields both a data frame and a
    columnar table is registered under the file name without its extension.
    If every attempt fails, a message and the last exception are printed.
    """
    root, fname = os.path.split(path)
    tablename = os.path.splitext(fname)[0]
    fpath = os.path.join(root, fname)

    exception = None
    for sep in [',', '|', '\t']:
        df = None  # TODO delete
        columnar_tb = None
        try:
            with openfile(fpath) as f:
                df = pandas.read_csv(f, sep=sep)  # TODO delete
                columnar_tb = pa_tb.from_pandas(df)
        except Exception as e:
            # Remember the failure and try the next separator.
            exception = e

        if df is None or columnar_tb is None:
            continue

        self.register_dataframe(tablename, df)
        self.register_columnar_tb(tablename, columnar_tb)
        return

    print("Failed to read data file %s" % (fpath))
    print(exception)
def generate_partial_statistics_in_memory(
    table: pa.Table, options: stats_options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generates partial statistics for an in-memory Arrow table.

  When the options define a feature whitelist, the table is first reduced to
  the whitelisted columns, in whitelist order.

  Args:
    table: Arrow table.
    options: Options for generating data statistics.
    stats_generators: A list of combiner statistics generators.

  Returns:
    A list of accumulators containing partial statistics, one per generator.
  """
  whitelist = options.feature_whitelist
  if whitelist:
    kept_columns = [table.column(name).data for name in whitelist]
    table = pa.Table.from_arrays(kept_columns, list(whitelist))

  # Each generator starts from a fresh accumulator.
  return [
      generator.add_input(generator.create_accumulator(), table)
      for generator in stats_generators
  ]
def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Reading a file whose geo metadata is invalid raises a ValueError.

    The parquet file is written directly with pyarrow so that the test fully
    controls the metadata that ends up in the file.
    """
    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"
    # convert to DataFrame and encode geometry to WKB
    df = DataFrame(read_file(get_path(test_dataset)))
    df["geometry"] = to_wkb(df["geometry"].values)

    table = Table.from_pandas(df)
    schema_metadata = table.schema.metadata
    schema_metadata.update(geo_meta)
    table = table.replace_schema_metadata(schema_metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)
def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
  """Gets a column from the input table, validating that it can be broadcast.

  Only the first chunk of the column is read and validated.

  Args:
    input_table: Input table.
    column_name: Name of the column to be retrieved and validated. This column
      must refer to a ListArray in which each list has length 1.

  Returns:
    An arrow array containing a flattened view of the broadcast column.

  Raises:
    ValueError: If the column is not present in the input table, or if it
      does not hold exactly one value per example. The value type is not
      checked here.
  """
  try:
    list_array = input_table.column(column_name).data.chunk(0)
  except KeyError:
    raise ValueError(
        'Column "{}" not present in the input table.'.format(column_name))

  # Before flattening, check that there is a single value for each example.
  lengths = array_util.ListLengthsFromListArray(list_array).to_numpy()
  if np.any(lengths != 1):
    raise ValueError(
        'Column "{}" must have exactly one value in each example.'.format(
            column_name))
  return list_array.flatten()
def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:
    """Convert an Arrow table to a pandas DataFrame.

    The pandas metadata of the table is ignored, dates and timestamps are
    not converted to Python objects, and Arrow types are mapped through the
    class-level Arrow-to-pandas type mapping.
    """
    conversion_options = dict(
        ignore_metadata=True,  # noqa
        date_as_object=False,  # noqa
        timestamp_as_object=False,  # noqa
        types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
    )
    return table.to_pandas(**conversion_options)
def resample_single_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table that contains only a single realization.

    The table must contain a DATE column and it must be sorted on DATE.

    The DATE column is replaced by normalized sample dates spanning the raw
    date range, REAL is filled with the first REAL value, and every other
    column is resampled onto the sample dates: backfilled when its field
    metadata marks it as a rate, linearly interpolated otherwise.
    """
    # Notes:
    # Getting meta data using json.loads() takes quite a bit of time!!
    # We should provide this info in another way.
    schema = table.schema

    raw_dates = table.column("DATE").to_numpy()
    raw_dates_as_uint = raw_dates.astype(np.uint64)

    sample_dates = generate_normalized_sample_dates(
        np.min(raw_dates), np.max(raw_dates), freq=freq
    )
    sample_dates_as_uint = sample_dates.astype(np.uint64)

    def resampled_column(colname):
        if colname == "DATE":
            return sample_dates
        if colname == "REAL":
            return np.full(len(sample_dates), table.column("REAL")[0].as_py())

        raw_values = table.column(colname).to_numpy()
        if is_rate_from_field_meta(table.field(colname)):
            return interpolate_backfill(
                sample_dates_as_uint, raw_dates_as_uint, raw_values, 0, 0
            )
        return np.interp(sample_dates_as_uint, raw_dates_as_uint, raw_values)

    column_arrays = [resampled_column(colname) for colname in schema.names]
    return pa.table(column_arrays, schema=schema)
def write_table(table: pyarrow.Table, compression_level=0) -> bytes:
    """Encode an Arrow table as a BSON document with one entry per column.

    The chunks of the table are combined first. Each column is then
    serialized with ``write_array`` and stored, under the column name and in
    column order, as a raw BSON sub-document.

    Args:
        table: Table to encode.
        compression_level: Passed through to ``write_array``.

    Returns:
        The encoded BSON document.
    """
    combined = table.combine_chunks()
    doc = collections.OrderedDict(
        (
            col.name,
            bson.raw_bson.RawBSONDocument(
                write_array(col.data.chunks[0], compression_level)
            ),
        )
        for col in combined.columns
    )
    return bson.encode(doc)
def _write_table_to_file(table: pa.Table, filename: str) -> int:
    """Write a table to ``filename`` as an Arrow record batch stream.

    Args:
        table: Table to write.
        filename: Path of the file to create or overwrite.

    Returns:
        The total ``nbytes`` of the record batches that were written.
    """
    written_bytes = 0
    with open(filename, "wb") as sink:
        writer = pa.RecordBatchStreamWriter(sink=sink, schema=table.schema)
        record_batches: List[pa.RecordBatch] = table.to_batches()
        for record_batch in record_batches:
            writer.write_batch(record_batch)
            written_bytes += record_batch.nbytes
        writer.close()
    return written_bytes
def add_input(self, accumulator: List[float],
              examples_table: pa.Table) -> List[float]:
  """Folds a table of examples into the accumulator.

  Args:
    accumulator: Two-slot accumulator: slot 0 counts the examples, slot 1
      sums the weights.
    examples_table: Arrow table containing a batch of examples.

  Returns:
    The same accumulator, updated in place. Slot 1 is only touched when a
    weight feature is configured.
  """
  accumulator[0] += examples_table.num_rows
  if not self._weight_feature:
    return accumulator

  weight_chunks = examples_table.column(self._weight_feature).data.iterchunks()
  for chunk in weight_chunks:
    flat_weights = np.asarray(chunk.flatten())
    accumulator[1] += np.sum(flat_weights)
  return accumulator
def row_iter(table: pyarrow.Table):
    """Iterate over the table row by row.

    Yields one ``Row`` namedtuple per row. Its fields are the column names
    of the table and its values are the Python values of that row.
    """
    # pylint: disable=invalid-name
    Row = collections.namedtuple("Row", table.column_names)
    for position in range(table.num_rows):
        single_row = table.slice(position, 1)
        values = [column[0].as_py() for column in single_row.itercolumns()]
        yield Row(*values)
def reencode_dictionaries(table: pa.Table) -> pa.Table:
    """Re-encode every dictionary-typed column of the table.

    Each dictionary column is replaced, in place and under the same name, by
    the result of ``reencode_dictionary_array`` applied to its first chunk.
    Columns of any other type are left untouched.
    """
    # Replacing a column leaves the names and the other columns unchanged,
    # so iterating over the original table's columns is safe.
    for position, (name, column) in enumerate(zip(table.column_names, table.columns)):
        if not pa.types.is_dictionary(column.type):
            continue
        reencoded = reencode_dictionary_array(column.chunks[0])
        table = table.set_column(position, name, reencoded)
    return table
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Without partition columns, a single <uuid>.parquet file is written
    directly under the root directory.

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.

    Raises
    ------
    ValueError
        If no column is left to save outside the partition columns.
    """
    from pyarrow import (
        Table,
        compat
    )

    fs = (_get_fs_from_path(root_path) if filesystem is None
          else _ensure_filesystem(filesystem))
    _mkdir_if_not_exists(fs, root_path)

    # Unpartitioned case: the whole table goes into one file under the root.
    if not (partition_cols is not None and len(partition_cols) > 0):
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
        return

    df = table.to_pandas()
    partition_keys = [df[col] for col in partition_cols]
    data_df = df.drop(partition_cols, axis='columns')
    data_cols = df.columns.drop(partition_cols)
    if len(data_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    for keys, subgroup in data_df.groupby(partition_keys):
        # A single partition column gives a scalar key instead of a tuple.
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = "/".join(
            "{colname}={value}".format(colname=name, value=val)
            for name, val in zip(partition_cols, keys))
        subtable = Table.from_pandas(subgroup,
                                     preserve_index=preserve_index)
        prefix = "/".join([root_path, subdir])
        _mkdir_if_not_exists(fs, prefix)
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([prefix, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(subtable, f, **kwargs)