Python Table 예제들, pyarrow.Table Python 예제들

예제 #1

0

파일 보기

def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the Versicolor rows and replace ``sepal.length`` by a normalized copy.

    The rows are first restricted to those whose ``variety`` equals
    ``"Versicolor"``. A ``normalized.sepal.length`` column is then appended,
    holding every remaining ``sepal.length`` divided by the maximum of the
    remaining ``sepal.length`` values, and the original ``sepal.length``
    column is dropped from the returned table.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    versicolor_rows = batch.filter(is_versicolor)

    sepal_length = versicolor_rows["sepal.length"]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = versicolor_rows.append_column(
        "normalized.sepal.length", normalized
    )
    return with_normalized.drop(["sepal.length"])

예제 #2

0

파일 보기

파일: sklearn_mutual_information.py 프로젝트: Bobgy/data-validation

def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Drops the feature columns that hold unsupported values.

  Two kinds of columns are removed:
    * the first path step of every feature reported by
      `schema_util.get_multivalent_features` for `schema` (multivalent
      features are not supported by sk-learn);
    * every column whose arrow type maps to the STRUCT feature type.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    The table returned by `examples_table.drop` for the collected columns.
  """
  columns_to_drop = {
      feature_path.steps()[0]
      for feature_path in schema_util.get_multivalent_features(schema)
  }
  struct_type = statistics_pb2.FeatureNameStatistics.STRUCT
  named_columns = zip(examples_table.schema.names,
                      examples_table.itercolumns())
  for name, column in named_columns:
    feature_type = stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([name]), column.type)
    if feature_type == struct_type:
      columns_to_drop.add(name)
  return examples_table.drop(columns_to_drop)

예제 #3

0

파일 보기

파일: formatting.py 프로젝트: ddhruvkr/datasets-1

def _query_table(pa_table: pa.Table, key: Union[int, slice, range, str,
                                                Iterable]) -> pa.Table:
    """
    Return the sub-table of a pyarrow Table selected by ``key``.

    - ``int``: the single row at ``key`` modulo the number of rows.
    - ``slice``: converted to a ``range`` over the table's rows, then handled
      like a ``range``.
    - ``range``: one contiguous slice when the range is contiguous and starts
      at a non-negative index, otherwise handled like any other iterable.
    - ``str``: the table restricted to the column named ``key``.
    - other ``Iterable``: the concatenation of the single-row slices for each
      index (taken modulo the number of rows), or an empty slice when the
      iterable is empty.

    Any other key type is passed to ``_raise_bad_key_type``.
    """
    num_rows = pa_table.num_rows

    if isinstance(key, int):
        return pa_table.slice(key % num_rows, 1)

    if isinstance(key, slice):
        key = range(*key.indices(num_rows))

    if isinstance(key, range) and _is_range_contiguous(key) and key.start >= 0:
        return pa_table.slice(key.start, key.stop - key.start)
    # every other range falls through and is treated as a generic iterable

    if isinstance(key, str):
        other_columns = (name for name in pa_table.column_names if name != key)
        return pa_table.drop(other_columns)

    if isinstance(key, Iterable):
        if len(key) == 0:
            return pa_table.slice(0, 0)
        # don't use pyarrow.Table.take even for pyarrow >=1.0 (see https://issues.apache.org/jira/browse/ARROW-9773)
        single_rows = (pa_table.slice(int(index) % num_rows, 1) for index in key)
        return pa.concat_tables(single_rows)

    _raise_bad_key_type(key)

예제 #4

0

파일 보기

파일: hash_bucket.py 프로젝트: amzn/amazon-ray

def group_by_pk_hash_bucket(table: pa.Table, num_buckets: int,
                            primary_keys: List[str]) -> np.ndarray:
    """Group the records of ``table`` into hash buckets by primary key digest.

    A primary key hash column is appended to the table, the primary key
    columns themselves are dropped, and every record index is assigned to the
    bucket computed by ``pk_digest_to_hash_bucket_index`` from its digest.

    Args:
        table: Table holding the records to group.
        num_buckets: Number of hash buckets.
        primary_keys: Names of the primary key columns.

    Returns:
        An object array of length ``num_buckets``. Each bucket that received
        at least one record holds the result of ``sc.append_record_idx_col``
        for the rows taken at that bucket's record indices; the other entries
        are left as created by ``np.empty``.
    """
    # casting a primary key column to numpy also ensures no nulls exist
    pk_columns_np = [table[pk_name].to_numpy() for pk_name in primary_keys]
    table = sc.append_pk_hash_column(
        table, hash_pk_bytes_generator(pk_columns_np))

    # the primary key columns are no longer needed; drop them to free memory
    table = table.drop(primary_keys)

    # collect, per bucket, the indices of the records hashed into it
    bucket_indices = np.empty([num_buckets], dtype="object")
    for record_index, digest in enumerate(sc.pk_hash_column_np(table)):
        bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
        if bucket_indices[bucket] is None:
            bucket_indices[bucket] = []
        bucket_indices[bucket].append(record_index)

    # build one table per non-empty bucket, with the record index column added
    bucket_tables = np.empty([num_buckets], dtype="object")
    for bucket, indices in enumerate(bucket_indices):
        if indices:
            bucket_tables[bucket] = sc.append_record_idx_col(
                table.take(indices),
                indices,
            )
    return bucket_tables

예제 #5

0

파일 보기

파일: move_to_dfp.py 프로젝트: ygoleite/SnowplowDeployments

def add_page_pings_enabled_col(table: pa.Table) -> pa.Table:
    """Append a ``page_pings_enabled`` column derived from the ``contexts`` column.

    The new column holds, per row, whether the ``contexts`` string contains
    the heartbeat schema prefix.

    Page views with page pings enabled have the 'heartbeat' context added.
    The context also tells how many seconds there are between each page ping;
    for now that value is hard coded to 30s elsewhere, but it can be extracted
    from the heartbeat context if needed.
    """
    contexts = table.column('contexts').to_pandas()
    page_pings_enabled = contexts.str.contains(
        'iglu:dk.jyllands-posten/heartbeat/jsonschema/')
    # noinspection PyCallByClass,PyTypeChecker
    new_column = pa.Column.from_array('page_pings_enabled', page_pings_enabled)
    return table.append_column(new_column)

예제 #6

0

파일 보기

파일: formatting.py 프로젝트: ddhruvkr/datasets-1

def format_table(
    pa_table: pa.Table,
    key: Union[int, slice, range, str, Iterable],
    formatter: Formatter,
    format_columns: Optional[list] = None,
    output_all_columns=False,
):
    """
    Apply a Formatter to a pyarrow Table according to the key that was queried.

    Args:
        pa_table (``pyarrow.Table``): The pyarrow Table to format.
        key (``Union[int, slice, range, str, Iterable]``): The key that was used for the query; it decides whether
            the table is formatted as a row, a column or a batch.
        formatter (``datasets.formatting.formatting.Formatter``): Any Formatter subclass, e.g. PythonFormatter,
            NumpyFormatter, etc.
        format_columns (Optional ``List[str]``): When not None, only these columns are formatted with ``formatter``.
            The other columns are discarded, unless ``output_all_columns`` is True.
        output_all_columns (``bool``, defaults to False): When True, the columns that are not listed in
            ``format_columns`` are formatted with the PythonFormatter and added to the output.

    Returns:
        A row, column or batch formatted object defined by the Formatter:
        - the PythonFormatter returns a dictionary for a row or a batch, and a list for a column.
        - the NumpyFormatter returns a dictionary for a row or a batch, and a np.array for a column.
        - the PandasFormatter returns a pd.DataFrame for a row or a batch, and a pd.Series for a column.
        - the TorchFormatter returns a dictionary for a row or a batch, and a torch.Tensor for a column.
        - the TFFormatter returns a dictionary for a row or a batch, and a tf.Tensor for a column.

    Raises:
        TypeError: if ``output_all_columns`` is True and the formatter output is not a Mapping.
    """
    query_type = key_to_query_type(key)
    python_formatter = PythonFormatter()

    # no column restriction: the formatter handles the whole table
    if format_columns is None:
        return formatter(pa_table, query_type=query_type)

    # a single column: it is formatted by `formatter` only when it was requested
    if query_type == "column":
        if key not in format_columns:
            return python_formatter(pa_table, query_type=query_type)
        return formatter(pa_table, query_type)

    # a row or a batch: format only the requested columns
    columns_to_discard = (name for name in pa_table.column_names
                          if name not in format_columns)
    formatted_output = formatter(pa_table.drop(columns_to_discard),
                                 query_type=query_type)
    if not output_all_columns:
        return formatted_output

    if not isinstance(formatted_output, Mapping):
        raise TypeError(
            f"Custom formatting function must return a dict to work with output_all_columns=True, but got {formatted_output}"
        )

    # complete the output with the python-formatted remaining columns
    already_formatted = (name for name in pa_table.column_names
                         if name in format_columns)
    remaining_columns_dict = python_formatter(
        pa_table.drop(already_formatted), query_type=query_type)
    formatted_output.update(remaining_columns_dict)
    return formatted_output

예제 #7

0

파일 보기

파일: test_ensemble_summary_provider_impl_arrow_lazy.py 프로젝트: CeetronSolutions/webviz-subsurface

def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    """Split ``table`` into one table per distinct value of its ``REAL`` column.

    Returns a dict keyed by the ``REAL`` value; each entry holds the rows of
    that realization with the ``REAL`` column itself removed.
    """
    tables_by_real: Dict[int, pa.Table] = {}
    for real in table.column("REAL").unique().to_pylist():
        # pylint: disable=no-member
        rows_of_real = pc.is_in(table["REAL"], value_set=pa.array([real]))
        tables_by_real[real] = table.filter(rows_of_real).drop(["REAL"])

    return tables_by_real

예제 #8

0

파일 보기

파일: converttotext.py 프로젝트: CJWorkbench/converttotext

def render_arrow_v1(table: pa.Table, params, **kwargs):
    """Replace every column listed in ``params["colnames"]`` by its formatted form.

    Each selected column is passed, together with its schema field, to
    ``format_chunked_array`` and the result is put back at the same position
    under the same name. Columns that are not selected are left untouched.
    The final table is returned wrapped in an ``ArrowRenderResult``.
    """
    selected = frozenset(params["colnames"])

    for position, name in enumerate(table.column_names):
        if name in selected:
            formatted = format_chunked_array(
                table.column(position), table.schema.field(position))
            table = table.set_column(position, name, formatted)

    return ArrowRenderResult(table)

예제 #9

0

파일 보기

def _convert_arrow_to_proto(
    table: pyarrow.Table, feature_view: FeatureView
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime,
                Optional[datetime]]]:
    """Convert the rows of ``table`` into writable proto tuples.

    For every row a tuple ``(entity_key, feature_dict, event_timestamp,
    created_timestamp)`` is produced:

    - ``entity_key`` lists the names and proto values of the entities of
      ``feature_view``;
    - ``feature_dict`` maps every feature name of ``feature_view`` to the
      proto value of that row;
    - ``event_timestamp`` is the row value of the event timestamp column;
    - ``created_timestamp`` is the row value of the created timestamp column,
      or None when ``feature_view.input`` defines no such column.
    """
    column_names = table.column_names

    def value_at(row, column_name):
        # rows are tuples ordered like the table columns
        return row[column_names.index(column_name)]

    rows_to_write = []
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for entity_name in feature_view.entities:
            entity_key.entity_names.append(entity_name)
            entity_key.entity_values.append(
                python_value_to_proto_value(value_at(row, entity_name)))

        feature_dict = {
            feature.name: python_value_to_proto_value(
                value_at(row, feature.name))
            for feature in feature_view.features
        }

        event_timestamp = value_at(
            row, feature_view.input.event_timestamp_column)

        created_column = feature_view.input.created_timestamp_column
        if created_column is None:
            created_timestamp = None
        else:
            created_timestamp = value_at(row, created_column)

        rows_to_write.append(
            (entity_key, feature_dict, event_timestamp, created_timestamp))
    return rows_to_write

예제 #10

0

파일 보기

def find_nonnull_table_mask(table: pa.Table) -> pa.Array:
    """Build a boolean mask of the rows that hold no null in any column.

    The mask starts as all-true with one entry per table row and is AND-ed
    with the validity of every column in turn.

    Args:
        table: Table whose rows are checked for nulls.

    Returns:
        A boolean array with ``table.num_rows`` entries; an entry is true only
        when the row is valid (non-null) in every column.
    """
    mask = pa.array(np.ones(table.num_rows), pa.bool_())

    for column in table.itercolumns():
        # Columns are chunked arrays. Combine the chunks so that the validity
        # covers every row: looking at chunks[0] alone ignored all later
        # chunks (length mismatch with the mask) and failed on a column
        # without any chunk.
        mask = pa.compute.and_(mask, column.combine_chunks().is_valid())

    return mask

예제 #11

0

파일 보기

파일: network_sender.py 프로젝트: sadeemsaleh/texera

    def _send_data(self, to: ActorVirtualIdentity, data_payload: DataPayload) -> None:
        """
        Forward a data payload to the target actor through the proxy client. Internal use only.

        An OutputDataFrame is converted to an Arrow table and sent with a header whose ``is_end`` is False;
        an EndOfUpstream is sent as a header whose ``is_end`` is True, without a table.

        :param to: The target actor's ActorVirtualIdentity
        :param data_payload: The data payload to be sent, either an OutputDataFrame or an EndOfUpstream
        :raises TypeError: if the payload is of neither type
        """
        if isinstance(data_payload, OutputDataFrame):
            # converting from a column-based dictionary is the fastest known method
            # https://stackoverflow.com/questions/57939092/fastest-way-to-construct-pyarrow-table-row-by-row
            columns = {}
            for name in data_payload.schema.names:
                columns[name] = [row[name] for row in data_payload.frame]
            table = Table.from_pydict(columns, schema=data_payload.schema)
            header = PythonDataHeader(tag=to, is_end=False)
            self._proxy_client.send_data(bytes(header), table)
            return

        if isinstance(data_payload, EndOfUpstream):
            header = PythonDataHeader(tag=to, is_end=True)
            self._proxy_client.send_data(bytes(header), None)
            return

        raise TypeError(f"Unexpected payload {data_payload}")

예제 #12

0

파일 보기

def arrow_to_pydf(data: pa.Table,
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Build a PyDataFrame from an Arrow Table.

    When ``columns`` is given, the table columns are renamed to it first; a
    rename rejected by Arrow is reported as a ValueError. Every column is then
    passed through ``coerce_arrow``; a column without a name is stored as
    ``column_<position>``. The result is rechunked unless ``rechunk`` is False.
    """
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    coerced = {}
    for position, column in enumerate(data):
        # the name is read before the column is replaced by its coerced form
        name = f"column_{position}" if column._name is None else column._name
        coerced[name] = coerce_arrow(column)

    pydf = PyDataFrame.from_arrow_record_batches(
        pa.table(coerced).to_batches())
    return pydf.rechunk() if rechunk else pydf

예제 #13

0

파일 보기

파일: test_filters.py 프로젝트: statisticsnorway/microdata-data-service

def test_by_time_period_from_7670_to_8400():
    """Compare ``filter_by_time_period`` for 7670..8400 with a hand-written table.

    The frames are compared with ``check_dtype=False``, so only the values
    have to agree, not the column dtypes.
    """
    expected_columns = {
        'unit_id': [
            1000000002, 1000000004, 1000000003, 1000000001,
            1000000001, 1000000003, 1000000003, 1000000001,
            1000000002,
        ],
        'value': ["8", "2", "12", "3", "16", "2", "12", "3", "8"],
        'start_epoch_days': [
            1461, 3287, 4018, 5479, 7851, 7701, 7957, 8126, 8066,
        ],
        'stop_epoch_days': [
            8065, 7710, 7700, 7850, 8125, 7956, np.nan, np.nan, np.nan,
        ],
    }
    expected = Table.from_pydict(expected_columns)
    print_expected(expected)

    actual = filter_by_time_period(
        TEST_BOSTED_PARQUET_DIR, 7670, 8400, None, True)
    print_actual(actual)

    expected_df = expected.to_pandas()
    actual_df = actual.to_pandas()
    assert_frame_equal(expected_df, actual_df, check_dtype=False)

예제 #14

0

파일 보기

def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert an Arrow table to a pandas DataFrame and post-process it.

    The ``pandas`` entry of the schema metadata, when present, is decoded
    before the conversion. The table is converted with ``ignore_metadata=True``
    and the result goes through ``_apply_partitions`` and
    ``_utils.ensure_df_is_mutable``. When pandas metadata was found, it is then
    applied through ``_apply_index`` and ``_apply_timezone``.
    """
    # read the metadata up front: the conversion below uses self_destruct=True
    schema_metadata = table.schema.metadata
    metadata: Dict[str, Any] = {}
    if schema_metadata is not None and b"pandas" in schema_metadata:
        metadata = json.loads(schema_metadata[b"pandas"])

    converted = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        strings_to_categorical=False,
        safe=safe,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    df: pd.DataFrame = _apply_partitions(
        df=converted,
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    df = _utils.ensure_df_is_mutable(df=df)
    if not metadata:
        return df

    _logger.debug("metadata: %s", metadata)
    df = _apply_index(df=df, metadata=metadata)
    return _apply_timezone(df=df, metadata=metadata)

예제 #15

0

파일 보기

파일: arrow_writer.py 프로젝트: rosakun/datasets

    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Write a pyarrow Table to the writer, one record batch at a time.

        The writer is built from the table schema when it does not exist yet.
        The table is cast to ``self._schema`` before being split into batches,
        and the byte and example counters are updated before the batches are
        written.

        Args:
            pa_table: the Table to write.
            writer_batch_size: maximum chunk size of the written batches;
                ``self.writer_batch_size`` is used when None.
        """
        if writer_batch_size is None:
            batch_size = self.writer_batch_size
        else:
            batch_size = writer_batch_size

        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)

        casted = pa_table.cast(self._schema)
        batches: List[pa.RecordBatch] = casted.to_batches(max_chunksize=batch_size)
        self._num_bytes += sum(batch.nbytes for batch in batches)
        self._num_examples += casted.num_rows
        for batch in batches:
            self.pa_writer.write_batch(batch)

예제 #16

0

파일 보기

파일: arrow.py 프로젝트: willschlitzer/geopandas

def _geopandas_to_arrow(df, index=None):
    """
    Shared logic of to_parquet/to_feather: turn a data frame into an Arrow table.

    A UserWarning about the stability of the metadata specification is always
    issued. The frame is validated, its geo metadata is created, and the frame
    is WKB-encoded and converted to a table; the encoded geo metadata is then
    stored under the ``b"geo"`` key of the table's schema metadata.
    """
    from pyarrow import Table

    message = (
        "this is an initial implementation of Parquet/Feather file support and "
        "associated metadata.  This is tracking version 0.1.0 of the metadata "
        "specification at "
        "https://github.com/geopandas/geo-arrow-spec\n\n"
        "This metadata specification does not yet make stability promises.  "
        "We do not yet recommend using this in a production setting unless you "
        "are able to rewrite your Parquet/Feather files.\n\n"
        "To further ignore this warning, you can do: \n"
        "import warnings; warnings.filterwarnings('ignore', "
        "message='.*initial implementation of Parquet.*')"
    )
    warnings.warn(message, UserWarning, stacklevel=4)

    _validate_dataframe(df)

    # the geo metadata is created from the frame as received, before encoding
    geo_metadata = _create_metadata(df)

    table = Table.from_pandas(_encode_wkb(df), preserve_index=index)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    schema_metadata = table.schema.metadata
    encoded_geo = _encode_metadata(geo_metadata)
    schema_metadata.update({b"geo": encoded_geo})
    return table.replace_schema_metadata(schema_metadata)

예제 #17

0

파일 보기

파일: parse.py 프로젝트: uk-gov-mirror/moj-analytical-services.mojap-arrow-pd-parser

def cast_arrow_table_to_schema(
    tab: pa.Table,
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
):
    """Cast an arrow table to a full or partial schema.

    Args:
        tab (pa.Table): An arrow table
        schema (Union[pa.Schema, None], optional): The schema to cast to.
            Defaults to None.
        expect_full_schema (bool, optional): if True, ``schema`` is used as
            is and is expected to have a field for every column. If False,
            ``schema`` is merged into the table's current schema through
            ``update_existing_schema``, so that only the columns listed in
            ``schema`` are cast and all other columns keep their type.

    Returns:
        The result of ``tab.cast`` with the selected schema.
    """
    target_schema = (
        schema if expect_full_schema
        else update_existing_schema(tab.schema, schema)
    )
    return tab.cast(target_schema)

예제 #18

0

파일 보기

    def _arrow_table_to_pandas(
        cls, arrow_table: pa.Table, categories, **kwargs
    ) -> pd.DataFrame:
        """Convert an Arrow table to a pandas DataFrame.

        Args:
            arrow_table: The table to convert.
            categories: Forwarded to ``to_pandas`` as ``categories``.
            **kwargs: Only the ``arrow_to_pandas`` entry is read; it is a dict
                of extra keyword arguments for ``to_pandas``.

        Returns:
            The DataFrame returned by ``arrow_table.to_pandas``.

        ``use_threads=False`` and ``ignore_metadata=False`` are always
        enforced, overriding any value given in ``arrow_to_pandas``.
        """
        # Work on a copy: updating the dict in place would write the enforced
        # options back into the dict owned by the caller.
        _kwargs = dict(kwargs.get("arrow_to_pandas") or {})
        _kwargs.update({"use_threads": False, "ignore_metadata": False})

        return arrow_table.to_pandas(categories=categories, **_kwargs)

예제 #19

0

파일 보기

파일: _read_parquet.py 프로젝트: rparthas/aws-data-wrangler

def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    """Convert an Arrow table to a mutable pandas DataFrame.

    The table is converted with ``ignore_metadata=True``; the result goes
    through ``_apply_partitions`` and is returned by
    ``_utils.ensure_df_is_mutable``.
    """
    converted = table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        categories=categories,
        safe=safe,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
    df: pd.DataFrame = _apply_partitions(
        df=converted,
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=df)

예제 #20

0

파일 보기

  def register_file_by_path(self, path):
    """Load a delimited text file and register it under its base name.

    The file is read with the separators ',', '|' and tab, in that order.
    The first separator for which both the pandas read and the conversion to
    a columnar table succeed wins: the data frame and the columnar table are
    registered under the file name without extension and no further
    separator is tried. When every separator fails, a message and the last
    caught exception are printed.
    """
    root, fname = os.path.split(path)
    tablename, _ = os.path.splitext(fname)
    fpath = os.path.join(root, fname)
    exception = None
    for sep in (',', '|', '\t'):
      try:
        with openfile(fpath) as f:
          df = pandas.read_csv(f, sep=sep) # TODO delete
        columnar_tb = pa_tb.from_pandas(df)
      except Exception as e:
        # remember the failure and move on to the next separator
        exception = e
        continue

      if df is None or columnar_tb is None:
        continue

      self.register_dataframe(tablename, df)
      self.register_columnar_tb(tablename, columnar_tb)
      break
    else:
      # no separator produced a usable table
      print("Failed to read data file %s" % (fpath))
      print(exception)

예제 #21

0

파일 보기

파일: stats_impl.py 프로젝트: Bobgy/data-validation

def generate_partial_statistics_in_memory(
    table: pa.Table, options: stats_options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generates partial statistics for an in-memory table of examples.

    When `options.feature_whitelist` is set, the table is first rebuilt from
    the whitelisted columns only.

    Args:
      table: Arrow table.
      options: Options for generating data statistics.
      stats_generators: A list of combiner statistics generators.

    Returns:
      A list with, for each generator, a fresh accumulator to which the table
      has been added.
    """
    whitelist = options.feature_whitelist
    if whitelist:
        table = pa.Table.from_arrays(
            [table.column(name).data for name in whitelist],
            list(whitelist))
    return [
        generator.add_input(generator.create_accumulator(), table)
        for generator in stats_generators
    ]

예제 #22

0

파일 보기

파일: test_arrow.py 프로젝트: Imanflow/geopandas

def test_parquet_invalid_metadata(tmpdir, geo_meta, error):
    """Reading a file whose geo metadata is invalid must raise a ValueError.

    The parquet file is written directly with pyarrow here, so that the test
    controls the metadata that ends up in the file.
    """

    from pyarrow import parquet, Table

    test_dataset = "naturalearth_lowres"

    # plain DataFrame with the geometry encoded to WKB
    frame = DataFrame(read_file(get_path(test_dataset)))
    frame["geometry"] = to_wkb(frame["geometry"].values)

    table = Table.from_pandas(frame)
    schema_metadata = table.schema.metadata
    schema_metadata.update(geo_meta)
    table = table.replace_schema_metadata(schema_metadata)

    filename = os.path.join(str(tmpdir), "test.pq")
    parquet.write_table(table, filename)

    with pytest.raises(ValueError, match=error):
        read_parquet(filename)

예제 #23

0

파일 보기

파일: arrow_util.py 프로젝트: Bobgy/data-validation

def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
    """Fetches a column of the input table and checks that it can be broadcast.

    Only the first chunk of the column is read.

    Args:
      input_table: Input table.
      column_name: Name of the column to be retrieved and validated.
        This column must refer to a ListArray in which each list has length 1.

    Returns:
      An arrow array containing a flattened view of the broadcast column.

    Raises:
      ValueError: If the column is not present in the input table, or if it
          does not hold exactly one value per example.
    """
    try:
        column = input_table.column(column_name).data.chunk(0)
    except KeyError:
        raise ValueError(
            'Column "{}" not present in the input table.'.format(column_name))

    # Flattening is only valid when every example holds a single value.
    lengths = array_util.ListLengthsFromListArray(column).to_numpy()
    if np.all(lengths == 1):
        return column.flatten()

    raise ValueError(
        'Column "{}" must have exactly one value in each example.'.format(
            column_name))

예제 #24

0

파일 보기

    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:
        """Convert an Arrow table to a pandas DataFrame.

        Schema metadata is ignored, dates and timestamps are not converted to
        Python objects, and Arrow types are mapped through the ``get`` lookup
        of the class's Arrow-to-pandas type mapping.
        """
        conversion_options = {
            "ignore_metadata": True,
            "date_as_object": False,
            "timestamp_as_object": False,
            "types_mapper": cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
        }
        return table.to_pandas(**conversion_options)  # noqa

예제 #25

0

파일 보기

파일: _resampling.py 프로젝트: rnyb/webviz-subsurface

def resample_single_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample a table holding a single realization to the given frequency.

    The table must contain a DATE column and it must be sorted on DATE.

    The sample dates are generated between the smallest and the largest raw
    date. In the result, DATE holds the sample dates and REAL repeats the
    first REAL value of the input. Every other column is resampled at the
    sample dates: with ``interpolate_backfill`` when its field is flagged as
    a rate by ``is_rate_from_field_meta``, with ``np.interp`` otherwise.
    The result keeps the schema of the input table.
    """

    # Notes:
    # Getting meta data using json.loads() takes quite a bit of time!!
    # We should provide this info in another way.

    schema = table.schema

    raw_dates = table.column("DATE").to_numpy()
    raw_dates_as_uint = raw_dates.astype(np.uint64)

    sample_dates = generate_normalized_sample_dates(
        np.min(raw_dates), np.max(raw_dates), freq=freq
    )
    sample_dates_as_uint = sample_dates.astype(np.uint64)

    def resampled_column(colname):
        # DATE and REAL are rebuilt; every other column is interpolated
        if colname == "DATE":
            return sample_dates
        if colname == "REAL":
            return np.full(len(sample_dates), table.column("REAL")[0].as_py())

        raw_values = table.column(colname).to_numpy()
        if is_rate_from_field_meta(table.field(colname)):
            return interpolate_backfill(
                sample_dates_as_uint, raw_dates_as_uint, raw_values, 0, 0
            )
        return np.interp(sample_dates_as_uint, raw_dates_as_uint, raw_values)

    column_arrays = [resampled_column(colname) for colname in schema.names]

    return pa.table(column_arrays, schema=schema)

예제 #26

0

파일 보기

파일: __init__.py 프로젝트: fairtide/DataFrame

def write_table(table: pyarrow.Table, compression_level=0) -> bytes:
    """Encode a table as a BSON document with one entry per column.

    The chunks of the table are combined first. Each column is then
    serialized by ``write_array`` from its first chunk and stored, under the
    column name, as a raw BSON document. Columns keep the table order.
    """
    combined = table.combine_chunks()
    doc = collections.OrderedDict(
        (
            col.name,
            bson.raw_bson.RawBSONDocument(
                write_array(col.data.chunks[0], compression_level)),
        )
        for col in combined.columns
    )

    return bson.encode(doc)

예제 #27

0

파일 보기

파일: table.py 프로젝트: merveenoyan/datasets

def _write_table_to_file(table: pa.Table, filename: str) -> int:
    """Write ``table`` to ``filename`` as a record batch stream.

    Returns the sum of the ``nbytes`` of the written record batches.
    """
    with open(filename, "wb") as sink:
        writer = pa.RecordBatchStreamWriter(sink=sink, schema=table.schema)
        written_bytes = 0
        for batch in table.to_batches():
            writer.write_batch(batch)
            written_bytes += batch.nbytes
        writer.close()
    return written_bytes

예제 #28

0

파일 보기

파일: stats_impl.py 프로젝트: Bobgy/data-validation

 def add_input(self, accumulator: List[float],
               examples_table: pa.Table) -> List[float]:
     """Adds a table of examples to the accumulator, in place.

     Slot 0 grows by the number of rows of the table. When a weight feature
     is configured, slot 1 grows by the sum of the flattened values of every
     chunk of the weight column. The same accumulator list is returned.
     """
     accumulator[0] += examples_table.num_rows
     if not self._weight_feature:
         return accumulator

     weights_column = examples_table.column(self._weight_feature)
     for chunk in weights_column.data.iterchunks():
         flat_weights = np.asarray(chunk.flatten())
         accumulator[1] += np.sum(flat_weights)
     return accumulator

예제 #29

0

파일 보기

파일: rowiter.py 프로젝트: simonla/beancount-1

def row_iter(table: pyarrow.Table):
    """Yield the rows of the table one by one as ``Row`` namedtuples.

    The namedtuple fields are the column names of the table; the values are
    converted to Python objects.
    """
    # pylint: disable=invalid-name
    Row = collections.namedtuple("Row", table.column_names)
    for offset in range(table.num_rows):
        single_row = table.slice(offset, 1)
        values = [column[0].as_py() for column in single_row.itercolumns()]
        yield Row(*values)

예제 #30

0

파일 보기

def reencode_dictionaries(table: pa.Table) -> pa.Table:
    """Replace every dictionary-typed column by its re-encoded form.

    For each dictionary column, the first chunk is passed to
    ``reencode_dictionary_array`` and the result replaces the column at the
    same position under the same name. Other columns are left untouched.
    """
    for position in range(table.num_columns):
        column = table.columns[position]
        if not pa.types.is_dictionary(column.type):
            continue
        # NOTE: only chunks[0] of the column is re-encoded
        reencoded = reencode_dictionary_array(column.chunks[0])
        table = table.set_column(
            position, table.column_names[position], reencoded)
    return table

예제 #31

0

파일 보기

파일: parquet.py 프로젝트: sunchao/arrow

def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Write a Table to Parquet format by partitions, on top of
    parquet.write_table.

    One subdirectory is created for each combination of partition columns
    and values, in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Without partition columns, the whole table is written to a single
    <uuid>.parquet file directly under the root directory.

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.

    Raises
    ------
    ValueError
        If no column is left to save besides the partition columns.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    def write_piece(piece, directory):
        # every piece goes to a new, uniquely named file of its directory
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([directory, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(piece, f, **kwargs)

    # no partitioning requested: a single file under the root
    if partition_cols is None or len(partition_cols) == 0:
        write_piece(table, root_path)
        return

    df = table.to_pandas()
    partition_keys = [df[col] for col in partition_cols]
    data_df = df.drop(partition_cols, axis='columns')
    data_cols = df.columns.drop(partition_cols)
    if len(data_cols) == 0:
        raise ValueError("No data left to save outside partition columns")

    for keys, subgroup in data_df.groupby(partition_keys):
        # a single partition column yields a scalar key
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = "/".join(
            "{colname}={value}".format(colname=name, value=val)
            for name, val in zip(partition_cols, keys))
        subtable = Table.from_pandas(subgroup,
                                     preserve_index=preserve_index)
        prefix = "/".join([root_path, subdir])
        _mkdir_if_not_exists(fs, prefix)
        write_piece(subtable, prefix)