Example #1
def test_arrow_table_roundtrip():
    import pyarrow as pa
    from pandas.core.arrays._arrow_utils import ArrowPeriodType

    arr = PeriodArray([1, 2, 3], freq="D")
    arr[1] = pd.NaT
    df = pd.DataFrame({"a": arr})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowPeriodType)
    result = table.to_pandas()
    assert isinstance(result["a"].dtype, PeriodDtype)
    tm.assert_frame_equal(result, df)

    table2 = pa.concat_tables([table, table])
    result = table2.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(result, expected)
Example #2
def test_parquet_nested_extension(tmpdir):
    # Parquet support for extension types nested in struct or list
    import pyarrow.parquet as pq

    ext_type = IntegerType()
    storage = pa.array([4, 5, 6, 7], type=pa.int64())
    ext_array = pa.ExtensionArray.from_storage(ext_type, storage)
    struct_array = pa.StructArray.from_arrays(
        [storage, ext_array],
        names=['ints', 'exts'])

    orig_table = pa.table({'structs': struct_array})
    filename = tmpdir / 'nested_extension_type.parquet'
    pq.write_table(orig_table, filename)

    table = pq.read_table(filename)
    assert table.column(0).type == struct_array.type
    assert table == orig_table
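The IntegerType extension type used above is defined elsewhere in the test module. A minimal sketch of such a type, assuming pyarrow's PyExtensionType helper (the class name and int64 storage type here are illustrative, not taken from the snippet):

import pyarrow as pa

class IntegerType(pa.PyExtensionType):
    """Toy extension type wrapping an int64 storage array (assumed definition)."""

    def __init__(self):
        pa.PyExtensionType.__init__(self, pa.int64())

    def __reduce__(self):
        # PyExtensionType uses pickle for (de)serialization, so the type
        # must be reconstructible from __reduce__.
        return IntegerType, ()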
Example #3
def test_arrow_table_roundtrip(breaks):
    import pyarrow as pa
    from pandas.core.arrays._arrow_utils import ArrowIntervalType

    arr = IntervalArray.from_breaks(breaks)
    arr[1] = None
    df = pd.DataFrame({"a": arr})

    table = pa.table(df)
    assert isinstance(table.field("a").type, ArrowIntervalType)
    result = table.to_pandas()
    assert isinstance(result["a"].dtype, pd.IntervalDtype)
    tm.assert_frame_equal(result, df)

    table2 = pa.concat_tables([table, table])
    result = table2.to_pandas()
    expected = pd.concat([df, df], ignore_index=True)
    tm.assert_frame_equal(result, expected)
Example #4
 def test_only_dictionary_encode_for_big_savings(self):
     no_values = ["A", "B", "C"] * 10  # dictionary would give ~10x savings
     yes_values = ["A", "B"] * 15  # dictionary would give ~15x savings
     csv = "\n".join(f"{no},{yes}"
                     for no, yes in zip(no_values, yes_values))
     with _temp_csv(csv) as path:
         assert_csv_result_equals(
             _internal_parse_csv(path, has_header=False),
             ParseCsvResult(
                 pa.table({
                     "Column 1":
                     no_values,
                     "Column 2":
                     pa.array(yes_values).dictionary_encode(),
                 }),
                 [],
             ),
         )
Example #5
def test_ipc_format(tempdir):
    table = pa.table({'a': pa.array([1, 2, 3], type="int8"),
                      'b': pa.array([.1, .2, .3], type="float64")})

    path = str(tempdir / 'test.arrow')
    with pa.output_stream(path) as sink:
        writer = pa.RecordBatchFileWriter(sink, table.schema)
        writer.write_batch(table.to_batches()[0])
        writer.close()

    dataset = ds.dataset(path, format=ds.IpcFileFormat())
    result = dataset.to_table()
    assert result.equals(table)

    for format_str in ["ipc", "arrow"]:
        dataset = ds.dataset(path, format=format_str)
        result = dataset.to_table()
        assert result.equals(table)
Example #6
def test_table_repr_to_string():
    # Schema passed explicitly
    schema = pa.schema([
        pa.field('c0', pa.int16(), metadata={'key': 'value'}),
        pa.field('c1', pa.int32())
    ],
                       metadata={b'foo': b'bar'})

    tab = pa.table([
        pa.array([1, 2, 3, 4], type='int16'),
        pa.array([1, 2, 3, 4], type='int32')
    ],
                   schema=schema)
    assert str(tab) == """pyarrow.Table
c0: int16
c1: int32"""

    assert tab.to_string(show_metadata=True) == """\
Example #7
 def test_dictionary_encode_empty(self):
     # All empty strings => 0 bytes of text data. So Arrow doesn't create
     # a buffer ... and our buffer-size math must account for buf=None.
     with _temp_csv("A,B\n,\n,\n,\n,\n,\n") as path:
         assert_csv_result_equals(
             _internal_parse_csv(path,
                                 has_header=True,
                                 autoconvert_text_to_numbers=False),
             ParseCsvResult(
                 pa.table({
                     "A":
                     pa.array(["", "", "", "", ""]).dictionary_encode(),
                     "B":
                     pa.array(["", "", "", "", ""]).dictionary_encode(),
                 }),
                 [],
             ),
         )
Example #8
def test_v2_lz4_default_compression():
    # ARROW-8750: Make sure that the compression=None option selects lz4 if
    # it's available
    if not pa.Codec.is_available('lz4_frame'):
        pytest.skip("LZ4 compression support is not built in C++")

    # some highly compressible data
    t = pa.table([np.repeat(0, 100000)], names=['f0'])

    buf = io.BytesIO()
    write_feather(t, buf)
    default_result = buf.getvalue()

    buf = io.BytesIO()
    write_feather(t, buf, compression='uncompressed')
    uncompressed_result = buf.getvalue()

    assert len(default_result) < len(uncompressed_result)
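As a hedged follow-up (not part of the original test), the lz4-compressed default output can be read back with pyarrow.feather.read_table to confirm the data itself is unchanged:

import io

import numpy as np
import pyarrow as pa
from pyarrow.feather import read_table, write_feather

t = pa.table([np.repeat(0, 100000)], names=['f0'])
buf = io.BytesIO()
write_feather(t, buf)   # compression=None selects lz4 when available
buf.seek(0)
assert read_table(buf).equals(t)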
Example #9
def test_filter_errors():
    arr = pa.chunked_array([["a", None], ["c", "d", "e"]])
    batch = pa.record_batch([pa.array(["a", None, "c", "d", "e"])],
                            names=["a'"])
    table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])

    for obj in [arr, batch, table]:
        # non-boolean dtype
        mask = pa.array([0, 1, 0, 1, 0])
        with pytest.raises(NotImplementedError,
                           match="no kernel matching input types"):
            obj.filter(mask)

        # wrong length
        mask = pa.array([True, False, True])
        with pytest.raises(pa.ArrowInvalid,
                           match="must all be the same length"):
            obj.filter(mask)
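For contrast, a minimal sketch (not taken from the test) of a valid call: a boolean mask of the full length filters without raising.

import pyarrow as pa

table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
mask = pa.array([True, False, True, False, True])
# Rows where the mask is True are kept.
assert table.filter(mask).column("a").to_pylist() == ["a", "c", "e"]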
Example #10
def sssp(graph: PropertyGraph, source, length_property, shift, property_name):
    dists = create_distance_array(graph, source, length_property)
    init_bag = InsertBag[UpdateRequest]()
    init_bag.push((source, 0))

    t = StatTimer("Total SSSP")
    t.start()
    for_each(
        init_bag,
        sssp_operator(graph, dists, graph.get_edge_property(length_property)),
        worklist=OrderedByIntegerMetric(obim_indexer(shift)),
        disable_conflict_detection=True,
        loop_name="SSSP",
    )
    t.stop()
    print("Elapsed time: ", t.get(), "milliseconds.")

    graph.add_node_property(pyarrow.table({property_name: dists}))
Example #11
def accel_convert_to_table(accel, stream_name, user_name, start_time, total_records, frequency):
    ts = get_timestamps(start_time=start_time, total_records=accel[:, 0].size, frequency=frequency)
    print("Converting into Table: Stream Name: ", stream_name)
    try:
        ndarray_table = pa.table(
            {
                "timestamp": ts.get("timestamp"),
                "localtime": ts.get("localtime"),
                "x": accel[:, 0],
                "y": accel[:, 1],
                "z": accel[:, 2]
            }
        )
        file_path = (data_folder_path + "parsed/stream=" + stream_name +
                     "/version=1/user=" + user_name + "/").lower()
        Path(file_path).mkdir(parents=True, exist_ok=True)
        pq.write_table(ndarray_table, file_path + "data.parquet")
    except Exception as e:
        print(e)
Example #12
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
Example #13
def test_chunked_array_to_pandas_preserve_name():
    # https://issues.apache.org/jira/browse/ARROW-7709
    import pandas as pd
    import pandas.testing as tm

    for data in [
            pa.array([1, 2, 3]),
            pa.array(pd.Categorical(["a", "b", "a"])),
            pa.array(pd.date_range("2012", periods=3)),
            pa.array(pd.date_range("2012", periods=3, tz="Europe/Brussels")),
            pa.array([1, 2, 3], pa.timestamp("ms")),
            pa.array([1, 2, 3], pa.timestamp("ms", "Europe/Brussels"))
    ]:
        table = pa.table({"name": data})
        result = table.column("name").to_pandas()
        assert result.name == "name"
        expected = pd.Series(data.to_pandas(), name="name")
        tm.assert_series_equal(result, expected)
Example #14
def test_truncate_do_not_cause_invalid_utf8():
    workbook = xl.Workbook()
    sheet = workbook.add_sheet("X")
    for i, s in enumerate(
        [
            # Examples from https://en.wikipedia.org/wiki/UTF-8
            "AAAA",
            "AA\u00A2",  # ¢ (2 bytes) -- keep
            "AAA\u00A2",  # ¢ (2 bytes) -- drop both bytes
            "A\u0939",  # ह (3 bytes) -- keep
            "AA\u0939",  # ह (3 bytes) -- drop all three bytes
            "AAA\u0939",  # ह (3 bytes) -- drop all three bytes
            "\U00010348",  # 𐍈 (4 bytes) -- keep
            "A\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
            "AA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
            "AAA\U00010348",  # 𐍈 (4 bytes) -- drop all four bytes
        ]
    ):
        sheet.write(i, 0, s)

    result, stdout = do_convert_data(
        workbook,
        max_bytes_per_value=4,
        header_rows="",
        include_stdout=True,
    )
    expected = pyarrow.table(
        {
            "A": [
                "AAAA",
                "AA\u00A2",
                "AAA",
                "A\u0939",
                "AA",
                "AAA",
                "\U00010348",
                "A",
                "AA",
                "AAA",
            ]
        }
    )
    assert_table_equals(result, expected)
    assert stdout == b"truncated 6 values (value byte limit is 4; see row 2 column A)\n"
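The truncation rule the test exercises can be sketched independently (a hedged illustration, not the converter's actual implementation): cut at the byte limit and drop any trailing bytes of a split multi-byte sequence rather than emit invalid UTF-8.

def truncate_utf8(s: str, max_bytes: int) -> str:
    # Truncate to max_bytes, discarding a partially cut code point.
    return s.encode("utf-8")[:max_bytes].decode("utf-8", errors="ignore")

assert truncate_utf8("AAA\u00A2", 4) == "AAA"        # 2-byte ¢ dropped entirely
assert truncate_utf8("AA\u0939", 4) == "AA"          # 3-byte ह dropped entirely
assert truncate_utf8("A\U00010348", 4) == "A"        # 4-byte 𐍈 dropped entirely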
Example #15
def test_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "int_col2", IntegerType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.int32())
    ]
    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("int_col2", pa.int32(),
                                         nullable=True),
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #16
def write(parquet_path: Path, table: pyarrow.Table) -> None:
    """
    Write an Arrow table to a Parquet file, overwriting if needed.

    We aim to keep the file format "stable": all future versions of
    parquet.read() should support all files written by today's version of this
    function.

    Dictionary-encoded columns will stay dictionary-encoded. Practically,
    `parquet.write(path, table); table = parquet.read(path)` does not change
    `table`.
    """
    if table.num_rows == 0:
        # Workaround for https://issues.apache.org/jira/browse/ARROW-6568
        # If table is zero-length, guarantee it has a RecordBatch so Arrow
        # won't crash when writing a DictionaryArray.

        def empty_array_for_field(field):
            if pyarrow.types.is_dictionary(field.type):
                return pyarrow.DictionaryArray.from_arrays(
                    pyarrow.array([], type=field.type.index_type),
                    pyarrow.array([], type=field.type.value_type),
                )
            else:
                return pyarrow.array([], type=field.type)

        table = pyarrow.table({
            field.name: empty_array_for_field(field)
            for field in table.schema
        })

    pyarrow.parquet.write_table(
        table,
        str(parquet_path),
        version="2.0",
        compression="SNAPPY",
        # Preserve whatever dictionaries we have in Pandas. Write+read
        # should return an exact copy.
        use_dictionary=[
            name.encode("utf-8")
            for name, column in zip(table.column_names, table.columns)
            if pyarrow.types.is_dictionary(column.type)
        ],
    )
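A minimal usage sketch, assuming a pyarrow version that still accepts version="2.0" and that reading back with pyarrow.parquet.read_table behaves like the parquet.read() mentioned in the docstring; the path is hypothetical.

from pathlib import Path

import pyarrow
import pyarrow.parquet

table = pyarrow.table({"A": pyarrow.array(["x", "y", "x"]).dictionary_encode()})
path = Path("example.parquet")  # hypothetical location
write(path, table)
roundtripped = pyarrow.parquet.read_table(str(path))
# Per the docstring above, dictionary encoding should survive the roundtrip.
assert roundtripped.schema.field("A").type == table.schema.field("A").type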
Example #17
    def test_use_nullable_dtypes(self, engine):
        import pyarrow.parquet as pq

        if engine == "fastparquet":
            # We are manually disabling fastparquet's
            # nullable dtype support pending discussion
            pytest.skip("Fastparquet nullable dtype support is disabled")

        table = pyarrow.table({
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
            # Test that nullable dtypes used even in absence of nulls
            "e": pyarrow.array([1, 2, 3, 4], "int64"),
        })
        with tm.ensure_clean() as path:
            # write manually with pyarrow to write integers
            pq.write_table(table, path)
            result1 = read_parquet(path, engine=engine)
            result2 = read_parquet(path,
                                   engine=engine,
                                   use_nullable_dtypes=True)

        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame({
            "a":
            pd.array([1, 2, 3, None], dtype="Int64"),
            "b":
            pd.array([1, 2, 3, None], dtype="UInt8"),
            "c":
            pd.array(["a", "b", "c", None], dtype="string"),
            "d":
            pd.array([True, False, True, None], dtype="boolean"),
            "e":
            pd.array([1, 2, 3, 4], dtype="Int64"),
        })
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)
Example #18
    def transpose(self, in_place: bool = True):
        """\
        Transpose whole object.

        Data matrix is transposed, observations and variables are interchanged.
        """
        scdata = self.to_memory().read_all().select(
            self.var).to_pandas().transpose()
        obs = self.format_obs(pa.table(self.var), self.to_memory())
        var = self.obs
        if not in_place:
            return SCData(scdata, obs, var, self.uns, self.obsm, self.varm)
        else:
            scdata: pa.Table = pa.Table.from_pandas(df=scdata,
                                                    preserve_index=True,
                                                    nthreads=self.use_cores)
            scdata = self.ensure_scdata_format(scdata, obs, var, self.obsm,
                                               self.varm, self.uns)
            self.update_scdata(scdata)
Example #19
def test_convert_uint8_uint16_uint32():
    # parquet only stores int32/int64 values natively. These are upcast to
    # be encoded.
    _test_convert_via_arrow(
        pyarrow.table({
            "u8":
            pyarrow.array([1, 138, None], type=pyarrow.uint8()),
            "u16":
            pyarrow.array([1, 38383, None], type=pyarrow.uint16()),
            "u32":
            pyarrow.array([1, 4294967291, None], type=pyarrow.uint32()),
        }),
        "u8,u16,u32\r\n1,1,1\r\n138,38383,4294967291\r\n,,",
        [
            dict(u8=1, u16=1, u32=1),
            dict(u8=138, u16=38383, u32=4294967291),
            dict(u8=None, u16=None, u32=None),
        ],
    )
Example #20
def test_parquet_period(tmpdir, registered_period_type):
    # Parquet support for primitive extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # Stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    # Since the type could be reconstructed, the extension type metadata is
    # absent.
    assert schema.field("ext").metadata == {}

    # When reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}
    # Get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) is period_class

    # When the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # The extension metadata is present for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1',
    }
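A hedged sketch of what the registered_period_type fixture plausibly provides (class names and details are assumptions, not taken from the test): an int64-backed extension type named "test.period" carrying "freq=D" metadata, plus a custom array class, registered before the test runs.

import pyarrow as pa

class PeriodArray(pa.ExtensionArray):
    """Assumed array class returned for the extension type."""

class PeriodType(pa.ExtensionType):
    def __init__(self, freq):
        self._freq = freq
        pa.ExtensionType.__init__(self, pa.int64(), "test.period")

    def __arrow_ext_serialize__(self):
        return "freq={}".format(self._freq).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls(serialized.decode().split("=")[1])

    def __arrow_ext_class__(self):
        return PeriodArray

period_type = PeriodType("D")
pa.register_extension_type(period_type)
# ... the test body runs here; it unregisters the type itself at the end ...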
Example #21
def test_sort_indices_table():
    table = pa.table({"a": [1, 1, 0], "b": [1, 0, 1]})

    result = pc.sort_indices(table, sort_keys=[("a", "ascending")])
    assert result.to_pylist() == [2, 0, 1]

    result = pc.sort_indices(table,
                             sort_keys=[("a", "ascending"),
                                        ("b", "ascending")])
    assert result.to_pylist() == [2, 1, 0]

    with pytest.raises(ValueError, match="Must specify one or more sort keys"):
        pc.sort_indices(table)

    with pytest.raises(ValueError, match="Nonexistent sort key column"):
        pc.sort_indices(table, sort_keys=[("unknown", "ascending")])

    with pytest.raises(ValueError, match="not a valid order"):
        pc.sort_indices(table, sort_keys=[("a", "nonscending")])
Example #22
 def test_truncate_csv_repair_utf8(self):
     with _temp_csv("A,B\na,b\nc,d\né,f\ng,h") as path:
         assert_csv_result_equals(
             _internal_parse_csv(path, has_header=True),
             ParseCsvResult(
                 pa.table({
                     "A": ["a", "c", "�"],
                     "B": ["b", "d", None]
                 }),
                 [
                     ParseCsvWarning.TruncatedFile(20, 13),
                     ParseCsvWarning.RepairedEncoding(
                         encoding="utf-8",
                         first_invalid_byte=195,
                         first_invalid_byte_position=12,
                     ),
                 ],
             ),
         )
Example #23
    def encode(self):
        global Data_decompressed
        '''
        Generate encodings for the decompressed data with the encoder model
        and write the encoded columns to a Parquet file (my_encoder.parquet).
        '''

        encoded_array = self.encoder_model.generateEncodings(
            Data_decompressed, Mask_decompressed)
        parquetDic = {}
        for i in range(encoded_array.shape[1]):
            name = f'col_{i+1}'
            parquetDic[name] = encoded_array[:, i]
        print(f'Encoder Columns shape: {encoded_array.shape}')
        log2(encoded_array)
        ndarray_table = pa.table(parquetDic)
        pq.write_table(ndarray_table, 'my_encoder.parquet')
        print('File my_encoder.parquet created')
Example #24
File: numpy.py  Project: amzn/amazon-ray
def ndarray_to_file(
        np_array: np.ndarray,
        path: str,
        file_system: AbstractFileSystem,
        content_type: str = ContentType.PARQUET.value,
        **kwargs):
    """
    Writes the given Numpy ndarray to a file.
    """

    # PyArrow only supports 1D ndarrays, so convert to list of 1D arrays
    np_arrays = [array for array in np_array]
    pa_utils.table_to_file(
        pa.table({"data": np_arrays}),
        path,
        file_system,
        content_type,
        **kwargs
    )
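A hedged illustration of the conversion comment above: iterating a 2-D ndarray yields its rows as 1-D arrays, which pa.table() stores as a single list-typed column.

import numpy as np
import pyarrow as pa

np_array = np.array([[1, 2, 3], [4, 5, 6]])
table = pa.table({"data": [row for row in np_array]})  # one list<int64> column
assert table.column("data").to_pylist() == [[1, 2, 3], [4, 5, 6]]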
Example #25
def test_read_fastparquet_text_categorical():
    # To write this file, install fastparquet and run:
    #
    # import fastparquet
    # import pandas as pd
    # fastparquet.write(
    #     'x.parquet',
    #     pd.DataFrame({"A": pd.Series(["x", None, "y", "x", "x"], dtype="category")})
    # )
    path = (Path(__file__).parent / "files" /
            "column-A-dictionary-from-fastparquet.parquet")
    result = do_convert(path)
    assert_table_equals(
        result,
        pyarrow.table({
            "A":
            pyarrow.array(["x", None, "y", "x", "x"]).dictionary_encode()
        }),
    )
Example #26
 def test_encode_nested_arrays_and_objects(self):
     assert_json_result_equals(
         _parse_json_with_defaults([{
             "value": {
                 "x": ["y", {
                     "z": True,
                     "Z": ["a", None]
                 }, ["b", "c"]],
                 "X": {},
             }
         }]),
         ParseJsonResult(
             pyarrow.table({
                 "value":
                 ['{"x":["y",{"z":true,"Z":["a",null]},["b","c"]],"X":{}}']
             }),
             [],
         ),
     )
Example #27
def dataframe_to_arrow_table(
    dataframe: pd.DataFrame, columns: List[Column], path: Path
) -> None:
    """Write `dataframe` to an Arrow file."""
    arrays = []
    for column in columns:
        arrays.append(series_to_arrow_array(dataframe[column.name]))

    arrow_table_without_metadata = pa.Table.from_arrays(
        arrays, names=[c.name for c in columns]
    )
    fields = [
        _fix_arrow_field(arrow_table_without_metadata.schema.field(i), columns[i].type)
        for i in range(len(columns))
    ]
    arrow_table = pa.table(arrow_table_without_metadata.columns, pa.schema(fields))

    with pa.RecordBatchFileWriter(str(path), arrow_table.schema) as writer:
        writer.write_table(arrow_table)
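A hedged companion sketch (not from the original module): a file written with RecordBatchFileWriter as above can be read back with pyarrow.ipc.open_file; the path here is hypothetical.

import pyarrow as pa

reader = pa.ipc.open_file("table.arrow")  # hypothetical path
arrow_table = reader.read_all()           # pa.Table with the original schema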
Example #28
 def render(arrow_table, params, output_path, *, columns, **kwargs):
     # Test the "columns" kwarg
     self.assertEqual(columns, input_columns)
     table = pa.table(
         {
             "A": [1],
             "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "C": ["a"],
             "D": [1],
             "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "F": ["a"],
             "G": [1],
             "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "I": ["a"],
         }
     )
     with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer:
         writer.write_table(table)
     return []
Example #29
def pandas_to_pydf(
    data: "pd.DataFrame",
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
    nan_to_none: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a pandas DataFrame.
    """
    if not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from a pandas DataFrame."
        )
    arrow_dict = {
        str(col): _pandas_series_to_arrow(data[col], nan_to_none=nan_to_none)
        for col in data.columns
    }
    arrow_table = pa.table(arrow_dict)
    return arrow_to_pydf(arrow_table, columns=columns, rechunk=rechunk)
Example #30
def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)