Python dictionary示例，pyarrow.dictionary Python示例

示例#1

0

显示文件

def test_data_type():
    """Check the Arrow dictionary type reported by ``DiscreteFactor.data_type()``.

    An unfitted factor is expected to raise ``ValueError``.  After fitting,
    the test expects ``int8`` indices for 2 and 128 categories and ``int16``
    indices for 129 categories.
    """
    a = pbn.DiscreteFactor("A", [])
    with pytest.raises(ValueError) as ex:
        a.data_type()
    # This membership check used to be a bare expression, so the error
    # message was never actually verified.
    assert "DiscreteFactor factor not fitted." in str(ex.value)

    # (number of categories, expected index type of the dictionary)
    cases = [(2, pa.int8()), (128, pa.int8()), (129, pa.int16())]
    for n_categories, index_type in cases:
        categories = np.asarray(
            ["a" + str(i) for i in range(1, n_categories + 1)])
        a_values = pd.Categorical(
            categories[np.random.randint(len(categories), size=100)],
            categories=categories,
            ordered=False)
        df = pd.DataFrame({'A': a_values})
        a.fit(df)
        assert a.data_type() == pa.dictionary(index_type, pa.string())

示例#2

0

显示文件

文件： test_schema.py 项目： sighingnow/arrow

def test_schema_merge():
    """Unify compatible schemas and reject a pair with a conflicting field type."""
    first = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])
    second = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('qux', pa.bool_()),
    ])
    third = pa.schema([
        pa.field('quux', pa.dictionary(pa.int32(), pa.string())),
    ])
    # Same field names as `second`, but 'foo' is int64 instead of int32.
    conflicting = pa.schema([
        pa.field('foo', pa.int64()),
        pa.field('qux', pa.bool_()),
    ])

    merged = pa.unify_schemas([first, second, third])
    expected = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
        pa.field('qux', pa.bool_()),
        pa.field('quux', pa.dictionary(pa.int32(), pa.string())),
    ])
    assert merged.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([second, conflicting])

    # ARROW-14002: a tuple of schemas must be accepted as well as a list
    merged = pa.unify_schemas((first, second, third))
    assert merged.equals(expected)

示例#3

0

显示文件

    def test_column_types_dict(self):
        """Request dictionary-encoded column types through ConvertOptions."""
        # One dict-encoded type per CSV column
        column_types = [
            ('a', pa.dictionary(pa.int32(), pa.utf8())),
            ('b', pa.dictionary(pa.int32(), pa.int64())),
            ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
            ('d', pa.dictionary(pa.int32(), pa.large_utf8())),
        ]

        convert_opts = ConvertOptions(column_types=dict(column_types))
        csv_bytes = (b"a,b,c,d\n"
                     b"abc,123456,1.0,zz\n"
                     b"defg,123456,0.5,xx\n"
                     b"abc,N/A,1.0,xx\n")
        table = self.read_bytes(csv_bytes, convert_options=convert_opts)

        expected_schema = pa.schema(column_types)
        expected_data = {
            'a': ["abc", "defg", "abc"],
            'b': [123456, 123456, None],
            'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
            'd': ["zz", "xx", "xx"],
        }
        assert table.schema == expected_schema
        assert table.to_pydict() == expected_data

        # Unsupported index type: int8 indices are expected to be rejected
        column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))

        convert_opts = ConvertOptions(column_types=dict(column_types))
        with pytest.raises(NotImplementedError):
            self.read_bytes(csv_bytes, convert_options=convert_opts)

示例#4

0

显示文件

文件： test.py 项目： fairtide/DataFrame

def test_table(n, types=None, offset=None, length=None, nullable=True):
    """Build a table with one generated column per Arrow type.

    For every type in ``types`` a column named ``str(type)`` is generated
    with ``TestArrayGenerator(n, type, False)``.  When ``nullable`` is true a
    second column named ``"<type> (null)"`` is generated with the generator's
    third argument set to ``True``.  If ``offset`` is given, each array is
    sliced with ``array.slice(offset, length)`` before being added.

    When ``types`` is ``None`` a built-in list of primitive, temporal,
    binary, dictionary and nested types is used.
    """
    if types is None:
        int32_struct = pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            int32_struct,
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]

    # Plain column always; the "(null)" variant only when requested.
    variants = (False, True) if nullable else (False,)

    columns = []
    for arrow_type in types:
        for with_nulls in variants:
            label = str(arrow_type)
            if with_nulls:
                label = label + ' (null)'
            generated = TestArrayGenerator(n, arrow_type, with_nulls).array
            if offset is not None:
                generated = generated.slice(offset, length)
            columns.append(pyarrow.column(label, generated))

    return pyarrow.Table.from_arrays(columns)

示例#5

0

显示文件

文件： test_types.py 项目： wiltonlazary/arrow-1

def test_dictionary_ordered_equals():
    """Dictionary types differing in ``ordered`` or index type must not compare equal."""
    # Python side checking of ARROW-6345
    reference = pa.dictionary('int32', 'binary', ordered=True)
    unordered = pa.dictionary('int32', 'binary', ordered=False)
    narrow_index = pa.dictionary('int8', 'binary', ordered=True)
    identical = pa.dictionary('int32', 'binary', ordered=True)

    assert not reference.equals(unordered)
    assert not reference.equals(narrow_index)
    assert reference.equals(identical)

示例#6

0

显示文件

文件： test_types.py 项目： emkornfield/arrow

def test_dictionary_type():
    """Check index type, dictionary values and ``ordered`` of dictionary types.

    Uses the API form in which ``pa.dictionary`` takes the dictionary values
    as an array.
    """
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    # Previously this re-checked ty0 (copy-paste), leaving ty1 unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

示例#7

0

显示文件

文件： test_types.py 项目： msimons4/Python-for-Data-Analysis

def test_dictionary_type():
    """Check index type, dictionary values and ``ordered`` of dictionary types.

    Uses the API form in which ``pa.dictionary`` takes the dictionary values
    as an array.
    """
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.float32()
    # Previously this re-checked ty0 (copy-paste), leaving ty1 unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

示例#8

0

显示文件

    def test_auto_dict_encode(self):
        """Exercise ``auto_dict_encode`` with cardinality limits and invalid UTF-8."""
        convert_opts = ConvertOptions(auto_dict_encode=True)
        csv_bytes = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
        table = self.read_bytes(csv_bytes, convert_options=convert_opts)
        dict_schema = pa.schema([
            ('a', pa.dictionary(pa.int32(), pa.string())),
            ('b', pa.int64()),
        ])
        expected_data = {
            'a': ["ab", "cdé", "cdé", "ab"],
            'b': [1, 2, 3, 4],
        }
        assert table.schema == dict_schema
        assert table.to_pydict() == expected_data

        # Cardinality equal to the max => still dictionary-encoded
        convert_opts.auto_dict_max_cardinality = 2
        table = self.read_bytes(csv_bytes, convert_options=convert_opts)
        assert table.schema == dict_schema
        assert table.to_pydict() == expected_data

        # Cardinality above max => plain-encoded
        convert_opts.auto_dict_max_cardinality = 1
        table = self.read_bytes(csv_bytes, convert_options=convert_opts)
        plain_schema = pa.schema([('a', pa.string()), ('b', pa.int64())])
        assert table.schema == plain_schema
        assert table.to_pydict() == expected_data

        # With invalid UTF8, not checked
        convert_opts.auto_dict_max_cardinality = 50
        convert_opts.check_utf8 = False
        csv_bytes = b"a,b\nab,1\ncd\xff,2\nab,3"
        table = self.read_bytes(csv_bytes,
                                convert_options=convert_opts,
                                validate_full=False)
        assert table.schema == dict_schema
        dictionary_values = table['a'].chunk(0).dictionary
        assert len(dictionary_values) == 2
        assert dictionary_values[0].as_py() == "ab"
        assert dictionary_values[1].as_buffer() == b"cd\xff"

        # With invalid UTF8, checked
        convert_opts.check_utf8 = True
        table = self.read_bytes(csv_bytes, convert_options=convert_opts)
        binary_schema = pa.schema([
            ('a', pa.dictionary(pa.int32(), pa.binary())),
            ('b', pa.int64()),
        ])
        expected_data = {
            'a': [b"ab", b"cd\xff", b"ab"],
            'b': [1, 2, 3],
        }
        assert table.schema == binary_schema
        assert table.to_pydict() == expected_data

示例#9

0

显示文件

def test_cast_from_null():
    """Cast an all-null array to many types; some targets must raise."""
    null_values = [None] * 3
    null_type = pa.null()

    # Targets for which the cast is checked via _check_cast_case
    castable_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string()),
        ]),
    ]
    for target in castable_types:
        _check_cast_case((null_values, null_type, null_values, target))

    # Targets for which the cast is expected to raise NotImplementedError
    unsupported_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    ]
    null_array = pa.array(null_values, type=pa.null())
    for target in unsupported_types:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)

示例#10

0

显示文件

文件： test_ipc.py 项目： zhztheplayer/arrow-1

def test_dictionary_delta(stream_fixture):
    """Compare writer statistics with and without ``emit_dictionary_deltas``."""
    dict_type = pa.dictionary(pa.int8(), pa.utf8())
    batch_values = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],  # potential delta
        ["foo", "bar"],
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],  # replacement
    ]
    batches = []
    for values in batch_values:
        column = pa.array(values, type=dict_type)
        batches.append(pa.RecordBatch.from_arrays([column], names=['dicts']))
    schema = batches[0].schema

    def _write_all():
        # Write every batch to a throwaway sink and return the writer stats.
        sink = pa.MockOutputStream()
        with stream_fixture._get_writer(sink, schema) as writer:
            for record_batch in batches:
                writer.write_batch(record_batch)
            return writer.stats

    # Default options: every dictionary change is counted as a replacement
    stats = _write_all()
    assert stats.num_record_batches == 5
    assert stats.num_dictionary_batches == 4
    assert stats.num_replaced_dictionaries == 3
    assert stats.num_dictionary_deltas == 0

    # With deltas enabled, two of the changes are counted as deltas
    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    stats = _write_all()
    assert stats.num_record_batches == 5
    assert stats.num_dictionary_batches == 4
    assert stats.num_replaced_dictionaries == 1
    assert stats.num_dictionary_deltas == 2

示例#11

0

显示文件

文件： encoding.py 项目： t-triobox/vaex

 def decode(encoding, type_spec):
     """Turn a serialized type specification back into a ``DataType``.

     ``type_spec`` is either a dict with a ``'type'`` key (``'duration'``,
     ``'timestamp'``, ``'list'`` or ``'dict'``) or a plain value.  Nested
     types in a dict spec are decoded through
     ``encoding.decode('dtype', ...)``.

     Raises:
         ValueError: if a dict spec has an unrecognised ``'type'``.
     """
     if isinstance(type_spec, dict):
         kind = type_spec['type']
         if kind == 'duration':
             return DataType(pa.duration(type_spec['unit']))
         if kind == 'timestamp':
             return DataType(pa.timestamp(type_spec['unit']))
         if kind == 'list':
             element_type = encoding.decode('dtype', type_spec['value_type']).arrow
             return DataType(pa.list_(element_type))
         if kind == 'dict':
             value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
             index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
             is_ordered = type_spec["ordered"]
             return DataType(pa.dictionary(index_type, value_type, is_ordered))
         raise ValueError(f'Do not understand type {type_spec}')

     if type_spec == 'string':
         return DataType(pa.string())
     if type_spec == 'large_string':
         return DataType(pa.large_string())
     # TODO: find a proper way to support all arrow types
     if type_spec == 'timestamp[ms]':
         return DataType(pa.timestamp('ms'))
     # Anything else is handed to numpy to interpret as a dtype
     return DataType(np.dtype(type_spec))

示例#12

0

显示文件

文件： test_types.py 项目： vishalbelsare/cjworkbench

 def test_arrow_schema_category_column(self):
     """A dictionary-of-string field is expected to render as a "text" column."""
     category_schema = pa.schema(
         [pa.field("A", pa.dictionary(pa.int32(), pa.string()))]
     )
     rendered = arrow_schema_to_render_columns(category_schema)
     self.assertEqual(rendered, {"A": RenderColumn("A", "text", None)})

示例#13

0

显示文件

文件： test_array.py 项目： rok/arrow

def test_cast_from_null():
    """Cast an all-null array to several types; dictionary and union targets must raise."""
    nulls = [None] * 3
    source_type = pa.null()

    abc_struct = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.list_(pa.int8())),
        pa.field('c', pa.string()),
    ])
    supported = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        abc_struct,
    ]
    # Input and expected output values are the same list of nulls
    for destination in supported:
        _check_cast_case((nulls, source_type, nulls, destination))

    not_implemented = [pa.dictionary(pa.int32(), pa.string())]
    for union_mode in (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE):
        not_implemented.append(
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())], mode=union_mode))

    source = pa.array(nulls, type=pa.null())
    for destination in not_implemented:
        with pytest.raises(NotImplementedError):
            source.cast(destination)

示例#14

0

显示文件

文件： test_types.py 项目： rok/arrow

def test_dictionary_type():
    """Check ``index_type``, ``value_type`` and ``ordered`` of dictionary types."""
    unordered = pa.dictionary(pa.int32(), pa.string())
    assert unordered.index_type == pa.int32()
    assert unordered.value_type == pa.string()
    assert unordered.ordered is False

    ordered = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ordered.index_type == pa.int8()
    assert ordered.value_type == pa.float64()
    assert ordered.ordered is True

    # Type names given as plain strings are accepted too
    from_names = pa.dictionary('int8', 'string')
    assert from_names.index_type == pa.int8()
    assert from_names.value_type == pa.string()
    assert from_names.ordered is False

示例#15

0

显示文件

文件： dataset_generator.py 项目： mnicely/cudf

def get_dataframe(parameters, use_threads):
    """Generate a pyarrow Table described by ``parameters``.

    Args:
        parameters: object providing ``seed``, ``num_rows`` and
            ``column_parameters``; each column parameter provides ``dtype``,
            ``generator``, ``null_frequency`` and ``is_sorted``.
        use_threads: when true, columns are generated with a ``Pool`` of
            ``pa.cpu_count()`` workers; otherwise sequentially.

    Returns:
        The generated ``pa.Table``.  If any column is marked ``is_sorted``
        the table is round-tripped through pandas to sort by those columns.

    Note:
        ``column_params.generator`` is replaced in place by the instantiated
        generator, and one value is drawn from it while building the schema
        for category columns and columns without an explicit dtype.
    """
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for column_params in parameters.column_parameters:
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    def _arrow_type(column_params):
        # Arrow type for one column; samples the generator when the value
        # type has to be inferred.
        if (isinstance(column_params.dtype, str)
                and column_params.dtype == "category"):
            return pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            )
        if column_params.dtype is None:
            return pa.from_numpy_dtype(
                type(next(iter(column_params.generator))))
        return pa.from_numpy_dtype(column_params.dtype)

    # Get schema for each column; fields are named by their position
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=_arrow_type(column_params),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        try:
            column_data = pool.starmap(
                _generate_column,
                [(column_params, parameters.num_rows)
                 for column_params in parameters.column_parameters],
            )
        finally:
            # Always release the workers, even if column generation raised.
            pool.close()
            pool.join()

    # Assemble the Arrow table; go through pandas only when sorting is needed
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl

示例#16

0

显示文件

文件： test_types.py 项目： msimons4/Python-for-Data-Analysis

def get_many_types():
    """Return a tuple of assorted Arrow data types.

    The types are built inside a function because this ``pa.dictionary``
    type holds a pyarrow array, and test_array.py::test_toal_bytes_allocated
    checks that the default memory pool has zero allocated bytes.
    """
    primitive_types = (
        pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
        pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
        pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
        pa.binary(10), pa.list_(pa.int32()),
    )

    nullable_struct = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ])
    partly_required_struct = pa.struct([
        pa.field('a', pa.int32(), nullable=False),
        pa.field('b', pa.int8(), nullable=False),
        pa.field('c', pa.string()),
    ])

    dense_union = pa.union(
        [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_DENSE)
    sparse_union = pa.union(
        [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_SPARSE)
    sparse_union_required_a = pa.union(
        [pa.field('a', pa.binary(10), nullable=False),
         pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_SPARSE)

    dictionary_type = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))

    return primitive_types + (
        nullable_struct,
        partly_required_struct,
        dense_union,
        sparse_union,
        sparse_union_required_a,
        dictionary_type,
    )

示例#17

0

显示文件

文件： test_types.py 项目： yisuoyanyudmj/arrow

def test_dictionary_type():
    """Check ``index_type``, ``value_type`` and ``ordered`` of dictionary types."""
    # (type under test, expected index type, expected value type, ordered)
    cases = [
        (pa.dictionary(pa.int32(), pa.string()),
         pa.int32(), pa.string(), False),
        (pa.dictionary(pa.int8(), pa.float64(), ordered=True),
         pa.int8(), pa.float64(), True),
        # construct from non-arrow objects
        (pa.dictionary('int8', 'string'),
         pa.int8(), pa.string(), False),
    ]
    for dict_type, index_type, value_type, is_ordered in cases:
        assert dict_type.index_type == index_type
        assert dict_type.value_type == value_type
        assert dict_type.ordered is is_ordered

示例#18

0

显示文件

文件： dict_test.py 项目： t-triobox/vaex

def test_dict_col(tmpdir):
    """Write a parquet file with a dictionary column, open it with vaex and filter on it."""
    # Write a small parquet file containing a dictionary-encoded column
    parquet_path = tmpdir / 'sample_arrow_dict.parquet'
    table_schema = pa.schema({
        'col1': pa.int32(),
        'col2': pa.float32(),
        'col3': pa.dictionary(pa.int16(), pa.string()),
    })
    columns = {
        'col1': range(10),
        'col2': np.random.randn(10),
        'col3': list(np.random.choice(['A', 'B', 'C'], 10)),
    }
    arrow_table = pa.table(columns, schema=table_schema)
    pq.write_table(arrow_table, parquet_path)

    # Load df and check that the dictionary type is reported
    df = vaex.open(parquet_path)
    column_dtypes = df.dtypes
    assert isinstance(column_dtypes["col3"].arrow, pa.lib.DictionaryType)

    # Filter on the dictionary column
    df = df._future()
    only_a = df[df["col3"] == 'A']
    assert only_a["col3"].unique() == ["A"]

示例#19

0

显示文件

文件： test_series.py 项目： stjordanis/polars

def test_arrow():
    """Convert a Series to Arrow and build a Series from a dictionary array."""
    int_series = Series("a", [1, 2, 3, None])
    as_arrow = int_series.to_arrow()
    assert as_arrow == pa.array([1, 2, 3, None])

    # A dictionary-encoded Arrow array is expected to load as Utf8
    dict_array = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    from_dict = pl.Series("a", dict_array)
    assert from_dict.dtype == pl.Utf8

示例#20

0

显示文件

def test_filesystem_factory(mockfs, paths_or_selector):
    """Build a dataset via ``FileSystemDatasetFactory`` and check schema and data."""
    read_options = ds.ParquetReadOptions(dictionary_columns={"str"})
    parquet_format = ds.ParquetFileFormat(read_options=read_options)

    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(partition_schema)
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector,
                                          parquet_format, options)
    inspected_schema = factory.inspect()

    # Inspect a second time and compare against the expected schema
    reinspected = factory.inspect()
    expected_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    assert reinspected.equals(expected_schema, check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string()))
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        # Expected columns, in batch column order
        expected_columns = [
            expected_i64,
            expected_f64,
            expected_str,
            expected_const,
            expected_group,
            expected_key,
        ]
        for batch in task.execute():
            assert batch.num_columns == 6
            for position, expected_column in enumerate(expected_columns):
                assert batch[position].equals(expected_column)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6

示例#21

0

显示文件

def test_dictionary_python():
    """Round-trip a dictionary array: Python -> Rust -> Python."""
    dict_type = pa.dictionary(pa.int8(), pa.string())
    a = pa.array(["a", None, "b", None, "a"], type=dict_type)
    b = rust.round_trip_array(a)
    assert a == b
    # Drop both references explicitly, as the original test does
    del a
    del b

示例#22

0

显示文件

 def _dtype_to_arrow(cls, dtype):
     if dtype is None:
         return None
     tname = dtype if isinstance(dtype, str) else dtype.name
     if tname == "category":
         return pa.dictionary(index_type=pa.int32(), value_type=pa.string())
     elif tname == "string":
         return pa.string()
     else:
         return pa.from_numpy_dtype(tname)

示例#23

0

显示文件

def test_dictionary_type():
    """Check index type, dictionary values and ``ordered`` of dictionary types.

    Uses the API form in which ``pa.dictionary`` takes the dictionary values
    as an array or a plain list.
    """
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    # Previously this re-checked ty0 (copy-paste), leaving ty1 unverified.
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', ['a', 'b', 'c', 'd'])
    assert ty2.index_type == pa.int8()
    assert isinstance(ty2.dictionary, pa.Array)
    assert ty2.dictionary.to_pylist() == ['a', 'b', 'c', 'd']
    assert ty2.ordered is False

示例#24

0

显示文件

 def test_text_dictionary_zero_chunks_is_valid(self):
     """Validate a table with no batches whose only column is dictionary-of-string."""
     dict_schema = pyarrow.schema(
         [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
     )
     # No record batches at all, so the column has zero chunks
     empty_table = pyarrow.Table.from_batches([], dict_schema)
     validate(empty_table, TableMetadata(0, [Text("A")]))

示例#25

0

显示文件

def test_cat_int_types_3500() -> None:
    """Load dictionary arrays with int8 and uint8 indices as polars Categorical."""
    with pl.StringCache():
        # Create an enum / categorical / dictionary typed pyarrow array
        # Most simply done by creating a pandas categorical series first
        categorical_series = pd.Series(["a", "a", "b"], dtype="category")
        pyarrow_array = pa.Array.from_pandas(categorical_series)

        # The in-memory representation of each category can either be a
        # signed or unsigned 8-bit integer: pandas uses Int8, while DuckDB
        # uses UInt8.
        signed_type = pa.dictionary(index_type=pa.int8(),
                                    value_type=pa.utf8())
        unsigned_type = pa.dictionary(index_type=pa.uint8(),
                                      value_type=pa.utf8())

        for dict_type in (signed_type, unsigned_type):
            loaded = pl.from_arrow(pyarrow_array.cast(dict_type))
            expected = pl.Series(["a", "a", "b"]).cast(pl.Categorical)
            assert loaded.series_equal(expected)

示例#26

0

显示文件

def test_empty_table():
    """``Schema.empty_table()`` must give a zero-row Table with the same schema."""
    fields = [
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ]
    schema = pa.schema(fields)
    empty = schema.empty_table()
    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == schema

示例#27

0

显示文件

 def schema(cls):
     """Return the Arrow schema, tagged with metadata ``type=BetfairTicker``."""
     columns = {
         "instrument_id": pa.dictionary(pa.int8(), pa.string()),
         "ts_event": pa.int64(),
         "ts_init": pa.int64(),
         "last_traded_price": pa.string(),
         "traded_volume": pa.string(),
     }
     return pa.schema(columns, metadata={"type": "BetfairTicker"})

示例#28

0

显示文件

def test_arrow():
    """Convert between polars and Arrow for int, dictionary and list data."""
    int_series = pl.Series("a", [1, 2, 3, None])
    as_arrow = int_series.to_arrow()
    assert as_arrow == pa.array([1, 2, 3, None])

    # A dictionary-encoded Arrow array is expected to load as Categorical
    dict_array = pa.array(["foo", "bar"], pa.dictionary(pa.int32(), pa.utf8()))
    from_dict = pl.Series("a", dict_array)
    assert from_dict.dtype == pl.Categorical

    # A list-of-strings Arrow array is expected to load as List
    list_array = pa.array([["foo"], ["foo", "bar"]], pa.list_(pa.utf8()))
    assert pl.from_arrow(list_array).dtype == pl.List

示例#29

0

显示文件

def test_schema_repr_with_dictionaries():
    """``repr`` of a schema must spell out the dictionary field's parameters."""
    sch = pa.schema([
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32()),
    ])

    # One line per field, no trailing newline
    expected = (
        "one: dictionary<values=string, indices=int16, ordered=0>\n"
        "two: int32"
    )

    assert repr(sch) == expected

示例#30

0

显示文件

文件： test_types.py 项目： siawayforward/the-library-is-open

def test_dictionary_type():
    """Check dictionary type attributes and rejection of a string index type."""
    plain = pa.dictionary(pa.int32(), pa.string())
    assert plain.index_type == pa.int32()
    assert plain.value_type == pa.string()
    assert plain.ordered is False

    ordered_floats = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ordered_floats.index_type == pa.int8()
    assert ordered_floats.value_type == pa.float64()
    assert ordered_floats.ordered is True

    # construct from non-arrow objects
    from_strings = pa.dictionary('int8', 'string')
    assert from_strings.index_type == pa.int8()
    assert from_strings.value_type == pa.string()
    assert from_strings.ordered is False

    # allow unsigned integers for index type
    unsigned_index = pa.dictionary(pa.uint32(), pa.string())
    assert unsigned_index.index_type == pa.uint32()
    assert unsigned_index.value_type == pa.string()
    assert unsigned_index.ordered is False

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())

示例#31

0

显示文件

文件： test_schema.py 项目： rok/arrow

def test_schema_repr_with_dictionaries():
    """``repr`` of a schema must describe the dictionary field on one line."""
    dict_field = pa.field('one', pa.dictionary(pa.int16(), pa.string()))
    int_field = pa.field('two', pa.int32())
    sch = pa.schema([dict_field, int_field])

    # Expected repr, one entry per output line
    expected_lines = [
        "one: dictionary<values=string, indices=int16, ordered=0>",
        "two: int32",
    ]

    assert repr(sch) == "\n".join(expected_lines)

示例#32

0

显示文件

    def test_dict(self):
        """Round-trip a dictionary array: Python -> Rust -> Python."""
        dict_type = pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8())
        a = pyarrow.array(["a", "a", "b", None, "c"], dict_type)
        b = arrow_pyarrow_integration_testing.round_trip_array(a)

        # The returned array must be valid and match in values and type
        b.validate(full=True)
        assert a.to_pylist() == b.to_pylist()
        assert a.type == b.type

示例#33

0

显示文件

文件： test_testing.py 项目： CJWorkbench/cjwmodule

def test_build_table_infer_type():
    """``make_column`` must infer the Arrow type from the given Python values."""
    columns = [
        make_column("A", ["x"]),
        make_column("B", [datetime.date(2021, 4, 7)]),
        make_column("C", [datetime.datetime(2021, 4, 7, 19, 24, 1, 1)]),
        make_column("D", [1.0]),
        make_column("dict", ["x"], dictionary=True),
    ]
    table = make_table(*columns)

    # Column name -> Arrow type expected to be inferred
    expected_types = {
        "A": pa.string(),
        "B": pa.date32(),
        "C": pa.timestamp("ns"),
        "D": pa.float64(),
        "dict": pa.dictionary(pa.int32(), pa.string()),
    }
    for column_name, expected_type in expected_types.items():
        assert table[column_name].type == expected_type

示例#34

0

显示文件

def create_cems_schema():
    """Make an explicit Arrow schema for the EPA CEMS data.

    Make changes in the types of the generated parquet files by editing this
    function.

    Note that parquet's internal representation doesn't use unsigned numbers or
    16-bit ints, so just keep things simple here and always use int32 and
    float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.

    """
    int32 = pa.int32()
    string = pa.string()
    # (float32 can accurately hold integers up to 16,777,216 so no need for
    # float64)
    float32 = pa.float32()
    # Timestamp resolution is hourly, but second is the largest allowed.
    utc_seconds = pa.timestamp("s", tz="UTC")
    category = pa.dictionary(pa.int8(), pa.string(), ordered=False)

    # (column name, Arrow type, nullable) in output column order
    column_specs = [
        ("state", category, True),
        ("plant_id_eia", int32, False),
        ("unitid", string, False),
        ("operating_datetime_utc", utc_seconds, False),
        ("operating_time_hours", float32, True),
        ("gross_load_mw", float32, False),
        ("steam_load_1000_lbs", float32, True),
        ("so2_mass_lbs", float32, True),
        ("so2_mass_measurement_code", category, True),
        ("nox_rate_lbs_mmbtu", float32, True),
        ("nox_rate_measurement_code", category, True),
        ("nox_mass_lbs", float32, True),
        ("nox_mass_measurement_code", category, True),
        ("co2_mass_tons", float32, True),
        ("co2_mass_measurement_code", category, True),
        ("heat_content_mmbtu", float32, False),
        ("facility_id", int32, True),
        ("unit_id_epa", int32, True),
        ("year", int32, False),
    ]
    return pa.schema([
        pa.field(column_name, type=arrow_type, nullable=is_nullable)
        for column_name, arrow_type, is_nullable in column_specs
    ])

示例#35

0

显示文件

文件： test_schema.py 项目： giantwhale/arrow

def test_schema_repr_with_dictionaries():
    """``repr`` of a schema must include the dictionary values of a dictionary field."""
    dictionary_values = pa.array(['foo', 'bar', 'baz'], type=pa.string())
    sch = pa.schema([
        pa.field('one', pa.dictionary(pa.int16(), dictionary_values)),
        pa.field('two', pa.int32()),
    ])

    # The dictionary contents appear on an indented line below the field
    expected = "\n".join([
        'one: dictionary<values=string, indices=int16, ordered=0>',
        '  dictionary: ["foo", "bar", "baz"]',
        'two: int32',
    ])

    assert repr(sch) == expected

示例#36

0

显示文件

文件： test_types.py 项目： rok/arrow

def get_many_types():
    """Return a tuple of assorted Arrow data types, built fresh on each call."""
    # returning them from a function is required because of pa.dictionary
    # type holds a pyarrow array and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    many_types = [
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
    ]

    # Struct types: all fields nullable, then 'a' and 'b' non-nullable
    many_types.append(pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ]))
    many_types.append(pa.struct([
        pa.field('a', pa.int32(), nullable=False),
        pa.field('b', pa.int8(), nullable=False),
        pa.field('c', pa.string()),
    ]))

    # Union types: dense, sparse, and sparse with a non-nullable 'a'
    many_types.append(pa.union(
        [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_DENSE))
    many_types.append(pa.union(
        [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_SPARSE))
    many_types.append(pa.union(
        [pa.field('a', pa.binary(10), nullable=False),
         pa.field('b', pa.string())],
        mode=pa.lib.UnionMode_SPARSE))

    many_types.append(pa.dictionary(pa.int32(), pa.string()))

    return tuple(many_types)

示例#37

0

显示文件

文件： test_types.py 项目： rok/arrow

def test_is_dictionary():
    """``types.is_dictionary`` must accept dictionary types only."""
    dict_type = pa.dictionary(pa.int32(), pa.string())
    assert types.is_dictionary(dict_type)

    plain_type = pa.int32()
    assert not types.is_dictionary(plain_type)

示例#38

0

显示文件

文件： test_types.py 项目： CodingCat/arrow

def test_dictionary_type():
    """A dictionary type must expose its index type and dictionary values."""
    dictionary_values = pa.array(['a', 'b', 'c'])
    dict_type = pa.dictionary(pa.int32(), dictionary_values)
    assert dict_type.index_type == pa.int32()
    assert dict_type.dictionary.to_pylist() == ['a', 'b', 'c']

示例#39

0

显示文件

文件： test_types.py 项目： giantwhale/arrow

def test_is_dictionary():
    """``types.is_dictionary`` must accept dictionary types only."""
    dictionary_values = pa.array(['a', 'b', 'c'])
    dict_type = pa.dictionary(pa.int32(), dictionary_values)
    assert types.is_dictionary(dict_type)
    assert not types.is_dictionary(pa.int32())