Example #1
 def test_double(self):
     data = [1.5, 1, None, 2.5, None, None]
     arr = pyarrow.from_pylist(data)
     assert len(arr) == 6
     assert arr.null_count == 3
     assert arr.type == pyarrow.double()
     assert arr.to_pylist() == data
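
This snippet uses an early pyarrow API; `pyarrow.from_pylist` has since been replaced by `pyarrow.array` as the entry point for building an array from a Python list. A minimal, self-contained sketch of the same round trip, assuming a recent pyarrow release:

    import pyarrow as pa

    # None becomes a null slot, and the integer 1 is promoted so the
    # inferred type of the whole array is double (float64).
    data = [1.5, 1, None, 2.5, None, None]
    arr = pa.array(data)

    assert len(arr) == 6
    assert arr.null_count == 3
    assert arr.type == pa.float64()
    assert arr.to_pylist() == [1.5, 1.0, None, 2.5, None, None]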
Example #2
    def test_float_nulls(self):
        num_values = 100

        # roughly 30% of the positions are flagged as null
        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = [('f4', A.float_()), ('f8', A.double())]
        names = ['f4', 'f8']
        expected_cols = []

        arrays = []
        fields = []
        for name, arrow_dtype in dtypes:
            values = np.random.randn(num_values).astype(name)

            arr = A.from_pandas_series(values, null_mask)
            arrays.append(arr)
            fields.append(A.Field.from_py(name, arrow_dtype))
            values[null_mask] = np.nan

            expected_cols.append(values)

        ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = A.Table.from_arrays(arrays, names)
        assert table.schema.equals(A.Schema.from_fields(fields))
        result = table.to_pandas()
        tm.assert_frame_equal(result, ex_frame)
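
Here `A` is presumably pyarrow imported under that alias, and `A.from_pandas_series` is the early API for building an array from numpy data plus a null mask. In current pyarrow the same round trip can be expressed with `pa.array(values, mask=...)`; the following is a sketch under that assumption, not the test's own helper:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    num_values = 100
    null_mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randn(num_values)

    # mask=True marks the slot as null in the resulting Arrow array
    arr = pa.array(values, mask=null_mask)
    table = pa.Table.from_arrays([arr], names=['f8'])

    expected = values.copy()
    expected[null_mask] = np.nan
    pd.testing.assert_frame_equal(table.to_pandas(),
                                  pd.DataFrame({'f8': expected}))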
Example #3
    def test_float_nulls(self):
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = [('f4', A.float_()), ('f8', A.double())]
        names = ['f4', 'f8']
        expected_cols = []

        arrays = []
        fields = []
        for name, arrow_dtype in dtypes:
            values = np.random.randn(num_values).astype(name)

            arr = A.from_pandas_series(values, null_mask)
            arrays.append(arr)
            fields.append(A.Field.from_py(name, arrow_dtype))
            values[null_mask] = np.nan

            expected_cols.append(values)

        ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = A.Table.from_arrays(arrays, names)
        assert table.schema.equals(A.Schema.from_fields(fields))
        result = table.to_pandas()
        tm.assert_frame_equal(result, ex_frame)
Example #4
def dataframe_with_arrays():
    """
    Dataframe with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float_()), ('f8', pa.double())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)

    return df, schema
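
The fixture pairs object columns whose cells are numpy arrays (or None) with an Arrow schema of matching `list_` fields; `pa.float_()` and `pa.Schema.from_fields` are older spellings of today's `pa.float32()` and `pa.schema()`. A hedged sketch of how such a fixture is typically consumed in a round-trip test (with a recent pyarrow, the two older constructors above would need to be swapped for their current equivalents first):

    import pyarrow as pa

    df, schema = dataframe_with_arrays()

    # Convert with the expected schema and check that it is preserved
    table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
    assert table.schema.equals(schema)

    # Each list value comes back as an object cell when converting back
    result = table.to_pandas()
    assert result.shape == df.shape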
Example #5
    def test_float_no_nulls(self):
        data = {}
        fields = []
        dtypes = [('f4', A.float_()), ('f8', A.double())]
        num_values = 100

        for numpy_dtype, arrow_dtype in dtypes:
            values = np.random.randn(num_values)
            data[numpy_dtype] = values.astype(numpy_dtype)
            fields.append(A.Field.from_py(numpy_dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = A.Schema.from_fields(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
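
`_check_pandas_roundtrip` is a helper on the test class whose body is not shown in this excerpt; conceptually it converts the DataFrame to an Arrow table, optionally compares the schema, and converts back. A hypothetical sketch of such a helper against the current pyarrow API (the name reuse is for illustration only, not the original implementation):

    import pandas as pd
    import pyarrow as pa

    def _check_pandas_roundtrip(df, expected_schema=None):
        # Hypothetical helper: DataFrame -> Table -> DataFrame round trip
        table = pa.Table.from_pandas(df, preserve_index=False)
        if expected_schema is not None:
            assert table.schema.equals(expected_schema)
        pd.testing.assert_frame_equal(table.to_pandas(), df)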
Example #6
def normalize_arrow_dtype(dtype):  # noqa: C901
    if dtype in ['bool']:
        return pa.bool_()
    if dtype in ['int8_t', 'int8', 'byte']:
        return pa.int8()
    if dtype in ['uint8_t', 'uint8', 'char']:
        return pa.uint8()
    if dtype in ['int16_t', 'int16', 'short']:
        return pa.int16()
    if dtype in ['uint16_t', 'uint16']:
        return pa.uint16()
    if dtype in ['int32_t', 'int32', 'int']:
        return pa.int32()
    if dtype in ['uint32_t', 'uint32']:
        return pa.uint32()
    if dtype in ['int64_t', 'int64', 'long']:
        return pa.int64()
    if dtype in ['uint64_t', 'uint64']:
        return pa.uint64()
    if dtype in ['half']:
        return pa.float16()
    if dtype in ['float', 'float32']:
        return pa.float32()
    if dtype in ['double', 'float64']:
        return pa.float64()
    if dtype in ['string', 'std::string', 'std::__1::string', 'str']:
        return pa.large_string()
    if dtype in ['large_list<item: int32>']:
        return pa.large_list(pa.int32())
    if dtype in ['large_list<item: uint32>']:
        return pa.large_list(pa.uint32())
    if dtype in ['large_list<item: int64>']:
        return pa.large_list(pa.int64())
    if dtype in ['large_list<item: uint64>']:
        return pa.large_list(pa.uint64())
    if dtype in ['large_list<item: float>']:
        return pa.large_list(pa.float32())
    if dtype in ['large_list<item: double>']:
        return pa.large_list(pa.double())
    if dtype in ['null', 'NULL', 'None', None]:
        return pa.null()
    raise ValueError('Unsupported data type: %s' % dtype)
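
`normalize_arrow_dtype` maps C++- and numpy-style type names onto pyarrow DataType instances and raises ValueError for anything unrecognised. A short illustration of the expected behaviour, following the branches above:

    import pyarrow as pa

    assert normalize_arrow_dtype('int32') == pa.int32()
    assert normalize_arrow_dtype('std::string') == pa.large_string()
    assert normalize_arrow_dtype('large_list<item: double>') == pa.large_list(pa.float64())
    assert normalize_arrow_dtype(None) == pa.null()

    try:
        normalize_arrow_dtype('complex128')
    except ValueError as exc:
        print(exc)  # Unsupported data type: complex128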
Example #7
def dataframe_with_lists():
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [0]
    ]
    fields.append(pa.field('double', pa.list_(pa.double())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [0.]
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"]
    ]

    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)

    return df, schema
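
Like `dataframe_with_arrays`, this fixture keeps one Arrow `list_` field per column, only with plain Python lists in the cells. The same nested structure can be built directly in current pyarrow by passing an explicit list type to `pa.array`; a minimal sketch, assuming a recent release:

    import pyarrow as pa

    arr = pa.array([[0, 1, 2], [0, 1], None, [0]], type=pa.list_(pa.int64()))
    assert arr.type == pa.list_(pa.int64())
    assert arr.null_count == 1
    assert arr.to_pylist() == [[0, 1, 2], [0, 1], None, [0]]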