def test_double(self):
    """A Python list with floats, ints and Nones round-trips as double."""
    values = [1.5, 1, None, 2.5, None, None]
    result = pyarrow.from_pylist(values)
    # Three of the six entries are None, so the null count must be 3.
    assert len(result) == 6
    assert result.null_count == 3
    # Mixed int/float input is promoted to a double array.
    assert result.type == pyarrow.double()
    assert result.to_pylist() == values
def test_float_nulls(self):
    """Nulls in float32/float64 columns come back as NaN from to_pandas()."""
    n = 100
    # Roughly 30% of the rows are masked out as null.
    mask = np.random.randint(0, 10, size=n) < 3
    column_specs = [('f4', A.float_()), ('f8', A.double())]
    names = [name for name, _ in column_specs]

    arrays = []
    fields = []
    expected = {}
    for name, arrow_dtype in column_specs:
        data = np.random.randn(n).astype(name)
        arrays.append(A.from_pandas_series(data, mask))
        fields.append(A.Field.from_py(name, arrow_dtype))
        # Arrow nulls materialize as NaN once converted to pandas.
        data[mask] = np.nan
        expected[name] = data

    ex_frame = pd.DataFrame(expected, columns=names)
    table = A.Table.from_arrays(arrays, names)
    assert table.schema.equals(A.Schema.from_fields(fields))
    tm.assert_frame_equal(table.to_pandas(), ex_frame)
def test_float_nulls(self):
    """Nulls in float32/float64 columns come back as NaN from to_pandas().

    NOTE(review): this re-defines test_float_nulls, shadowing the earlier
    definition of the same name in this file — consider renaming one of them.
    """
    num_values = 100
    # Roughly 30% of the rows are masked out as null.
    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = [('f4', A.float_()), ('f8', A.double())]
    names = ['f4', 'f8']
    expected_cols = []
    arrays = []
    fields = []
    for name, arrow_dtype in dtypes:
        values = np.random.randn(num_values).astype(name)
        arr = A.from_pandas_series(values, null_mask)
        arrays.append(arr)
        fields.append(A.Field.from_py(name, arrow_dtype))
        # Arrow nulls materialize as NaN once converted to pandas.
        values[null_mask] = np.nan
        expected_cols.append(values)
    ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                            columns=names)
    # BUG FIX: Table.from_arrays takes the arrays first, then the column
    # names (the arguments were swapped, matching neither the sibling test
    # nor the pyarrow API).
    table = A.Table.from_arrays(arrays, names)
    assert table.schema.equals(A.Schema.from_fields(fields))
    result = table.to_pandas()
    tm.assert_frame_equal(result, ex_frame)
def dataframe_with_arrays():
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # Every numeric primitive gets a list-of-<type> column; each column has
    # four entries, one of which is None to exercise null handling.
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()),
              ('i8', pa.int64()), ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()), ('f4', pa.float_()),
              ('f8', pa.double())]
    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]
    # String column: object-dtype arrays including a non-ASCII value.
    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]
    # Timestamp column: nanosecond-precision literals truncated to
    # millisecond resolution by the datetime64[ms] dtype, with nulls both
    # inside the inner arrays and at the outer list level.
    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]
    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)
    return df, schema
def test_float_no_nulls(self):
    """Fully-valid float32/float64 columns round-trip through Arrow."""
    n = 100
    specs = [('f4', A.float_()), ('f8', A.double())]
    columns = {}
    fields = []
    for numpy_dtype, arrow_dtype in specs:
        # Cast the float64 samples down to the column's target dtype.
        columns[numpy_dtype] = np.random.randn(n).astype(numpy_dtype)
        fields.append(A.Field.from_py(numpy_dtype, arrow_dtype))
    frame = pd.DataFrame(columns)
    expected_schema = A.Schema.from_fields(fields)
    self._check_pandas_roundtrip(frame, expected_schema=expected_schema)
def normalize_arrow_dtype(dtype):
    """Translate a textual type name (C++/numpy style) to a pyarrow DataType.

    Parameters
    ----------
    dtype : str or None
        A type alias such as 'int32_t', 'float64', 'std::string',
        'large_list<item: int32>', or None/'null' for the null type.

    Returns
    -------
    pyarrow.DataType

    Raises
    ------
    ValueError
        If the alias is not recognized.
    """
    # (aliases, zero-arg factory) pairs; first match wins. Factories are
    # called lazily so unsupported builds fail only for the type requested.
    alias_table = [
        (['bool'], pa.bool_),
        (['int8_t', 'int8', 'byte'], pa.int8),
        (['uint8_t', 'uint8', 'char'], pa.uint8),
        (['int16_t', 'int16', 'short'], pa.int16),
        (['uint16_t', 'uint16'], pa.uint16),
        (['int32_t', 'int32', 'int'], pa.int32),
        (['uint32_t', 'uint32'], pa.uint32),
        (['int64_t', 'int64', 'long'], pa.int64),
        (['uint64_t', 'uint64'], pa.uint64),
        (['half'], pa.float16),
        (['float', 'float32'], pa.float32),
        (['double', 'float64'], pa.float64),
        (['string', 'std::string', 'std::__1::string', 'str'],
         pa.large_string),
        (['large_list<item: int32>'], lambda: pa.large_list(pa.int32())),
        (['large_list<item: uint32>'], lambda: pa.large_list(pa.uint32())),
        (['large_list<item: int64>'], lambda: pa.large_list(pa.int64())),
        (['large_list<item: uint64>'], lambda: pa.large_list(pa.uint64())),
        # BUG FIX: pa.float() does not exist — the float element type is
        # pa.float32(), matching the scalar 'float' mapping above.
        (['large_list<item: float>'], lambda: pa.large_list(pa.float32())),
        (['large_list<item: double>'], lambda: pa.large_list(pa.double())),
        (['null', 'NULL', 'None', None], pa.null),
    ]
    for aliases, factory in alias_table:
        if dtype in aliases:
            return factory()
    raise ValueError('Unsupported data type: %s' % dtype)
def dataframe_with_lists():
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # Each spec is (column name, element type, cell values); every column
    # has four rows, one of which is None to exercise null handling.
    specs = [
        ('int64', pa.int64(),
         [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
          [0, 1, 2, 3, 4],
          None,
          [0]]),
        ('double', pa.double(),
         [[0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4.],
          None,
          [0.]]),
        ('str_list', pa.string(),
         [[u"1", u"ä"],
          None,
          [u"1"],
          [u"1", u"2", u"3"]]),
    ]
    columns = OrderedDict()
    fields = []
    for name, value_type, cells in specs:
        fields.append(pa.field(name, pa.list_(value_type)))
        columns[name] = cells
    df = pd.DataFrame(columns)
    schema = pa.Schema.from_fields(fields)
    return df, schema