def build_row_desc(data, preserve_index=False):
    """Build the list of ``TColumnType`` entries describing ``data``.

    Each column's type name is obtained from ``get_mapd_dtype``. A name
    starting with ``ARRAY`` flags the column as an array, and the
    ``ARRAY/`` prefix is removed before the remaining name is looked up
    on ``TDatumType``.

    Parameters
    ----------
    data : pandas.DataFrame or geopandas.GeoDataFrame
        Frame whose columns are described.
    preserve_index : bool, default False
        When true, ``data.reset_index()`` is applied first so the index
        is described like any other column.

    Returns
    -------
    list of TColumnType
        One entry per column, in column order.

    Raises
    ------
    TypeError
        If ``data`` is neither a DataFrame nor a GeoDataFrame.
    """
    if not isinstance(data, (pd.DataFrame, gpd.GeoDataFrame)):
        # Once https://issues.apache.org/jira/browse/ARROW-1576 is complete
        # we can support pa.Table here too
        message = (
            "Create table is not supported for type {}. "
            "Use a pandas DataFrame, or perform the create "
            "separately"
        ).format(type(data))
        raise TypeError(message)

    frame = data.reset_index() if preserve_index else data

    # First pass: resolve every column's type name before building anything.
    array_flag = {}
    base_types = []
    for column in frame.columns:
        type_name = get_mapd_dtype(frame[column])
        # The flag is True for arrays and None (not False) otherwise.
        array_flag[column] = type_name.startswith('ARRAY') or None
        base_types.append((column, type_name.replace('ARRAY/', '')))

    # row_desc :: List<TColumnType>
    row_desc = []
    for column, type_name in base_types:
        entry = TColumnType(
            column,
            TTypeInfo(
                getattr(TDatumType, type_name),
                is_array=array_flag[column],
            ),
        )
        info = entry.col_type
        # force text encoding dict for all string columns
        # default is TEXT ENCODING DICT(32) when only encoding = 4 is set
        # https://github.com/omnisci/pymapd/issues/140#issuecomment-477353420
        if info.type == 6:
            info.encoding = 4
        elif info.type in GEO_TYPE_ID:
            # NOTE(review): the meaning of precision 23 for geo types is
            # not visible here -- confirm against the server docs.
            info.precision = 23
        row_desc.append(entry)

    return row_desc
def test_build_row_desc(self):
    """Check ``build_row_desc`` on a frame with one column per type.

    The second half names the index ``idx`` and repeats the call with
    ``preserve_index=True``, expecting one extra leading column.
    """
    column_order = [
        'boolean_',
        'smallint_',
        'int_',
        'bigint_',
        'float_',
        'double_',
        'varchar_',
        'text_',
        'time_',
        'timestamp1_',
        'timestamp2_',
        'date_',
    ]
    values = {
        'boolean_': [True, False],
        'smallint_': np.array([0, 1], dtype=np.int16),
        'int_': np.array([0, 1], dtype=np.int32),
        'bigint_': np.array([0, 1], dtype=np.int64),
        'float_': np.array([0, 1], dtype=np.float32),
        'double_': np.array([0, 1], dtype=np.float64),
        'varchar_': ['a', 'b'],
        'text_': ['a', 'b'],
        'time_': [datetime.time(0, 11, 59), datetime.time(13)],
        'timestamp1_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
        'timestamp2_': [
            np.datetime64('2016-01-01 01:01:01.001001001'),
            np.datetime64('2017-01-01 01:01:01.001001001'),
        ],
        'date_': [
            datetime.date(2016, 1, 1),
            datetime.date(2017, 1, 1),
        ],
    }
    frame = pd.DataFrame(values, columns=column_order)

    # Each spec is (column name, TTypeInfo kwargs, extra TColumnType kwargs).
    explicit_none = {'is_reserved_keyword': None}
    specs = [
        ('boolean_', {'type': 10}, explicit_none),
        ('smallint_', {'type': 0}, explicit_none),
        ('int_', {'type': 1}, explicit_none),
        ('bigint_', {'type': 2}, {}),
        ('float_', {'type': 3}, {}),
        ('double_', {'type': 5}, {}),
        ('varchar_', {'type': 6, 'encoding': 4}, {}),
        ('text_', {'type': 6, 'encoding': 4}, {}),
        ('time_', {'type': 7}, {}),
        ('timestamp1_', {'type': 8}, {}),
        ('timestamp2_', {'type': 8, 'precision': 9}, {}),
        ('date_', {'type': 9}, {}),
    ]
    expected = [
        TColumnType(
            col_name=name,
            col_type=TTypeInfo(**type_kwargs),
            **extra
        )
        for name, type_kwargs, extra in specs
    ]

    without_index = _pandas_loaders.build_row_desc(frame)
    assert without_index == expected

    # With a named index preserved, it shows up as the first column.
    frame.index.name = 'idx'
    with_index = _pandas_loaders.build_row_desc(frame, preserve_index=True)
    index_column = TColumnType(col_name='idx', col_type=TTypeInfo(type=2))
    expected = [index_column] + expected
    assert with_index == expected
def test_nulls_handled(self):
    """Check that values flagged as null come back as ``None``.

    Builds a single-row columnar result with eleven nullable columns
    (type ids 0 through 10, named ``a`` through ``k``), each with its
    ``nulls`` flag set, and expects ``make_row_results_set`` to yield one
    row of eleven ``None`` values.
    """
    placeholder = -2147483648
    names = 'abcdefghijk'

    row_desc = [
        TColumnType(
            col_name=name,
            col_type=TTypeInfo(type=type_id, nullable=True),
        )
        for type_id, name in enumerate(names)
    ]

    # TColumnData field carrying each column's payload, in column order.
    payload_fields = (
        ['int_col'] * 3 + ['real_col'] * 3 + ['str_col'] + ['int_col'] * 4
    )
    columns = [
        TColumn(data=TColumnData(**{field: [placeholder]}), nulls=[True])
        for field in payload_fields
    ]

    rs = TQueryResult(
        TRowSet(
            row_desc=row_desc,
            rows=[],
            columns=columns,
            is_columnar=True,
        ))

    rows = list(make_row_results_set(rs))
    assert rows == [(None, ) * len(names)]
def test_extract_row_details(self):
    """Check ``_extract_column_details`` on six column descriptors.

    Every descriptor is nullable, non-array, with zero precision and
    scale; only the name, type, encoding and ``comp_param`` vary.
    """
    # (name, type id, encoding id, comp_param, type label, encoding label)
    cases = [
        ('date_', 6, 4, 32, 'STR', 'DICT'),
        ('trans', 6, 4, 32, 'STR', 'DICT'),
        ('symbol', 6, 4, 32, 'STR', 'DICT'),
        ('qty', 1, 0, 0, 'INT', 'NONE'),
        ('price', 3, 0, 0, 'FLOAT', 'NONE'),
        ('vol', 3, 0, 0, 'FLOAT', 'NONE'),
    ]

    data = [
        TColumnType(
            col_name=name,
            col_type=TTypeInfo(
                type=type_id,
                encoding=encoding_id,
                nullable=True,
                is_array=False,
                precision=0,
                scale=0,
                comp_param=comp_param,
            ),
            is_reserved_keyword=False,
            src_name='',
        )
        for name, type_id, encoding_id, comp_param, _, _ in cases
    ]

    result = _extract_column_details(data)

    expected = [
        ColumnDetails(
            name=name,
            type=type_label,
            nullable=True,
            precision=0,
            scale=0,
            comp_param=comp_param,
            encoding=encoding_label,
            is_array=False,
        )
        for name, _, _, comp_param, type_label, encoding_label in cases
    ]
    assert result == expected