Exemplo n.º 1
0
def get_expected(data, col_properties):
    """Build the ``TColumn`` list expected for ``data``.

    One ``TColumn`` is produced per entry of ``col_properties``, in order.
    Each entry is a mapping that is read for its ``'name'``, ``'type'`` and
    ``'is_array'`` keys; ``'name'`` selects the column of ``data``.

    * Array columns become an ``arr_col`` holding one nested ``TColumn``
      per cell.
    * Geo columns (types listed in ``_pandas_loaders.GEO_TYPE_NAMES``) are
      sent as ``str_col`` using each value's ``.wkt`` attribute.
    * Everything else is passed through in the field matching its type.

    The ``nulls`` flags of every column come from ``data.isnull()``.
    """
    field_for_type = {'INT': 'int_col', 'DOUBLE': 'real_col', 'STR': 'str_col'}
    field_for_type.update(
        dict.fromkeys(_pandas_loaders.GEO_TYPE_NAMES, 'str_col'))
    null_mask = data.isnull()

    columns = []
    for spec in col_properties:
        name = spec['name']
        null_flags = null_mask[name].tolist()

        if spec['is_array']:
            # The type lookup stays inside the comprehension so it only
            # happens when there is at least one cell to wrap.
            payload = {
                'arr_col': [
                    TColumn(data=TColumnData(
                        **{field_for_type[spec['type']]: cell}))
                    for cell in data[name]
                ]
            }
        elif spec['type'] in _pandas_loaders.GEO_TYPE_NAMES:
            payload = {
                field_for_type[spec['type']]:
                data[name].apply(lambda geom: geom.wkt)
            }
        else:
            payload = {field_for_type[spec['type']]: data[name]}

        columns.append(
            TColumn(data=TColumnData(**payload), nulls=null_flags))
    return columns
Exemplo n.º 2
0
def get_expected(data, col_properties):
    """Build the ``TColumn`` list expected for ``data``.

    One ``TColumn`` is produced per entry of ``col_properties``, in order.
    Each entry is a mapping read for its ``'name'``, ``'type'`` and
    ``'is_array'`` keys (plus ``'precision'`` and ``'scale'`` for
    ``DECIMAL`` columns); ``'name'`` selects the column of ``data``.

    * Array columns become an ``arr_col`` holding one nested ``TColumn``
      per cell.
    * Geo columns (types listed in ``_pandas_loaders.GEO_TYPE_NAMES``) are
      sent as ``str_col`` using each value's ``.wkt`` attribute.
    * ``TIMESTAMP`` columns are converted to integers: the raw integer
      representation when any value has a non-zero nanosecond component,
      otherwise that representation floor-divided by ``10**9``.
    * ``DECIMAL`` columns are scaled as
      ``(value * 10**precision).astype(int) * 10**(scale - precision)``.
    * Everything else is passed through in the field matching its type.

    The ``nulls`` flags of every column come from ``data.isnull()``, taken
    before any conversion. ``data`` itself is never modified.
    """
    _map_col_types = {
        'INT': 'int_col',
        'DOUBLE': 'real_col',
        'STR': 'str_col',
        'TIMESTAMP': 'int_col',
        'DECIMAL': 'int_col',
    }
    _map_col_types.update(
        {k: 'str_col'
         for k in _pandas_loaders.GEO_TYPE_NAMES})
    isnull = data.isnull()

    expected = []
    for prop in col_properties:
        name = prop['name']
        nulls = isnull[name].tolist()
        if prop['is_array']:
            arr_col = [
                TColumn(data=TColumnData(
                    **{_map_col_types[prop['type']]: v}))
                for v in data[name]
            ]
            col = TColumn(data=TColumnData(arr_col=arr_col), nulls=nulls)
        elif prop['type'] in _pandas_loaders.GEO_TYPE_NAMES:
            col = TColumn(
                data=TColumnData(
                    **{
                        _map_col_types[prop['type']]:
                        data[name].apply(lambda g: g.wkt)
                    }),
                nulls=nulls,
            )
        else:
            # Convert a local Series rather than assigning back into
            # ``data``: the caller's DataFrame must stay untouched so it can
            # be reused after this helper has run.
            values = data[name]
            if prop['type'] == 'TIMESTAMP':
                # convert datetime to epoch
                if values.dt.nanosecond.sum():
                    values = values.astype(int)
                else:
                    values = values.astype(int) // 10**9
            elif prop['type'] == 'DECIMAL':
                values = ((values * 10**prop['precision']).astype(int) *
                          10**(prop['scale'] - prop['precision']))

            col = TColumn(
                data=TColumnData(**{_map_col_types[prop['type']]: values}),
                nulls=nulls,
            )
        expected.append(col)
    return expected
Exemplo n.º 3
0
    def test_build_table_columnar_nulls(self):
        """Check ``build_input_columnar`` on columns whose last value is null.

        Every column holds two real values followed by ``None``. The expected
        output carries ``nulls=[False, False, True]`` for each column and a
        per-type placeholder in the null slot (see the ``*_na`` constants
        below; string columns use ``''``).
        """
        common_col_params = dict(
            nullable=True,
            precision=0,
            scale=0,
            comp_param=0,
            encoding='NONE',
            is_array=False,
        )

        # (column name, column type), in the same order as the frame below.
        col_specs = [
            ('boolean_', 'BOOL'),
            ('int_', 'INT'),
            ('bigint_', 'BIGINT'),
            ('double_', 'DOUBLE'),
            ('varchar_', 'STR'),
            ('text_', 'STR'),
            ('time_', 'TIME'),
            ('timestamp_', 'TIMESTAMP'),
            ('date_', 'DATE'),
        ]
        col_types = [
            ColumnDetails(name=col_name, type=col_type, **common_col_params)
            for col_name, col_type in col_specs
        ]

        data = pd.DataFrame({
            'boolean_': [True, False, None],
            # Currently Pandas does not support storing None or NaN
            # in integer columns, so int cols with null
            # need to be objects. This means our type detection will be
            # unreliable since if there is no number outside the int32
            # bounds in a column with nulls then we will be assuming int
            #
            # ``object`` (the builtin) is used instead of the ``np.object``
            # alias, which no longer exists in current NumPy releases.
            'int_':
            np.array([0, 1, None], dtype=object),
            'bigint_':
            np.array([0, 9223372036854775807, None], dtype=object),
            'double_':
            np.array([0, 1, None], dtype=np.float64),
            'varchar_': ['a', 'b', None],
            'text_': ['a', 'b', None],
            'time_': [datetime.time(0, 11, 59),
                      datetime.time(13), None],
            'timestamp_': [
                pd.Timestamp('2016'),
                pd.Timestamp('2017'),
                None,
            ],
            'date_': [
                datetime.date(1001, 1, 1),
                datetime.date(2017, 1, 1),
                None,
            ],
        })

        result = _pandas_loaders.build_input_columnar(
            data,
            preserve_index=False,
            col_names=data.columns,
            col_types=col_types,
        )

        nulls = [False, False, True]
        # Placeholders expected in the null slot of each column type.
        bool_na = -128
        int_na = -2147483648
        bigint_na = -9223372036854775808
        ns_na = -9223372037
        double_na = 0

        expected = [
            TColumn(TColumnData(int_col=[1, 0, bool_na]), nulls=nulls),
            TColumn(
                TColumnData(int_col=np.array([0, 1, int_na], dtype=np.int32)),
                nulls=nulls,
            ),
            TColumn(
                TColumnData(int_col=np.array(
                    [0, 9223372036854775807, bigint_na], dtype=np.int64)),
                nulls=nulls,
            ),
            TColumn(
                TColumnData(
                    real_col=np.array([0, 1, double_na], dtype=np.float64)),
                nulls=nulls,
            ),
            TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
            TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls),
            TColumn(TColumnData(int_col=[719, 46800, bigint_na]), nulls=nulls),
            TColumn(
                TColumnData(int_col=[1451606400, 1483228800, ns_na]),
                nulls=nulls,
            ),
            TColumn(
                TColumnData(int_col=[-30578688000, 1483228800, bigint_na]),
                nulls=nulls,
            ),
        ]
        assert_columnar_equal(result[0], expected)
Exemplo n.º 4
0
    def test_build_table_columnar_pandas(self):
        """Check ``build_input_columnar`` on a null-free, typed DataFrame.

        Each column holds two values with an explicit dtype where one
        applies. The expected output carries ``nulls=[False, False]`` for
        every column.
        """
        shared_params = dict(
            nullable=True,
            precision=0,
            scale=0,
            comp_param=0,
            encoding='NONE',
            is_array=False,
        )

        # (column name, column type), in the same order as the frame below.
        name_type_pairs = (
            ('boolean_', 'BOOL'),
            ('smallint_', 'SMALLINT'),
            ('int_', 'INT'),
            ('bigint_', 'BIGINT'),
            ('float_', 'FLOAT'),
            ('double_', 'DOUBLE'),
            ('varchar_', 'STR'),
            ('text_', 'STR'),
            ('time_', 'TIME'),
            ('timestamp_', 'TIMESTAMP'),
            ('date_', 'DATE'),
        )
        col_types = [
            ColumnDetails(name=col_name, type=col_type, **shared_params)
            for col_name, col_type in name_type_pairs
        ]

        frame = pd.DataFrame({
            'boolean_': [True, False],
            'smallint_': np.array([0, 1], dtype=np.int16),
            'int_': np.array([0, 1], dtype=np.int32),
            'bigint_': np.array([0, 1], dtype=np.int64),
            'float_': np.array([0, 1], dtype=np.float32),
            'double_': np.array([0, 1], dtype=np.float64),
            'varchar_': ['a', 'b'],
            'text_': ['a', 'b'],
            'time_': [datetime.time(0, 11, 59), datetime.time(13)],
            'timestamp_': [pd.Timestamp('2016'), pd.Timestamp('2017')],
            'date_': [datetime.date(2016, 1, 1), datetime.date(2017, 1, 1)],
        })
        built = _pandas_loaders.build_input_columnar(
            frame,
            preserve_index=False,
            col_names=frame.columns,
            col_types=col_types,
        )

        no_nulls = [False, False]

        def column(**payload):
            # Every expected column shares the same all-False null flags.
            return TColumn(TColumnData(**payload), nulls=no_nulls)

        expected = [
            column(int_col=[True, False]),
            column(int_col=np.array([0, 1], dtype=np.int16)),
            column(int_col=np.array([0, 1], dtype=np.int32)),
            column(int_col=np.array([0, 1], dtype=np.int64)),
            column(real_col=np.array([0, 1], dtype=np.float32)),
            column(real_col=np.array([0, 1], dtype=np.float64)),
            column(str_col=['a', 'b']),
            column(str_col=['a', 'b']),
            column(int_col=[719, 46800]),
            column(int_col=[1451606400, 1483228800]),
            column(int_col=[1451606400, 1483228800]),
        ]
        assert_columnar_equal(built[0], expected)