Example No. 1
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series([234, 2323, 23432, None, None, 224],
               dtype=pd.UInt64Dtype()),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series([11, None, 22, 33, None, 2, None, 3],
               dtype=pd.Int32Dtype()),
 ),
 (
     cudf.Series([32431, None, None, 32322, 0, 10, -32324, None],
                 dtype="int64"),
     pd.Series(
         [32431, None, None, 32322, 0, 10, -32324, None],
         dtype=pd.Int64Dtype(),
     ),
 ),
 (
Example No. 2
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
    "datetime64[ms]",
Example No. 3
from dask_sql._compat import FLOAT_NAN_IMPLEMENTED
from dask_sql.java import SqlTypeName

logger = logging.getLogger(__name__)

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool_: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
Example No. 4
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

np_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
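Example No. 5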
def modify_and_save_unformed(df, car_classes_df, cities_df, geo_df, date):
    """
    Transforms the unformed-orders dataframe and saves it to a file.
    :param df: Pandas DataFrame with unformed orders
    :param car_classes_df: Pandas DataFrame with car class codes and names
    :param cities_df: Pandas DataFrame with city ids and names
    :param geo_df: Pandas DataFrame with geo zone names, their boundary points and city names
    :param date: date to load, in "YYYY-MM-DD" format
    :return: None; the file is saved to the network share "//bigshare/Выгрузки ТФ/Выгрузки My_TK/'year'/'month'"
    """
    df['points'] = df['points'].apply(
        len)  # replace the list of intermediate points with its length
    df['type_auto'] = df.type_auto.astype(int)
    df['is_taxo'] = pd.array(df.is_taxo.replace('', np.NaN),
                             dtype=pd.Int8Dtype())  # empty strings become NaN, hence nullable Int8
    df['base_price'] = df['base_price'].fillna(0).astype(int)
    df['base_price2'] = df['base_price2'].fillna(0).astype(int)
    # FIXME duct tape for compatibility
    if 'is_b2' in df.columns:
        df['is_b2'] = df.is_b2.replace({1.: 'Да'})
    df['proc_a_in'] = df.proc_a_in / 100
    # FIXME duct tape for compatibility
    if 'k_jam' in df.columns:
        df['k_jam'] = df.k_jam.fillna(1.)
    # Extract car serving time from autos_time
    df['autos_time'] = df.apply(
        lambda x: extract_unf_car_time(x.type_auto, x.autos_time),
        axis=1)  # axis ==1 => apply to each row

    df['autos_time'] = pd.array(df.autos_time, dtype=pd.Int16Dtype())
    # Merge with car classes
    df = df.merge(car_classes_df,
                  left_on='type_auto',
                  right_on='id',
                  how='left')  # retrieve car classes names
    df.drop(columns=['type_auto', 'id'], inplace=True)  # cleaning after merge
    df.rename({'name': 'type_auto'}, axis='columns',
              inplace=True)  # cleaning after merge
    # Merge with cities names df and drop non-taxi entries
    df = df.merge(cities_df,
                  left_on='city',
                  right_on='id',
                  how='left',
                  suffixes=('', '_source'))
    df = df[df.type_source.str.startswith('taxi', na=False)]
    df.drop(columns=['id', 'type_source', 'city', 'to_local_time_corr'],
            inplace=True)
    df.rename(columns={'name': 'city'}, inplace=True)
    # Add 'Регион' field
    df['Регион'] = df.city.map(secrets.region_dict)
    # Separate 'date' column to date and time
    df['date'] = pd.to_datetime(df.date)
    new_dates, new_times = zip(*[(d.date(), d.time()) for d in df['date']])
    df = df.assign(Дата=new_dates, Время=new_times)
    df.drop(columns='date', inplace=True)
    # Drop duplicates!
    df.drop_duplicates(subset=['Дата', 'Время', 'phone'],
                       ignore_index=True,
                       inplace=True)
    # Incoming source mapping
    df['type'] = df.type.map(renaming_dicts.incoming_type)
    # Get rid of possible bug entries
    df = df[df.x_in != 0.]
    # Map geo zones
    get_zone(df=df, geozone_df=geo_df, mode='in')
    get_zone(df=df, geozone_df=geo_df, mode='out')
    # Generate key field: MMDDhhmmss&id (or &phone[-7:] if id == 0)
    df['Номер_неоформленного'] = np.where(
        df.id_client == 0,
        df.Дата.astype(str).str.replace('-', '',
                                        regex=True).apply(lambda x: x[-4:]) +
        df.Время.astype(str).str.replace(':', '', regex=True) +
        df.phone.astype(str).apply(lambda x: x[-7:]),
        df.Дата.astype(str).str.replace('-', '',
                                        regex=True).apply(lambda x: x[-4:]) +
        df.Время.astype(str).str.replace(':', '', regex=True) +
        df.id_client.astype(str))
    # Final renaming, dropping and saving
    df.rename(renaming_dicts.unf, axis='columns', inplace=True)
    df.drop(columns=[
        'option_1', 'option_2', 'option_3', 'c_auto_all', 'proc_a_in_all',
        'id_user'
    ],
            inplace=True)
    df['Статус'] = 'Неоформленный'
    df = df.replace(r'^\s*$', np.NaN,
                    regex=True)  # replace all empty strings with NaNs
    # df.to_csv(f"data/{date}_неоф.csv", sep=';', index=False)
    saving_path = tk_u.set_bigshare_dir(date)
    df.to_csv(f"{saving_path}/{date}_неоф.csv", sep=';', index=False)
Example No. 6
    """Semantic representation of a :class:`pandas.Int64Dtype`."""

    type = pd.Int64Dtype()
    bit_width: int = 64


@Engine.register_dtype(equivalents=[pd.Int32Dtype, pd.Int32Dtype()])
@immutable
class INT32(INT64):
    """Semantic representation of a :class:`pandas.Int32Dtype`."""

    type = pd.Int32Dtype()
    bit_width: int = 32


@Engine.register_dtype(equivalents=[pd.Int16Dtype, pd.Int16Dtype()])
@immutable
class INT16(INT32):
    """Semantic representation of a :class:`pandas.Int16Dtype`."""

    type = pd.Int16Dtype()
    bit_width: int = 16


@Engine.register_dtype(equivalents=[pd.Int8Dtype, pd.Int8Dtype()])
@immutable
class INT8(INT16):
    """Semantic representation of a :class:`pandas.Int8Dtype`."""

    type = pd.Int8Dtype()
    bit_width: int = 8
Example No. 7
    'uint64': (parquet_thrift.Type.INT64, parquet_thrift.ConvertedType.UINT_64, 64),
    'float32': (parquet_thrift.Type.FLOAT, None, 32),
    'float64': (parquet_thrift.Type.DOUBLE, None, 64),
    'float16': (parquet_thrift.Type.FLOAT, None, 16),
}

revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_
}
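
# A minimal usage sketch (hypothetical, not part of the original module): fall
# back to the plain NumPy dtype when a nullable pandas column has no missing values.
def downcast_nullable_column(series):
    np_type = pdoptional_to_numpy_typemap.get(series.dtype)
    if np_type is not None and not series.isna().any():
        return series.astype(np_type)
    return series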


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion does not happen here; see convert().
Example No. 8
    df['create_order_time'] = pd.to_datetime(df['create_order_time'])
    df['date'] = df['create_order_time'].dt.date
    df['day'] = df['create_order_time'].dt.day
    df['hour'] = df['create_order_time'].dt.hour

    df = pd.merge(df, item, how='left', on='item_id')

    memory = df.memory_usage().sum() / 1024**2
    print('Before memory usage of properties dataframe is :', memory, " MB")

    dtype_dict = {
        'buyer_admin_id': 'int32',
        'item_id': 'int32',
        'store_id': pd.Int32Dtype(),
        'irank': 'int16',
        'item_price': pd.Int16Dtype(),
        'cate_id': pd.Int16Dtype(),
        'is_train': 'int8',
        'day': 'int8',
        'hour': 'int8',
    }

    df = df.astype(dtype_dict)
    memory = df.memory_usage().sum() / 1024**2
    print('After memory usage of properties dataframe is :', memory, " MB")
    del train, test
    gc.collect()

    # Before memory usage of properties dataframe is : 1292.8728713989258  MB
    # After memory usage of properties dataframe is : 696.1623153686523  MB
Example No. 9
from collections import OrderedDict
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError


# configuration

int_types = {np.dtype('int32'), np.dtype('int16'), np.dtype('int8'), np.dtype('uint8'), np.dtype('uint16'),
             np.int32, np.int16, np.int8, np.uint8, np.uint16}
int_mixed_types = {pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype()}
float_types = {np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'), np.dtype('float'),
               np.int64, np.uint64, np.uint32, np.float64, pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {"INTEGER": "INTEGER", "NUMERIC": "NUMERIC",
                             "LOGICAL": "LOGICAL", "CHARACTER": "CHARACTER",
                             "OBJECT": "CHARACTER", "DATE": "CHARACTER",
                             "DATETIME": "CHARACTER"}
                        
librdata_min_integer = -2147483648


def get_pyreadr_column_types(df):
    """
    From a pandas data frame, get an OrderedDict with column name as key
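    and the detected type as value.
    """
    # NOTE: hypothetical continuation of this truncated snippet, sketched from
    # the type sets defined above; the real pyreadr implementation may differ.
    result = OrderedDict()
    for col_name, col_type in df.dtypes.items():
        if col_type in int_types or col_type in int_mixed_types:
            result[col_name] = "INTEGER"
        elif col_type in float_types:
            result[col_name] = "NUMERIC"
        elif col_type == np.bool_:
            result[col_name] = "LOGICAL"
        elif str(col_type).startswith("datetime64"):
            result[col_name] = "DATETIME"
        else:
            result[col_name] = "OBJECT"
    return result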
Example No. 10
def general_data_from_search(team,
                             year,
                             playertype="pitcher",
                             date1="",
                             date2="",
                             addid=False):
    """Gets data from Baseball Savant's search function.  This function gets every pitch event in the given time frame. The playertype
    argument decides whether you get batter or pitcher data.

    If creating an exhaustive database, note that you can get overlapping data because one team's pitcher data will return the same events
    as another team's batter data with the only change being the "player_name" field.  See 'addid' if interested in adding a unique event id
    for simple duplicate detection.

    Args:
        team (string): Team abbreviation in form 'XXX'
        year (int): Year number between 2012-2021 in form YYYY
        playertype (str, optional): Either 'pitcher' or 'batter'. Defaults to "pitcher".
        date1 (str, optional): The bottom date range to search for. Defaults to empty string. yyyy-mm-dd
        date2 (str, optional): The top date range to search for. Defaults to empty string. yyyy-mm-dd
        addid (bool, optional): Adds a custom ID.  Defaults False.
        clean (bool, optional): Clean data for DB. Defaults True.

    Returns:
        pandas dataframe: dataframe of every event from the search parameters
    """

    url = (
        "https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB="
        f"&hfGT=R%7CPO%7C&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea={year}%7C&hfSit="
        f"&player_type={playertype}&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={date1}"
        f"&game_date_lt={date2}&hfInfield=&team={team}&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfBBT="
        "&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=api_p_release_speed"
        "&sort_order=desc&min_pas=0&type=details&")

    # Read in the csv and specify certain columns to have nullable int types
    scrapedData = pandas.read_csv(url,
                                  dtype={
                                      'zone': pandas.Int16Dtype(),
                                      'hit_location': pandas.Int16Dtype(),
                                      'on_1b': pandas.Int32Dtype(),
                                      'on_2b': pandas.Int32Dtype(),
                                      'on_3b': pandas.Int32Dtype(),
                                      'hit_distance_sc': pandas.Int16Dtype(),
                                      'launch_angle': pandas.Int16Dtype(),
                                      'release_spin_rate': pandas.Int32Dtype(),
                                      'launch_speed_angle':
                                      pandas.Int16Dtype(),
                                      'spin_axis': pandas.Int32Dtype()
                                  })

    # Data comes with columns that will always be empty **EVEN IN YEARS WHERE THIS DATA WAS NOT DEPRECATED, IT'S ALL DELETED**
    # Dropping player name, pitcher.1 and fielder_2.1 because they are duplicates of other cols
    scrapedData.drop([
        'umpire', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated',
        'break_length_deprecated', 'tfs_deprecated', 'tfs_zulu_deprecated',
        'player_name', 'pitcher.1', 'fielder_2.1'
    ],
                     axis=1,
                     inplace=True)

    scrapedData.rename(columns={"type": "type_"}, inplace=True)

    if addid:
        addPitchIds(scrapedData)

    return scrapedData
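
# Hypothetical usage (illustrative only; the team, season and date range below
# are placeholders, not values taken from the original source):
# april_pitching = general_data_from_search("NYY", 2021,
#                                           playertype="pitcher",
#                                           date1="2021-04-01",
#                                           date2="2021-04-30",
#                                           addid=True)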
Example No. 11
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    cudf.dtype("uint8"): pd.UInt8Dtype(),
    cudf.dtype("uint16"): pd.UInt16Dtype(),
    cudf.dtype("uint32"): pd.UInt32Dtype(),
    cudf.dtype("uint64"): pd.UInt64Dtype(),
    cudf.dtype("int8"): pd.Int8Dtype(),
    cudf.dtype("int16"): pd.Int16Dtype(),
    cudf.dtype("int32"): pd.Int32Dtype(),
    cudf.dtype("int64"): pd.Int64Dtype(),
    cudf.dtype("bool_"): pd.BooleanDtype(),
    cudf.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
Example No. 12
    parquet_thrift.Type.FLOAT: np.dtype('float32'),
    parquet_thrift.Type.DOUBLE: np.dtype('float64'),
    parquet_thrift.Type.BOOLEAN: pd.BooleanDtype(),
    parquet_thrift.Type.INT96: np.dtype('S12'),
    parquet_thrift.Type.BYTE_ARRAY: np.dtype("O"),
    parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY: np.dtype("O")
}
complex = {
    parquet_thrift.ConvertedType.UTF8: np.dtype("O"),
    parquet_thrift.ConvertedType.DECIMAL: np.dtype('float64'),
    parquet_thrift.ConvertedType.UINT_8: pd.UInt8Dtype(),
    parquet_thrift.ConvertedType.UINT_16: pd.UInt16Dtype(),
    parquet_thrift.ConvertedType.UINT_32: pd.UInt32Dtype(),
    parquet_thrift.ConvertedType.UINT_64: pd.UInt64Dtype(),
    parquet_thrift.ConvertedType.INT_8: pd.Int8Dtype(),
    parquet_thrift.ConvertedType.INT_16: pd.Int16Dtype(),
    parquet_thrift.ConvertedType.INT_32: pd.Int32Dtype(),
    parquet_thrift.ConvertedType.INT_64: pd.Int64Dtype(),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}


def typemap(se):
    """Get the final dtype - no actual conversion"""
    if se.converted_type is None:
        if se.type in simple:
            return simple[se.type]
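        # Hypothetical continuation of this truncated snippet: fall back to an
        # object dtype when no simple match exists, and use the `complex`
        # mapping when a converted_type is present (the real fastparquet logic
        # covers additional cases).
        else:
            return np.dtype("O")
    if se.converted_type in complex:
        return complex[se.converted_type]
    return np.dtype("O")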
Example No. 13
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data; to map metadata types and values, use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN:
        pa.bool_(),
        _meta.BasicType.INTEGER:
        pa.int64(),
        _meta.BasicType.FLOAT:
        pa.float64(),
        _meta.BasicType.DECIMAL:
        pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING:
        pa.utf8(),
        _meta.BasicType.DATE:
        pa.date32(),
        _meta.BasicType.DATETIME:
        pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check that the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only a partial mapping is possible; decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
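
# A minimal usage sketch (illustrative only, assuming the same pyarrow/pandas
# imports used by the class above):
#
#   table = pa.table({"flag": pa.array([True, None]),
#                     "count": pa.array([1, None], type=pa.int16())})
#   df = DataMapping.arrow_to_pandas(table)
#   # "flag" comes back as pd.BooleanDtype() and "count" as pd.Int16Dtype(),
#   # so missing values stay as <NA> instead of forcing a float column.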
Example No. 14
def standardize_snf_flag_values(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Replace values of store_and_forward with standardized versions."""

    data_frame['store_and_forward'] = data_frame['store_and_forward'].apply(
        store_and_fwd_flag_mapping_function).astype(pd.Int16Dtype())
    return data_frame
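
# The mapping helper is defined elsewhere; a hypothetical version, assuming the
# usual 'Y'/'N' store-and-forward flags, might look like:
def store_and_fwd_flag_mapping_function(value):
    # Map 'Y'/'N' (and anything else, e.g. missing values) to nullable integers.
    if value == 'Y':
        return 1
    if value == 'N':
        return 0
    return None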
Example No. 15
from apache_beam.utils import proto_utils

__all__ = ('BatchRowsAsDataFrame', 'generate_proxy', 'UnbatchPandas',
           'element_type_from_dataframe')

T = TypeVar('T', bound=NamedTuple)

# Generate type map (presented visually in the docstring)
_BIDIRECTIONAL = [
    (bool, bool),
    (np.int8, np.int8),
    (np.int16, np.int16),
    (np.int32, np.int32),
    (np.int64, np.int64),
    (pd.Int8Dtype(), Optional[np.int8]),
    (pd.Int16Dtype(), Optional[np.int16]),
    (pd.Int32Dtype(), Optional[np.int32]),
    (pd.Int64Dtype(), Optional[np.int64]),
    (np.float32, Optional[np.float32]),
    (np.float64, Optional[np.float64]),
    (object, Any),
    (pd.StringDtype(), Optional[str]),
    (pd.BooleanDtype(), Optional[bool]),
]

PANDAS_TO_BEAM = {
    pd.Series([], dtype=dtype).dtype: fieldtype
    for dtype, fieldtype in _BIDIRECTIONAL
}
BEAM_TO_PANDAS = {fieldtype: dtype for dtype, fieldtype in _BIDIRECTIONAL}
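
# Illustrative lookups based on the tables above: nullable pandas dtypes map to
# Optional Beam field types, e.g.
#   PANDAS_TO_BEAM[pd.Int16Dtype()]     -> Optional[np.int16]
#   BEAM_TO_PANDAS[Optional[np.int16]]  -> pd.Int16Dtype()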
Example No. 16
def pandas_type_casting(df):
    import numpy as np
    import pandas as pd
    global n
    # df = pd.read_csv("users-isprep.zip")
    old = df.memory_usage() / 1024/1024
    
    #numeric cols
    number_cols = list(df.select_dtypes("number").columns)
    n = df[number_cols].fillna(0).agg([min,max]).T.add_suffix("_")
            
    get_cols_names(0, 255, pd.UInt8Dtype())
    get_cols_names(256, 65535, pd.UInt16Dtype())
    get_cols_names(65536, 4294967295, pd.UInt32Dtype())

    get_cols_names(-128, 127, pd.Int8Dtype())
    get_cols_names(-32768, 32767, pd.Int16Dtype())
    get_cols_names(-2147483648, 2147483647, pd.Int32Dtype())

    # date and categorical data columns
    catagoriacal_cols = list(df.select_dtypes("O").columns)
    date_cols = []
    for i in catagoriacal_cols:
        x = df[i][~df[i].isna()].head()
        try:
            pd.to_datetime(x)
            date_cols.append(i)
        except:
            pass
    catagoriacal_cols = [i for i in catagoriacal_cols if not i in date_cols]

    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)
    for i in c[c<5].index:
        d[i] = "category"
        
    # del df
    # df = pd.read_csv("users-isprep.zip", parse_dates=date_cols, dtype=d)
    for i in d:
        df[i] = df[i].astype(d[i])
    new = df.memory_usage() / 1024/1024

    m = pd.DataFrame({"new" : new,
                      "old" : old,
                      "Imporovement" : old - new})
    m['Dtype'] = [None] + list(df[list(new.index.drop("Index"))].dtypes.astype(str).values)

    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)

    m["nunique"] = None
    m.loc[c.index, "nunique"] = list(df[c.index].apply(lambda x:x.nunique() / len(df) * 100).values)

    print("Before :", round(m.old.sum()))
    print("After  :", round(m.new.sum()))
    print("Diff   :", round(m.Imporovement.sum()))
    print("Diff % :", round(m.Imporovement.sum()/m.old.sum(), 2))

    print("\n\nImprovement:")
    print(m.groupby("Dtype").Improvement.agg([min, max, sum, np.mean, np.median, "count"]))

    print("\n\nDetailed Summary:")
    print(m.to_string())
    return df
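
# `get_cols_names` and the dict `d` are defined outside this snippet; a
# hypothetical sketch that is consistent with how they are used above
# (`n` holds the observed "min_"/"max_" of every numeric column):
d = {}

def get_cols_names(lower, upper, dtype):
    """Record columns whose observed min/max fit inside [lower, upper]."""
    fits = n[(n["min_"] >= lower) & (n["max_"] <= upper)].index
    for col in fits:
        d.setdefault(col, dtype)  # keep the first (narrowest) dtype that fits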
Example No. 17
import pandas as pd
import pyorc

import cudf
from cudf.tests.utils import assert_eq
from cudf.utils.dtypes import (
    pandas_dtypes_to_cudf_dtypes,
    pyarrow_dtypes_to_pandas_dtypes,
)

ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
    np.dtype("int8"): "int",
    pd.Int8Dtype(): ["int", "null"],
    pd.Int16Dtype(): ["int", "null"],
    pd.Int32Dtype(): ["int", "null"],
    pd.Int64Dtype(): ["long", "null"],
    pd.BooleanDtype(): ["boolean", "null"],
    pd.StringDtype(): ["string", "null"],
    np.dtype("bool_"): "boolean",
    np.dtype("int16"): "int",
    np.dtype("int32"): "int",
    np.dtype("int64"): "long",
    np.dtype("O"): "string",
    np.dtype("str"): "string",
    np.dtype("float32"): "float",
    np.dtype("float64"): "double",
    np.dtype("<M8[ns]"): {
        "type": "long",
        "logicalType": "timestamp-millis"
Example No. 18
import pandas as pd
import numpy as np


data = io.StringIO(
    """
id,age,height,weight
129237,32,5.4,126
123083,20,6.1,
123087,25,4.5,unknown
"""
)

df = pd.read_csv(
    data,
    dtype={
        "id": np.int32,
        "age": np.int8,
        "height": np.float16,
        "weight": pd.Int16Dtype(),
    },
    na_values=["unknown"],
    index_col=[0],
)

print(df)
print(df.memory_usage(deep=True))
print(df.dtypes)
print(df.index.dtype)
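
# With the dtype mapping above, "age" stays int8, "height" float16, and
# "weight" becomes the nullable Int16 extension dtype, so the blank field and
# the "unknown" entry both show up as <NA> instead of forcing the column to float.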
Example No. 19
class INT16(INT32):
    """Semantic representation of a :class:`pandas.Int16Dtype`."""

    type = pd.Int16Dtype()
    bit_width: int = 16
Example No. 20
VERSION_CATEGORICAL: pd.CategoricalDtype = pd.CategoricalDtype(
    version.ALL_VERSIONS)

# Types of parameters that can only have a single value.
CONFIG_SCALAR: Dict[str, Any] = {
    parameters.ADD_NSV: np.int8,
    parameters.BUNDLE_ID: pd.Int32Dtype(),
    parameters.COVARIATE_ID: pd.Int32Dtype(),
    parameters.CROSSWALK_VERSION_ID: pd.Int32Dtype(),
    parameters.DATA_TRANSFORM: DATA_TRANSFORM_CATEGORICAL,
    parameters.DECOMP_STEP: object,
    parameters.GBD_ROUND_ID: np.uint8,
    parameters.GPR_AMP_CUTOFF: pd.Int32Dtype(),
    parameters.GPR_AMP_FACTOR: np.float64,
    parameters.GPR_AMP_METHOD: GPR_AMP_METHOD_CATEGORICAL,
    parameters.GPR_DRAWS: pd.Int16Dtype(),
    parameters.HOLDOUTS: pd.Int16Dtype(),
    parameters.LOCATION_SET_ID: np.uint32,
    parameters.MODEL_INDEX_ID: pd.Int32Dtype(),
    parameters.MODELABLE_ENTITY_ID: pd.Int32Dtype(),
    parameters.NOTES: object,
    parameters.PATH_TO_CUSTOM_COVARIATES: object,
    parameters.PATH_TO_CUSTOM_STAGE_1: object,
    parameters.PATH_TO_DATA: object,
    parameters.PREDICT_RE: np.int8,
    parameters.PREDICTION_UNITS: str,
    parameters.RAKE_LOGIT: np.int8,
    parameters.ST_VERSION: VERSION_CATEGORICAL,
    parameters.STAGE_1_MODEL_FORMULA: object,
    parameters.TRANSFORM_OFFSET: np.float64,
    parameters.YEAR_END: np.uint16,
Example No. 21
    "uint8": "UInt8",
    "float64": "Float64",
    "float32": "Float32",
    "int64": "Int64",
    "int32": "Int32",
    "int16": "Int16",
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

PD2CH = keymap(np.dtype, MAPPING)

PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"

CH2PD = itemmap(reversed, MAPPING)
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"
Example No. 22
    actual_column = cudf.core.column.as_column(cudf.core.Buffer(data),
                                               dtype=data.dtype)
    assert_eq(cudf.Series(actual_column), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)
Example No. 23
#%% first try to generate data with NaN
# for problems with NaN see 'ex01-pd.dtypes.py'

r=10; c=3; nnans=7
arr = np.random.randint(0, r*c, (r, c)).astype("float32")
arr.dtype

rows = np.random.randint(0, r, (nnans,))
cols = np.random.randint(0, c, (nnans,))
arr[rows, cols] = np.nan    # ok, but only because we set dtype=float32;
                            # assigning NaN to an integer array would raise,
                            # since in NumPy NaN exists only for float dtypes

df = pd.DataFrame(arr)
df.dtypes   # not good having floats for int...
for c in df.columns: df[c] = df[c].astype(pd.Int16Dtype())
df
# so pandas accepts NaN for integer columns, but only with its own nullable Int dtypes

# note: df = df.astype("Int16") converts every column in one call

#%% back to replacing missing data
df
countries = np.array(['SK', 'CZ', 'HG', 'PL'])
key = countries[np.random.randint(0, 4, 10)]

dfg = df.groupby(key)
dfg.groups
dfg.get_group('PL')
dfg.count()    # non NA counts !
Example No. 24
def compute_trips(id_, host_url, offset, limit, sql_dir, psql_credentials,
                  csv_dir, suffix, chunksize):
    """
    Compute query result from Open Trip Planner and save to RESULTS.trips (for example see Google Docs) for each given
    trip, defined by the following attributes/parameters:
        1. OA ID
        2. POI ID
        3. Timestamp (date + time)
    The function does the following:
        1. Loop: Read `chunksize` rows of MODEL.trips into memory, generate corresponding OTP queries
        2. Run the queries and save results to `results.csv`
        3. Save `results.csv` back to RESULTS.trips

    Parameters
    ----------
    id_ : int
        The id number of the portion of the trips table that is being read (e.g. if 6 OTPs are available, we'd split into 6 ID numbers)
    host_url : str
        Base url (of local server) for an OTP query
        Example: 'http://localhost:8080'
    offset : int
        Number of rows to offset, to begin portion {id_} of the table
    limit : int
        Number of rows to limit query to. {offset} + {limit} gives the end trip number of the table
    sql_dir : str
        Directory that stores query_trip_info.sql
    psql_credentials : dict
        Dictionary of PSQL credentials in order to create SQLAlchemy engine
    csv_dir : str
        Directory to save results in csv formats
    suffix : str
        Suffix to append to 'results.trips' as the table name
    chunksize: int
        Rows will be read in batches of this size at a time; all rows will be read at once if not specified

    Returns
    -------
    None

    """

    print(f"{id_} on {host_url} for offset {offset} limit {limit}")

    query_sql_file = os.path.join(sql_dir, 'query_trip_info.sql')
    params = {'suffix': suffix, 'limit': limit, 'offset': offset}
    engine = create_connection_from_dict(psql_credentials, 'postgresql')

    count = 1

    # We chunk up the portion received in order to not crash a DF's memory
    for chunk in execute_sql(query_sql_file,
                             engine,
                             read_file=True,
                             return_df=True,
                             params=params,
                             chunksize=chunksize):
        # Get OTP response
        print(
            f"Getting response from Chunk {count} on OTP {host_url}, for results.trip{suffix}{id_}"
        )
        chunk['response'] = chunk.apply(lambda row: otp.request_otp(
            host_url, row.oa_lat, row.poi_lat, row.oa_lon, row.poi_lon, row.
            date, row.time),
                                        axis=1)

        # Parse OTP response
        chunk[[
            "departure_time", "arrival_time", "total_time", "walk_time",
            "transfer_wait_time", "initial_wait_time", "transit_time",
            "walk_dist", "transit_dist", "total_dist", "num_transfers", "fare"
        ]] = chunk.apply(
            lambda row: otp.parse_response(row.response, row.date, row.time),
            axis=1,
            result_type="expand")
        chunk = chunk[[
            "trip_id", "departure_time", "arrival_time", "total_time",
            "walk_time", "transfer_wait_time", "initial_wait_time",
            "transit_time", "walk_dist", "transit_dist", "total_dist",
            "num_transfers", "fare"
        ]]
        chunk.num_transfers = chunk.num_transfers.astype(pd.Int16Dtype())
        chunk.set_index('trip_id', inplace=True)

        # Write response to CSV
        print(
            f"Writing response to CSV from chunk {count} on OTP {host_url}, for results.trip{suffix}{id_}"
        )
        chunk.to_csv(os.path.join(csv_dir, f"trips{suffix}{id_}.csv"),
                     mode='a',
                     header=False)
        count += 1

    # Copy CSV with this portion to DB
    print(f"Copying csv's to db for results.trips{suffix}{id_}")
    copy_text_to_db(os.path.join(csv_dir, f"trips{suffix}{id_}.csv"),
                    f'results.trips{suffix}',
                    engine,
                    mode='append',
                    header=False)

    # Update the model trips table so we know these trips have been computed, if we ever re-run the pipeline with "append" mode
    update_sql_file = os.path.join(sql_dir, 'update_computed_model_trips.sql')
    execute_sql(update_sql_file, engine, read_file=True, params=params)