Пример #1
0
def series2col(s, name):
    """Convert a pandas Series into an ``fpb.Column`` of kind ``SLICE``.

    The dtype of ``s`` decides which payload field of the column is filled
    (``ints``, ``floats``, ``strings``, ``bools`` or ``times``) and which
    ``fpb`` dtype tag is attached to it.

    Args:
        s: The pandas Series holding the column values.
        name: The name stored on the resulting column.

    Returns:
        An ``fpb.Column`` built from the collected keyword arguments.

    Raises:
        WriteError: If the dtype of ``s`` falls into none of the supported
            categories.
    """
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    # Pandas stores str columns with the object dtype. Compare against the
    # builtin ``object``: the ``np.object`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    elif s.dtype == object:
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz is not None:
            # The values are already tz-aware, so they must be converted;
            # tz_localize() raises TypeError on tz-aware data.
            s = s.dt.tz_convert('UTC')
        # Ship the timestamps as 64-bit integers (presumably nanoseconds
        # since the epoch for datetime64[ns] data - confirm).
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
Пример #2
0
def get_actual_types(df):
    """Map every column of ``df`` to the ``fpb`` dtype tag describing it.

    Columns are classified by their dtype. Columns matched by ``is_string``
    are resolved from their first non-null value; when such a column holds
    no non-null value at all it is tagged ``fpb.NULL``.

    Args:
        df: The DataFrame whose columns are classified.

    Returns:
        A dict keyed by column name whose values are ``fpb`` dtype tags.

    Raises:
        WriteError: If a column dtype is not supported, or if the first
            non-null value of an ``is_string`` column is not a str, bool,
            ``pd.Timestamp`` or ``datetime``.
    """
    def tag_from_first_value(series, label):
        # Only the first non-null value decides the tag.
        for value in series:
            if pd.isnull(value):
                continue
            if isinstance(value, str):
                return fpb.STRING
            if isinstance(value, bool):
                return fpb.BOOLEAN
            if isinstance(value, (pd.Timestamp, datetime)):
                return fpb.TIME
            raise WriteError(
                '{} - contains an unsupported value type - {}'.format(
                    label, type(value)))
        # Every value was null, so there is nothing to infer a type from.
        return fpb.NULL

    resolved = {}

    for col_name in df.columns:
        series = df[col_name]
        dtype = series.dtype

        if is_integer(dtype):
            tag = fpb.INTEGER
        elif is_float(dtype):
            tag = fpb.FLOAT
        elif is_string(dtype):
            tag = tag_from_first_value(series, col_name)
        elif is_bool(dtype):
            tag = fpb.BOOLEAN
        elif is_datetime(dtype):
            tag = fpb.TIME
        elif is_categorical_dtype(dtype):
            # We assume categorical data is strings
            tag = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                col_name, dtype))

        resolved[series.name] = tag

    return resolved
Пример #3
0
def _infer_object_dtype(arr):
    """Infer a dtype name for a sequence of Python objects.

    Args:
        arr: A pandas Series (its ``.values`` are inspected) or any other
            array-like accepted by ``pd.isnull``.

    Returns:
        When at least one value is null: ``'boolean'`` if every non-null
        value passes ``pdcom.is_bool`` (this includes input that is entirely
        null), otherwise ``'string'``. When no value is null: whatever
        ``infer_dtype`` reports for the values.
    """
    # TODO: accelerate with Cython/C

    avalues = arr.values if isinstance(arr, pd.Series) else arr
    nulls = pd.isnull(avalues)

    if not nulls.any():
        return infer_dtype(avalues)

    # Nulls are ignored; all() stops at the first non-null value that is not
    # a bool, which is the point where the answer becomes 'string'.
    only_bools = all(
        is_null or pdcom.is_bool(value)
        for value, is_null in zip(avalues, nulls))
    return 'boolean' if only_bools else 'string'
Пример #4
0
def check_if_categorical(feature):
    """Report whether ``feature`` should be treated as categorical.

    The checks run in order and stop at the first truthy one:
    ``is_categorical``, ``is_string_dtype``, then ``is_bool``.

    Args:
        feature: The object handed to each of the three predicates.

    Returns:
        The first truthy predicate result, or the result of ``is_bool`` when
        the first two are falsy.
    """
    # NOTE(review): if ``is_bool`` is pandas.api.types.is_bool it only
    # recognises scalar booleans and never matches a Series or dtype -
    # confirm whether a dtype-level check was intended.
    verdict = is_categorical(feature)
    if not verdict:
        verdict = is_string_dtype(feature)
    if not verdict:
        verdict = is_bool(feature)
    return verdict
Пример #5
0
def make_nums_from_bool(df):
    """Replace the boolean columns of ``df`` with 1/0 integers, in place.

    Every column for which ``is_bool`` is truthy is overwritten with the
    integer cast of its element-wise comparison against ``True``. Other
    columns are left untouched.

    Args:
        df: The DataFrame to modify; it is mutated and nothing is returned.
    """
    # NOTE(review): ``is_bool`` receives a whole column here. If it is
    # pandas.api.types.is_bool (a scalar check) no column will ever match -
    # confirm that it is a dtype-level check.
    for column_name, column in df.items():
        if not is_bool(column):
            continue
        # Build the True/False mask first, then cast it to 1/0.
        truth_mask = df[column_name] == True
        df[column_name] = truth_mask.astype(int)