Example #1
def from_wkb(data):
    """
    Convert a list or array of wkb objects to a GeoArray.
    :param data: array-like
            list or array of wkb objects
    :return: GeoArray
    """
    # pandas.api.types.infer_dtype can't infer a custom ExtensionDtype
    if not isinstance(getattr(data, "dtype", None),
                      GeoDtype) and len(data) != 0:
        from pandas.api.types import infer_dtype
        inferred = infer_dtype(data, skipna=True)
        if inferred in ("bytes", "empty"):
            pass
        else:
            raise ValueError("'data' must be bytes type array or list.")
    if not isinstance(data, np.ndarray):
        array = np.empty(len(data), dtype=object)
        array[:] = data
    else:
        array = data

    mask = pd.isna(array)
    array[mask] = None
    return GeoArray(array)
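A quick sanity check of the inference this example relies on (a sketch assuming a recent pandas; the byte strings are made-up placeholders, not real WKB):

from pandas.api.types import infer_dtype

# bytes values and all-missing input are the two cases from_wkb accepts
print(infer_dtype([b"\x01\x01", b"\x01\x02"], skipna=True))  # 'bytes'
print(infer_dtype([None, None], skipna=True))                # 'empty'
print(infer_dtype(["POINT (0 0)"], skipna=True))             # 'string' -> rejected here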
Example #2
File: core.py Project: mcleonard/tufty
def infer_vegalite_type(data):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    """
    # Infer based on the dtype of the input
    typ = infer_dtype(data)

    # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py

    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean', 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta',
                 'timedelta64', 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{}'.  "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
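For reference, a small demonstration (assuming a recent pandas) of the infer_dtype strings that drive each branch of infer_vegalite_type:

import pandas as pd
from pandas.api.types import infer_dtype

print(infer_dtype(pd.Series([1, 2, 3])))                       # 'integer'    -> 'quantitative'
print(infer_dtype(pd.Series(["a", "b"])))                      # 'string'     -> 'nominal'
print(infer_dtype(pd.to_datetime(pd.Series(["2021-01-01"]))))  # 'datetime64' -> 'temporal'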
Example #3
    def __init__(self, data=None, index=None, name=None, crs=None, **kwargs):

        if hasattr(data, "crs") and crs:
            if not data.crs:
                data = data.copy()
            else:
                raise ValueError(
                    "crs of the passed geometry data differs from the given crs.")
        # scalar wkb or wkt
        if isinstance(data, (bytes, str)):
            n = len(index) if index is not None else 1
            data = [data] * n

        if not is_geometry_array(data):
            s = Series(data, index=index, name=name, **kwargs)
            if s.empty:
                s = s.astype(bytes)
            else:
                from pandas.api.types import infer_dtype
                inferred = infer_dtype(s, skipna=True)
                if inferred in ("bytes", "empty"):
                    pass
                elif inferred == "string":
                    s = arctern.ST_GeomFromText(s)
                else:
                    raise TypeError(
                        "Cannot construct a GeoSeries from data that is "
                        "neither bytes nor string."
                    )
            data = GeoArray(s.values)

        super().__init__(data, index=index, name=name, **kwargs)

        self._crs = None
        self.set_crs(crs)
Example #4
def infer_vegalite_type(data, field=None):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    field: str column name
    """
    # See if we can read the type from the field
    if field is not None:
        parsed = parse_shorthand(field)
        if parsed.get('type'):
            return parsed['type']

    # Otherwise, infer based on the dtype of the input
    typ = infer_dtype(data)

    # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py

    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean', 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta',
                 'timedelta64', 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{0}'.  "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
Example #5
def from_wkb_or_wkt(data):
    """
    Convert a list or array of WKB or WKT values to a GeoArray.
    :param data: array-like
            list or array of WKB bytes or WKT strings
    :return: GeoArray
    """
    if not isinstance(data, np.ndarray):
        array = np.empty(len(data), dtype=object)
        array[:] = data
    else:
        array = data.astype(object)

    mask = pd.isna(array)
    array[mask] = None

    if not isinstance(getattr(array, "dtype", None),
                      GeoDtype) and len(array) != 0:
        from pandas.api.types import infer_dtype
        inferred = infer_dtype(array, skipna=True)
        if inferred in ("bytes", "empty"):
            pass
        elif inferred == "string":
            array = arctern.ST_GeomFromText(array).values
        else:
            raise TypeError("'data' must be bytes type array or list.")
    return GeoArray(array)
Example #6
    def ensure_stream(self, key, value=None):
        self.graph.add(key, [])

        if key not in self.streams:
            configuration = self._stream_conf(key)
            dtype = configuration.get("dtype", infer_dtype([value]) if value is not None else None)
            self.add_stream(key, Stream(name=key, dtype=dtype, configuration=configuration))
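Note the [value] wrapping above: infer_dtype expects list-like input and raises on a bare scalar. A minimal illustration (assuming a recent pandas):

from pandas.api.types import infer_dtype

print(infer_dtype([3.14]))  # 'floating'
print(infer_dtype([True]))  # 'boolean'
# infer_dtype(3.14) would raise: scalars must be wrapped in a list first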
Example #7
File: core.py Project: rpmunoz/altair
def infer_vegalite_type(data, field=None):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    field: str column name
    """
    # See if we can read the type from the field
    if field is not None:
        parsed = parse_shorthand(field)
        if parsed.get('type'):
            return parsed['type']

    # Otherwise, infer based on the dtype of the input
    typ = infer_dtype(data)

    # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py

    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean', 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta',
                 'timedelta64', 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{0}'.  "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
Example #8
def _ensure_supported_dtypes(array):
    # We only support these types for now, as we need to read them in Java
    if array.dtype.kind == 'i':
        array = array.astype('<i8')
    elif array.dtype.kind == 'f':
        array = array.astype('<f8')
    elif array.dtype.kind in ('O', 'U', 'S'):
        if array.dtype.kind == 'O' and infer_dtype(array) not in [
                'unicode', 'string', 'bytes'
        ]:
            # `string` in python2 and `bytes` in python3
            raise UnhandledDtypeException(
                "Casting object column to string failed")
        try:
            array = array.astype(np.unicode_)
        except (UnicodeDecodeError, SystemError):
            # `UnicodeDecodeError` in python2 and `SystemError` in python3
            array = np.array([s.decode('utf-8') for s in array])
        except Exception:
            raise UnhandledDtypeException(
                "Only unicode and utf8 strings are supported.")
    else:
        raise UnhandledDtypeException(
            "Unsupported dtype '%s' - only int64, float64 and U are supported"
            % array.dtype)
    # Everything is little endian in tickstore
    if array.dtype.byteorder != '<':
        array = array.astype(array.dtype.newbyteorder('<'))
    return array
Example #9
    def _get_categorical_columns(self):
        result = [
            col for col in self._obj.columns
            if infer_dtype(self._obj[col]) in
            ['object', 'string', 'category', 'categorical']
        ]
        self._categorical_columns = result
        return result
Example #10
def infer_vegalite_type(data):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    """
    # Infer based on the dtype of the input
    typ = infer_dtype(data, **_infer_dtype_kwds)

    # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py

    if typ in [
            'floating', 'mixed-integer-float', 'integer', 'mixed-integer',
            'complex'
    ]:
        return 'quantitative'
    elif typ in [
            'string', 'bytes', 'categorical', 'boolean', 'mixed', 'unicode'
    ]:
        return 'nominal'
    elif typ in [
            'datetime', 'datetime64', 'timedelta', 'timedelta64', 'date',
            'time', 'period'
    ]:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{}'.  "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
Example #11
    def _calculate_pattern(self):
        from pandas.api.types import infer_dtype
        self._type = infer_dtype(self, skipna=True)
        if self._type == 'integer':
            pass
        elif self._type == 'floating' or self._type == 'mixed-integer-float':
            self._type = 'float'
        elif self._type in ['string', 'mixed-integer', 'mixed']:
            self._type = 'string'
            if all(map(utils.is_datetime, self._values)):
                self._type = 'datetime'

        # fill the missing values with the most frequent value
        if self.hasnans:
            self.fillna(self.mode()[0], inplace=True)

        # datetime attributes are converted to seconds since the Unix epoch
        if self.type == 'datetime':
            self.update(self.map(self._to_seconds))

        if self.type == 'float':
            self._decimals = self.decimals()

        # The `categorical` option can be set to true when the attribute is
        # string-typed and its values are not all unique; the user can
        # override this value.
        self.categorical = self.categorical or (self.type == 'string'
                                                and not self.is_unique)
        self._set_domain()
        self._set_distribution()
Example #12
def from_example(name, configuration, date, example):
    if "dtype" not in configuration:
        configuration["dtype"] = infer_dtype([example])
    tree = DateTree(name,
                    index=[pd.to_datetime(date, utc=True)],
                    values=[example],
                    **configuration)
    return tree
Example #13
    def generate_profile(self):
        """
        Generate a profile JSON output for the data.

        Returns:
            JSON
        """

        _data = self.data

        fields = []
        for k in _data.keys():
            typ = ptypes.infer_dtype(_data[k])

            if typ == "floating":
                fields.append({
                    "name":
                    k,
                    "min":
                    _data[k].min(),
                    "max":
                    _data[k].max(),
                    "mean":
                    _data[k].mean(),
                    "std":
                    _data[k].std(),
                    "quantiles":
                    [_data[k].quantile(f / 10) for f in range(0, 10, 1)]
                })
            elif typ == "string":
                if len(_data[k].unique()) < _MAX_UNIQ:
                    fields.append({
                        "name":
                        k,
                        "options": [{
                            "value": v,
                            "percent": sum(_data[k] == v) / len(_data)
                        } for v in _data[k].unique()]
                    })
                else:
                    try:
                        dateparser.parse(_data[k][0])
                        _data[k] = pd.to_datetime(_data[k])
                        fields.append({
                            "name": k,
                            "type": "datetime",
                            "min": _data[k].min(),
                            "max": _data[k].max()
                        })
                    except Exception as e:
                        # print(e)
                        fields.append({
                            "name": k,
                            "type": "string",
                            "is_uniq": _data[k].is_unique
                        })

        return [{'columns': fields}]
Example #14
    def _clean_cols_for_hdf(data):
        types = data.apply(lambda x: infer_dtype(x.values))
        for col in types.index:
            data[col] = pd.to_numeric(data[col])
        return data
Example #15
def infer_schema(survey_df, metadata_df):
    """Get schema from dataframe.

    Parameters
    ----------
    survey_df : instance of pd.DataFrame
        The dataframe for which to infer the schema.
    metadata_df : instance of pd.DataFrame
        The metadata dataframe containing information
        about all the columns in all the surveys.
    """
    # pandas to bigquery datatype mapping
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema
    mapping = dict(floating='float', integer='integer', string='string')

    dtypes = survey_df.dtypes.to_dict()
    schema = dict()
    for column in survey_df.columns:

        choice = dict()
        question = ''
        field_type = ''
        if column in metadata_df.index:
            row = metadata_df.loc[column]

            question = row['field_label']
            if question.startswith('<'):  # html
                question = ''

            field_type = row['field_type']

            choices = row['select_choices_or_calculations']
            if not pd.isnull(choices):
                choices = choices.split('|')
                for c in choices:
                    k, v = c.strip().split(', ')
                    choice[k] = v
        else:
            print(f'Skipping {column}')

        dtype = infer_dtype(survey_df[column], skipna=True)
        dtype = mapping[dtype]
        if dtype == 'string':
            val = survey_df[column].dropna().iloc[0]
        if column.startswith('date'):  # hardcode for now
            dtype = 'datetime'

        schema[column] = {
            'name': column,
            'type': dtype,
            'mode': 'NULLABLE',
            'choices': choice,
            'question': question,
            'field_type': field_type
        }

    return schema
Example #16
def gen_min_itemsize(df, print_result=False):
    """Helper for generating min_itemsize (inspects the max length of string columns)."""
    res = {}
    for col in df.columns:
        if infer_dtype(df[col]) == 'string':
            res[col] = df[col].str.len().max()
    if print_result:
        pprint(res)
    return res
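A hypothetical usage sketch (the DataFrame is made up for illustration, and gen_min_itemsize from the example above is assumed to be in scope):

import pandas as pd

df = pd.DataFrame({"city": ["Beijing", "Shanghai"], "pop": [2154, 2424]})
# only the string-typed column is measured; 'Shanghai' is 8 characters long
print(gen_min_itemsize(df))  # {'city': 8}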
Example #17
    def add(self, date, state):
        if pd.isna(state) and self.empty:
            return

        if self.dtype is None:
            self.dtype = infer_dtype([state])

        if self._tree is None or self._tree.empty:
            self._tree = DateTree.from_example(self.name, self.configuration, date, example=state)
        else:
            self._tree.add(date, state)
Example #18
def _infer_atomic_data_type(column: pd.Series) -> Any:
    """
    Function to infer the atomic data type for a column.

    Parameters
    ----------
    column
        A Column of a Pandas DataFrame to be checked.
    """

    return infer_dtype(column[column.apply(_check_valid_values, 0)])
Example #19
    def __init__(self, data, name=None, dtype=None, index=None, copy=False,
                 fastpath=False, categorical=False):
        """
        A Series with extra information, e.g. categorical.

        Parameters
        ----------
        categorical : bool
            set categorical label for attribute. If categorical, this attribute
            takes on a limited and fixed number of possible values. Examples:
            blood type, gender.
        """
        Series.__init__(self, data, name=name, dtype=dtype, index=index,
                        copy=copy, fastpath=fastpath)

        # bins can be an int (number of histogram bins) or a str (algorithm name)
        self._bins = ds4ml.params['attribute.bins']

        self._min = None
        self._max = None
        self._step = None

        # probability distribution (pr)
        self.bins = None
        self.prs = None

        from pandas.api.types import infer_dtype
        # atype: data type used to handle different kinds of attributes in data
        # synthesis; supported: integer, float, string, datetime.
        self.atype = infer_dtype(self, skipna=True)
        if self.atype == 'integer':
            pass
        elif self.atype == 'floating' or self.atype == 'mixed-integer-float':
            self.atype = 'float'
        elif self.atype in ['string', 'mixed-integer', 'mixed']:
            self.atype = 'string'
            if all(map(utils.is_datetime, self._values)):
                self.atype = 'datetime'

        # fill the missing values with the most frequent value
        self.fillna(self.mode()[0], inplace=True)

        # special handling for datetime attribute
        if self.atype == 'datetime':
            self.update(self.map(self._to_seconds).map(self._date_formatter))

        if self.atype == 'float':
            self._decimals = self.decimals()

        # Decide whether the attribute is categorical.
        self.categorical = categorical or (
                self.atype == 'string' and not self.is_unique)
        self._set_domain()
        self._set_distribution()
Example #20
    def _get_stats(self):
        counts = self.df.count()
        counts.name = 'counts'
        uniques = self._get_uniques()
        missing = self._get_missing(counts)
        stats = pd.concat([counts, uniques, missing], axis=1, sort=False)

        # setting types
        stats['types'] = ''
        for idx in stats.index:
            stats.loc[idx, 'types'] = infer_dtype(self.df[idx])
        return stats.transpose()[self.df.columns]
Example #21
File: devices.py Project: tcsvn/pyadlml
def inferdtypes(df):
    """
    device dataframe

    returns list of tuples with device and corresponding dtype
    """
    dev_lst = df[DEVICE].unique()
    res_lst = []
    for dev in dev_lst:
        vals = df[df[DEVICE] == dev][VAL]
        dtype = infer_dtype(vals)
        res_lst.append((dev, dtype))
    return res_lst
Example #22
    def _convert_types(self, a):
        """
        Converts object arrays of strings to numpy string arrays
        """
        # No conversion for scalar type
        if a.dtype != 'object':
            return a, None

        # We can't infer the type of an empty array, so just
        # assume strings
        if len(a) == 0:
            return a.astype('U1'), None

        # Compute a mask of missing values. Replace NaNs and Nones with
        # empty strings so that type inference has a chance.
        mask = pd.isnull(a)
        if mask.sum() > 0:
            a = a.copy()
            np.putmask(a, mask, '')
        else:
            mask = None

        if infer_dtype(a, skipna=False) == 'mixed':
            # assume it's a string, otherwise raise an error
            try:
                a = np.array([s.encode('ascii') for s in a])
                a = a.astype('O')
            except Exception:
                raise ValueError(
                    "Column of type 'mixed' cannot be converted to string")

        type_ = infer_dtype(a, skipna=False)
        if type_ in ['unicode', 'string']:
            max_len = max_len_string_array(a)
            return a.astype('U{:d}'.format(max_len)), mask
        else:
            raise ValueError('Cannot store arrays with {} dtype'.format(type_))
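For context, a small check of when infer_dtype reports 'mixed' rather than 'string' (a sketch assuming a recent pandas; the arrays are illustrative):

import numpy as np
from pandas.api.types import infer_dtype

print(infer_dtype(np.array(["a", "b"], dtype=object), skipna=False))  # 'string'
print(infer_dtype(np.array(["a", 1], dtype=object), skipna=False))    # 'mixed-integer'
print(infer_dtype(np.array(["a", 1.5], dtype=object), skipna=False))  # 'mixed'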
Example #23
    def _convert_types(self, a):
        """
        Converts object arrays of strings to numpy string arrays
        """
        # No conversion for scalar type
        if a.dtype != 'object':
            return a, None

        # We can't infer the type of an empty array, so just
        # assume strings
        if len(a) == 0:
            return a.astype('U1'), None

        # Compute a mask of missing values. Replace NaNs and Nones with
        # empty strings so that type inference has a chance.
        mask = pd.isnull(a)
        if mask.sum() > 0:
            a = a.copy()
            np.putmask(a, mask, '')
        else:
            mask = None

        if infer_dtype(a, skipna=False) == 'mixed':
            # assume it's a string, otherwise raise an error
            try:
                a = np.array([s.encode('ascii') for s in a])
                a = a.astype('O')
            except Exception:
                raise ValueError("Column of type 'mixed' cannot be converted to string")

        type_ = infer_dtype(a, skipna=False)
        if type_ in ['unicode', 'string']:
            max_len = max_len_string_array(a)
            return a.astype('U{:d}'.format(max_len)), mask
        else:
            raise ValueError('Cannot store arrays with {} dtype'.format(type_))
Example #24
    def docify(self, df):
        """
        Convert a Pandas DataFrame to SON.

        Parameters
        ----------
        df:  DataFrame
            The Pandas DataFrame to encode
        """
        dtypes = {}
        masks = {}
        lengths = {}
        columns = []
        data = Binary(b'')
        start = 0

        arrays = []
        for c in df:
            try:
                columns.append(str(c))
                arr, mask = self._convert_types(df[c].values)
                dtypes[str(c)] = arr.dtype.str
                if mask is not None:
                    masks[str(c)] = Binary(compress(mask.tostring()))
                arrays.append(arr.tostring())
            except Exception as e:
                typ = infer_dtype(df[c], skipna=False)
                msg = "Column '{}' type is {}".format(str(c), typ)
                logging.warning(msg)
                raise e

        arrays = compress_array(arrays)
        for index, c in enumerate(df):
            d = Binary(arrays[index])
            lengths[str(c)] = (start, start + len(d) - 1)
            start += len(d)
            data += d

        doc = SON({DATA: data, METADATA: {}})
        doc[METADATA] = {
            COLUMNS: columns,
            MASK: masks,
            LENGTHS: lengths,
            DTYPE: dtypes
        }

        return doc
Example #25
    def docify(self, df):
        """
        Convert a Pandas DataFrame to SON.

        Parameters
        ----------
        df:  DataFrame
            The Pandas DataFrame to encode
        """
        dtypes = {}
        masks = {}
        lengths = {}
        columns = []
        data = Binary(b'')
        start = 0

        arrays = []
        for c in df:
            try:
                columns.append(str(c))
                arr, mask = self._convert_types(df[c].values)
                dtypes[str(c)] = arr.dtype.str
                if mask is not None:
                    masks[str(c)] = Binary(compress(mask.tostring()))
                arrays.append(arr.tostring())
            except Exception as e:
                typ = infer_dtype(df[c], skipna=False)
                msg = "Column '{}' type is {}".format(str(c), typ)
                logging.info(msg)
                raise e

        arrays = compress_array(arrays)
        for index, c in enumerate(df):
            d = Binary(arrays[index])
            lengths[str(c)] = (start, start + len(d) - 1)
            start += len(d)
            data += d

        doc = SON({DATA: data, METADATA: {}})
        doc[METADATA] = {COLUMNS: columns,
                         MASK: masks,
                         LENGTHS: lengths,
                         DTYPE: dtypes
                         }

        return doc
Example #26
def _fill_missing_values_column(column: pd.Series) -> None:
    """
    Function to standardize the missing values of a column, depending on its detected data type:
    1. For date, change to pd.NaT
    2. For any other data type, change to np.nan

    Parameters
    ----------
    column
        Columns of a Pandas DataFrame to be standardized.
    """

    if infer_dtype(column) == "date":
        column[column.apply(_check_null_values, 0)] = pd.NaT
        column = pd.to_datetime(column)
    else:
        column[column.apply(_check_null_values, 0)] = np.nan
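A short check of the 'date' inference relied on here (assuming a recent pandas): infer_dtype distinguishes datetime.date objects from full datetimes.

import datetime
from pandas.api.types import infer_dtype

print(infer_dtype([datetime.date(2020, 1, 1)]))      # 'date'
print(infer_dtype([datetime.datetime(2020, 1, 1)]))  # 'datetime'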
Example #27
def generate_tuples():
    filename = '/home/cot/hw5-overbyc-ilya-shpitser/seer_app/patient_table_data5000.csv'
    self = data_import(filename)

    x_tuple = ()
    y_tuple = ()
    x_tuples = []
    y_tuples = []
    for col in self:
        typ = ptypes.infer_dtype(self[col])
        if typ == "integer":
            y_tuple = (col, col)
            y_tuples.append(y_tuple)  # continuous variables
        elif typ == "string":
            x_tuple = (col, col)
            x_tuples.append(x_tuple)  # categorical variables
    tuples = [x_tuples, y_tuples]
    return tuples
Example #28
def generate_tuples():
    self = data_import('1000.csv')

    x_tuple = ()
    y_tuple = ()
    x_tuples = []
    y_tuples = []
    for col in self:
        if col not in ['subject_id', 'icd9_code']:
            typ = ptypes.infer_dtype(self[col])
            if typ == "integer":
                x_tuple = (col, col)
                x_tuples.append(x_tuple)  # continuous variables
            elif typ == "string":
                x_tuple = (col, col)
                x_tuples.append(x_tuple)  # categorical variables
    tuples = [x_tuples]
    return tuples
Example #29
    def fillna(self, value=None, method=None, limit=None):
        from pandas.util._validators import validate_fillna_kwargs
        value, method = validate_fillna_kwargs(value, method)

        mask = self.isna()
        from pandas.api.types import is_array_like, infer_dtype
        if is_array_like(value):
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}")
            value = value[mask]
        else:
            # infer_dtype can't infer the type of a bare scalar, so wrap the value in a list
            value = [value]
        if mask.any():
            if method is not None:
                from pandas.core.missing import pad_1d
                from pandas.core.missing import backfill_1d
                func = pad_1d if method == "pad" else backfill_1d
                new_values = func(self.astype(object), limit=limit, mask=mask)
                new_values = self._from_sequence(new_values, dtype=self.dtype)
                # raise NotImplementedError("not support fillna with method")
            else:
                # translate value
                if not isinstance(getattr(value, "dtype", value),
                                  (GeoDtype, type(None))):
                    inferred_type = infer_dtype(value, skipna=True)
                    if inferred_type == "string":
                        value = arctern.ST_GeomFromText(value)
                    elif inferred_type == "bytes":
                        pass
                    else:
                        raise ValueError(
                            "can only fillna with wkt formed string or wkb formed bytes"
                        )

                # fill with value
                new_values = self.copy()
                new_values[mask] = value
        else:
            new_values = self.copy()
        return new_values
Example #30
def infer_file_cols_dtypes(filepath, ftype='csv', skipna=True):
    '''
        Returns:
            (dict): inferred dtypes of columns, {COLNAME: DTYPE}
    '''
    fpath = Path(filepath)
    if not fpath.exists():
        msg = f'You must provide an existing, valid path; {fpath} does not exist.'
        ERROR['error_invalid_path'](msg)
    err = lambda x, t: ERROR['error_invalid_path'](
        f'{x} is not a {t.upper()} file.')

    if ftype == 'csv':
        if fpath.suffix.lower() != '.csv': err(fpath, 'csv')
        temp_df = pd.read_csv(fpath, nrows=5)

    types = {}
    for c in temp_df.columns:
        types[c] = infer_dtype(temp_df[c], skipna=skipna)
    return types
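The skipna flag this helper exposes changes how missing values affect the result; a small illustration (assuming a recent pandas):

import numpy as np
from pandas.api.types import infer_dtype

vals = ["a", "b", np.nan]
print(infer_dtype(vals, skipna=True))   # 'string' -- NaNs are dropped before inference
print(infer_dtype(vals, skipna=False))  # 'mixed'  -- the float NaN is counted as a value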
Example #31
def _infer_types(df):
    """
    dataframe in raw representation where the columns correspond to devices
    """
    dev_cat = []
    dev_bool = []
    dev_num = []

    dev_lst = df.columns[1:]
    for dev in dev_lst:
        inf = infer_dtype(df[dev], skipna=True)
        if inf in ('string', 'object'):
            dev_cat.append(dev)
        elif inf == 'boolean':
            dev_bool.append(dev)
        elif inf == 'floating':
            dev_num.append(dev)
        else:
            raise ValueError(
                'could not infer correct dtype for device {}'.format(dev))

    return {'categorical': dev_cat, 'boolean': dev_bool, 'numerical': dev_num}
Example #32
def _infer_object_dtype(arr):
    # TODO: accelerate with Cython/C

    BOOLEAN, STRING = 0, 1
    state = BOOLEAN

    avalues = arr.values if isinstance(arr, pd.Series) else arr
    nulls = pd.isnull(avalues)

    if nulls.any():
        for i in compat.range(len(avalues)):
            if state == BOOLEAN:
                if not nulls[i] and not pdcom.is_bool(avalues[i]):
                    state = STRING
            elif state == STRING:
                break
        if state == BOOLEAN:
            return 'boolean'
        elif state == STRING:
            return 'string'
    else:
        return infer_dtype(avalues)
Example #33
def _ensure_supported_dtypes(array):
    # We only support these types for now, as we need to read them in Java
    if array.dtype.kind == 'i':
        array = array.astype('<i8')
    elif array.dtype.kind == 'f':
        array = array.astype('<f8')
    elif array.dtype.kind in ('O', 'U', 'S'):
        if array.dtype.kind == 'O' and infer_dtype(array) not in ['unicode', 'string', 'bytes']:
            # `string` in python2 and `bytes` in python3
            raise UnhandledDtypeException("Casting object column to string failed")
        try:
            array = array.astype(np.unicode_)
        except (UnicodeDecodeError, SystemError):
            # `UnicodeDecodeError` in python2 and `SystemError` in python3
            array = np.array([s.decode('utf-8') for s in array])
        except Exception:
            raise UnhandledDtypeException("Only unicode and utf8 strings are supported.")
    else:
        raise UnhandledDtypeException("Unsupported dtype '%s' - only int64, float64 and U are supported" % array.dtype)
    # Everything is little endian in tickstore
    if array.dtype.byteorder != '<':
        array = array.astype(array.dtype.newbyteorder('<'))
    return array
Example #34
def extract_dataframe_dtypes(df: pd.DataFrame) -> List[GenericDataType]:
    """Serialize pandas/numpy dtypes to generic types"""

    # omitting string types as those will be the default type
    inferred_type_map: Dict[str, GenericDataType] = {
        "floating": GenericDataType.NUMERIC,
        "integer": GenericDataType.NUMERIC,
        "mixed-integer-float": GenericDataType.NUMERIC,
        "decimal": GenericDataType.NUMERIC,
        "boolean": GenericDataType.BOOLEAN,
        "datetime64": GenericDataType.TEMPORAL,
        "datetime": GenericDataType.TEMPORAL,
        "date": GenericDataType.TEMPORAL,
    }

    generic_types: List[GenericDataType] = []
    for column in df.columns:
        series = df[column]
        inferred_type = infer_dtype(series)
        generic_type = inferred_type_map.get(inferred_type, GenericDataType.STRING)
        generic_types.append(generic_type)

    return generic_types
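'decimal' and 'boolean' are among the less common inference results mapped above; a quick illustration (assuming a recent pandas):

from decimal import Decimal
import pandas as pd
from pandas.api.types import infer_dtype

print(infer_dtype(pd.Series([Decimal("1.5"), Decimal("2")])))  # 'decimal'
print(infer_dtype(pd.Series([True, False])))                   # 'boolean'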
Example #35
def infer_dtype_bydata(data):
    d_type = DataType.UNKNOWN
    if is_scalar(data):
        d_type = infer_dtype_by_scaladata(data)
        return d_type

    if is_list_like(data) or is_array_like(data):
        failed = False
        try:
            type_str = infer_dtype(data)
        except TypeError:
            failed = True
        if not failed:
            d_type = dtype_str_map.get(type_str, DataType.UNKNOWN)
            if is_numeric_datatype(d_type):
                d_type = DataType.FLOAT_VECTOR
            else:
                d_type = DataType.UNKNOWN

            return d_type

    if d_type == DataType.UNKNOWN:
        try:
            elem = data[0]
        except Exception:
            elem = None

        if elem is not None and is_scalar(elem):
            d_type = infer_dtype_by_scaladata(elem)

    if d_type == DataType.UNKNOWN:
        _dtype = getattr(data, "dtype", None)

        if _dtype is not None:
            d_type = map_numpy_dtype_to_datatype(_dtype)

    return d_type