def from_wkb(data):
    """
    Convert a list or array of wkb objects to a GeoArray.

    :param data: array-like, list or array of wkb objects
    :return: GeoArray
    """
    # pandas.api.types.infer_dtype can't infer a custom ExtensionDtype
    if not isinstance(getattr(data, "dtype", None), GeoDtype) and len(data) != 0:
        from pandas.api.types import infer_dtype
        inferred = infer_dtype(data, skipna=True)
        if inferred not in ("bytes", "empty"):
            raise ValueError("'data' must be a bytes-typed array or list.")
    if not isinstance(data, np.ndarray):
        array = np.empty(len(data), dtype=object)
        array[:] = data
    else:
        array = data
    mask = pd.isna(array)
    array[mask] = None
    return GeoArray(array)
def infer_vegalite_type(data):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    """
    # Infer based on the dtype of the input
    typ = infer_dtype(data)
    # TODO: Once this returns 'O', please update test_select_x and
    # test_select_y in test_api.py
    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean',
                 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta', 'timedelta64',
                 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{}'. "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
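# A minimal usage sketch for infer_vegalite_type above, assuming the function
# is in scope together with its module-level imports (`warnings` and
# `from pandas.api.types import infer_dtype`); the expected outputs are read
# off the branches, not taken from the original test suite.
import pandas as pd

print(infer_vegalite_type(pd.Series([1.5, 2.0, 3.25])))   # 'quantitative'
print(infer_vegalite_type(pd.Series(['a', 'b', 'a'])))    # 'nominal'
print(infer_vegalite_type(pd.to_datetime(pd.Series(['2021-01-01']))))  # 'temporal'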
def __init__(self, data=None, index=None, name=None, crs=None, **kwargs):
    if hasattr(data, "crs") and crs:
        if not data.crs:
            data = data.copy()
        else:
            raise ValueError(
                "crs of the passed geometry data is different from crs.")
    # scalar wkb or wkt
    if isinstance(data, (bytes, str)):
        n = len(index) if index is not None else 1
        data = [data] * n

    if not is_geometry_array(data):
        s = Series(data, index=index, name=name, **kwargs)
        if s.empty:
            s = s.astype(bytes)
        else:
            from pandas.api.types import infer_dtype
            inferred = infer_dtype(s, skipna=True)
            if inferred in ("bytes", "empty"):
                pass
            elif inferred == "string":
                s = arctern.ST_GeomFromText(s)
            else:
                raise TypeError(
                    "Cannot construct a GeoSeries from data that is "
                    "neither bytes nor string.")
        data = GeoArray(s.values)

    super().__init__(data, index=index, name=name, **kwargs)
    self._crs = None
    self.set_crs(crs)
def infer_vegalite_type(data, field=None):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    field: str column name
    """
    # See if we can read the type from the field
    if field is not None:
        parsed = parse_shorthand(field)
        if parsed.get('type'):
            return parsed['type']

    # Otherwise, infer based on the dtype of the input
    typ = infer_dtype(data)
    # TODO: Once this returns 'O', please update test_select_x and
    # test_select_y in test_api.py
    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean',
                 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta', 'timedelta64',
                 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{0}'. "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
def from_wkb_or_wkt(data):
    """
    Convert a list or array of wkb or wkt objects to a GeoArray.

    :param data: array-like, list or array of wkb or wkt objects
    :return: GeoArray
    """
    if not isinstance(data, np.ndarray):
        array = np.empty(len(data), dtype=object)
        array[:] = data
    else:
        array = data.astype(object)
    mask = pd.isna(array)
    array[mask] = None
    if not isinstance(getattr(array, "dtype", None), GeoDtype) and len(array) != 0:
        from pandas.api.types import infer_dtype
        inferred = infer_dtype(array, skipna=True)
        if inferred in ("bytes", "empty"):
            pass
        elif inferred == "string":
            array = arctern.ST_GeomFromText(array).values
        else:
            raise TypeError(
                "'data' must be a bytes or wkt string array or list.")
    return GeoArray(array)
def ensure_stream(self, key, value=None):
    self.graph.add(key, [])
    if key not in self.streams:
        configuration = self._stream_conf(key)
        # `value is not None` rather than truthiness, so falsy examples such
        # as 0 or False still drive dtype inference
        dtype = configuration.get(
            "dtype", infer_dtype([value]) if value is not None else None)
        self.add_stream(key, Stream(name=key, dtype=dtype,
                                    configuration=configuration))
def _get_categorical_columns(self):
    result = [
        col for col in self._obj.columns
        if infer_dtype(self._obj[col]) in ['object', 'string',
                                           'category', 'categorical']
    ]
    self._categorical_columns = result
    return result
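# Hedged sketch of the scan used by _get_categorical_columns above, run
# directly against a DataFrame instead of the accessor's self._obj:
import pandas as pd
from pandas.api.types import infer_dtype

df = pd.DataFrame({'city': ['NY', 'LA'], 'temp': [21.0, 25.5]})
categorical = [col for col in df.columns
               if infer_dtype(df[col]) in ['object', 'string',
                                           'category', 'categorical']]
print(categorical)  # ['city']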
def infer_vegalite_type(data):
    """
    From an array-like input, infer the correct vega typecode
    ('ordinal', 'nominal', 'quantitative', or 'temporal')

    Parameters
    ----------
    data: Numpy array or Pandas Series
    """
    # Infer based on the dtype of the input
    typ = infer_dtype(data, **_infer_dtype_kwds)
    # TODO: Once this returns 'O', please update test_select_x and
    # test_select_y in test_api.py
    if typ in ['floating', 'mixed-integer-float', 'integer',
               'mixed-integer', 'complex']:
        return 'quantitative'
    elif typ in ['string', 'bytes', 'categorical', 'boolean',
                 'mixed', 'unicode']:
        return 'nominal'
    elif typ in ['datetime', 'datetime64', 'timedelta', 'timedelta64',
                 'date', 'time', 'period']:
        return 'temporal'
    else:
        warnings.warn("I don't know how to infer vegalite type from '{}'. "
                      "Defaulting to nominal.".format(typ))
        return 'nominal'
def _calculate_pattern(self):
    from pandas.api.types import infer_dtype
    self._type = infer_dtype(self, skipna=True)
    if self._type == 'integer':
        pass
    elif self._type == 'floating' or self._type == 'mixed-integer-float':
        self._type = 'float'
    elif self._type in ['string', 'mixed-integer', 'mixed']:
        self._type = 'string'
        if all(map(utils.is_datetime, self._values)):
            self._type = 'datetime'

    # fill the missing values with the most frequent value
    if self.hasnans:
        self.fillna(self.mode()[0], inplace=True)

    # a datetime attribute is converted to seconds since the Unix epoch
    if self.type == 'datetime':
        self.update(self.map(self._to_seconds))
    if self.type == 'float':
        self._decimals = self.decimals()

    # The `categorical` option is set to true when the attribute is
    # string-typed and its values are not all unique; the user can
    # override this value.
    self.categorical = self.categorical or (
        self.type == 'string' and not self.is_unique)
    self._set_domain()
    self._set_distribution()
def from_example(name, configuration, date, example):
    if "dtype" not in configuration:
        configuration["dtype"] = infer_dtype([example])
    tree = DateTree(name,
                    index=[pd.to_datetime(date, utc=True)],
                    values=[example],
                    **configuration)
    return tree
def generate_profile(self):
    """
    Generate a profile JSON output for the data.

    Returns: JSON
    """
    _data = self.data
    fields = []
    for k in _data.keys():
        typ = ptypes.infer_dtype(_data[k])
        if typ == "floating":
            fields.append({
                "name": k,
                "min": _data[k].min(),
                "max": _data[k].max(),
                "mean": _data[k].mean(),
                "std": _data[k].std(),
                "quantiles": [_data[k].quantile(f / 10)
                              for f in range(0, 10, 1)]
            })
        elif typ == "string":
            if len(_data[k].unique()) < _MAX_UNIQ:
                fields.append({
                    "name": k,
                    "options": [{
                        "value": v,
                        "percent": sum(_data[k] == v) / len(_data)
                    } for v in _data[k].unique()]
                })
            else:
                try:
                    dateparser.parse(_data[k][0])
                    _data[k] = pd.to_datetime(_data[k])
                    fields.append({
                        "name": k,
                        "type": "datetime",
                        "min": _data[k].min(),
                        "max": _data[k].max()
                    })
                except Exception as e:
                    # print(e)
                    fields.append({
                        "name": k,
                        "type": "string",
                        "is_uniq": _data[k].is_unique
                    })
    return [{'columns': fields}]
def _clean_cols_for_hdf(data):
    types = data.apply(lambda x: infer_dtype(x.values))
    for col in types.index:
        data[col] = pd.to_numeric(data[col])
    return data
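# Minimal sketch exercising _clean_cols_for_hdf above: numeric values stored
# as strings are coerced back to numbers before writing to HDF. Assumes the
# function is in scope with `import pandas as pd` and
# `from pandas.api.types import infer_dtype`.
import pandas as pd

df = pd.DataFrame({'a': ['1', '2'], 'b': ['3.5', '4.5']})
print(_clean_cols_for_hdf(df).dtypes)  # a: int64, b: float64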
def infer_schema(survey_df, metadata_df):
    """Get schema from dataframe.

    Parameters
    ----------
    survey_df : instance of pd.Dataframe
        The dataframe for which to infer the schema.
    metadata_df : instance of pd.Dataframe
        The metadata dataframe containing information about
        all the columns in all the surveys.
    """
    # pandas to bigquery datatype mapping
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema
    mapping = dict(floating='float', integer='integer', string='string')

    dtypes = survey_df.dtypes.to_dict()
    schema = dict()
    for column in survey_df.columns:
        choice = dict()
        question = ''
        field_type = ''
        if column in metadata_df.index:
            row = metadata_df.loc[column]
            question = row['field_label']
            if question.startswith('<'):  # html
                question = ''
            field_type = row['field_type']
            choices = row['select_choices_or_calculations']
            if not pd.isnull(choices):
                choices = choices.split('|')
                for c in choices:
                    k, v = c.strip().split(', ')
                    choice[k] = v
        else:
            print(f'Skipping {column}')

        dtype = infer_dtype(survey_df[column], skipna=True)
        dtype = mapping[dtype]
        if dtype == 'string':
            val = survey_df[column].dropna().iloc[0]
            if column.startswith('date'):  # hardcode for now
                dtype = 'datetime'

        schema[column] = {
            'name': column,
            'type': dtype,
            'mode': 'NULLABLE',
            'choices': choice,
            'question': question,
            'field_type': field_type
        }
    return schema
def gen_min_itemsize(df, print_result=False):
    """Helper for generating min_itemsize (inspects the lengths of
    string-typed columns)."""
    res = {}
    for col in df.columns:
        if infer_dtype(df[col]) == 'string':
            res[col] = df[col].str.len().max()
    if print_result:
        pprint(res)
    return res
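# Usage sketch for gen_min_itemsize above (assumes the snippet's imports:
# pandas, pprint, and pandas.api.types.infer_dtype):
import pandas as pd

df = pd.DataFrame({'name': ['alice', 'bob'], 'score': [90, 85]})
print(gen_min_itemsize(df))  # {'name': 5} - longest string per string column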
def add(self, date, state):
    if pd.isna(state) and self.empty:
        return
    if self.dtype is None:
        self.dtype = infer_dtype([state])
    if self._tree is None or self._tree.empty:
        self._tree = DateTree.from_example(self.name, self.configuration,
                                           date, example=state)
    else:
        self._tree.add(date, state)
def _infer_atomic_data_type(column: pd.Series) -> Any:
    """
    Function to infer the atomic data type for a column.

    Parameters
    ----------
    column
        A column of a Pandas DataFrame to be checked.
    """
    return infer_dtype(column[column.apply(_check_valid_values, 0)])
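# Sketch for _infer_atomic_data_type above; _check_valid_values is not shown
# in the snippet, so a permissive stand-in (any non-null value is valid) is
# assumed here purely for illustration:
import pandas as pd
from pandas.api.types import infer_dtype

def _check_valid_values(value):
    return not pd.isna(value)

s = pd.Series(['a', None, 'b'])
print(_infer_atomic_data_type(s))  # 'string', inferred on the valid subset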
def __init__(self, data, name=None, dtype=None, index=None, copy=False,
             fastpath=False, categorical=False):
    """
    A Series with extra information, e.g. categorical.

    Parameters
    ----------
    categorical : bool
        Set the categorical label for the attribute. If categorical, this
        attribute takes on a limited and fixed number of possible values.
        Examples: blood type, gender.
    """
    Series.__init__(self, data, name=name, dtype=dtype, index=index,
                    copy=copy, fastpath=fastpath)
    # bins can be int (size of histogram bins), str (as algorithm name)
    self._bins = ds4ml.params['attribute.bins']
    self._min = None
    self._max = None
    self._step = None

    # probability distribution (pr)
    self.bins = None
    self.prs = None

    from pandas.api.types import infer_dtype
    # atype: data type used to handle different kinds of attributes in data
    # synthesis; supported: integer, float, string, datetime.
    self.atype = infer_dtype(self, skipna=True)
    if self.atype == 'integer':
        pass
    elif self.atype == 'floating' or self.atype == 'mixed-integer-float':
        self.atype = 'float'
    elif self.atype in ['string', 'mixed-integer', 'mixed']:
        self.atype = 'string'
        if all(map(utils.is_datetime, self._values)):
            self.atype = 'datetime'

    # fill the missing values with the most frequent value
    self.fillna(self.mode()[0], inplace=True)

    # special handling for datetime attribute
    if self.atype == 'datetime':
        self.update(self.map(self._to_seconds).map(self._date_formatter))
    if self.atype == 'float':
        self._decimals = self.decimals()

    # how to decide whether the attribute is categorical
    self.categorical = categorical or (
        self.atype == 'string' and not self.is_unique)
    self._set_domain()
    self._set_distribution()
def _get_stats(self):
    counts = self.df.count()
    counts.name = 'counts'
    uniques = self._get_uniques()
    missing = self._get_missing(counts)
    stats = pd.concat([counts, uniques, missing], axis=1, sort=False)

    # setting types
    stats['types'] = ''
    for idx in stats.index:
        stats.loc[idx, 'types'] = infer_dtype(self.df[idx])
    return stats.transpose()[self.df.columns]
def inferdtypes(df):
    """
    Given a device dataframe, return a list of (device, dtype) tuples with
    the dtype inferred for each device's values.
    """
    dev_lst = df[DEVICE].unique()
    res_lst = []
    for dev in dev_lst:
        vals = df[df[DEVICE] == dev][VAL]
        dtype = infer_dtype(vals)
        res_lst.append((dev, dtype))
    return res_lst
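# Hedged example for inferdtypes above; DEVICE and VAL are module-level
# column-name constants in the original, assumed here to be plain strings:
import pandas as pd
from pandas.api.types import infer_dtype

DEVICE, VAL = 'device', 'val'
df = pd.DataFrame({DEVICE: ['d1', 'd1', 'd2', 'd2'],
                   VAL: [True, False, 0.5, 0.7]})
print(inferdtypes(df))  # [('d1', 'boolean'), ('d2', 'floating')]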
def _convert_types(self, a):
    """
    Converts object arrays of strings to numpy string arrays
    """
    # No conversion for scalar type
    if a.dtype != 'object':
        return a, None

    # We can't infer the type of an empty array, so just
    # assume strings
    if len(a) == 0:
        return a.astype('U1'), None

    # Compute a mask of missing values. Replace NaNs and Nones with
    # empty strings so that type inference has a chance.
    mask = pd.isnull(a)
    if mask.sum() > 0:
        a = a.copy()
        np.putmask(a, mask, '')
    else:
        mask = None

    if infer_dtype(a, skipna=False) == 'mixed':
        # assume it's a string, otherwise raise an error
        try:
            a = np.array([s.encode('ascii') for s in a])
            a = a.astype('O')
        except Exception:
            raise ValueError(
                "Column of type 'mixed' cannot be converted to string")

    type_ = infer_dtype(a, skipna=False)
    if type_ in ['unicode', 'string']:
        max_len = max_len_string_array(a)
        return a.astype('U{:d}'.format(max_len)), mask
    else:
        raise ValueError('Cannot store arrays with {} dtype'.format(type_))
def docify(self, df):
    """
    Convert a Pandas DataFrame to SON.

    Parameters
    ----------
    df: DataFrame
        The Pandas DataFrame to encode
    """
    dtypes = {}
    masks = {}
    lengths = {}
    columns = []
    data = Binary(b'')
    start = 0

    arrays = []
    for c in df:
        try:
            columns.append(str(c))
            arr, mask = self._convert_types(df[c].values)
            dtypes[str(c)] = arr.dtype.str
            if mask is not None:
                masks[str(c)] = Binary(compress(mask.tostring()))
            arrays.append(arr.tostring())
        except Exception as e:
            typ = infer_dtype(df[c], skipna=False)
            msg = "Column '{}' type is {}".format(str(c), typ)
            logging.warning(msg)
            raise e

    arrays = compress_array(arrays)
    for index, c in enumerate(df):
        d = Binary(arrays[index])
        lengths[str(c)] = (start, start + len(d) - 1)
        start += len(d)
        data += d

    doc = SON({DATA: data, METADATA: {}})
    doc[METADATA] = {
        COLUMNS: columns,
        MASK: masks,
        LENGTHS: lengths,
        DTYPE: dtypes
    }
    return doc
def docify(self, df):
    """
    Convert a Pandas DataFrame to SON.

    Parameters
    ----------
    df: DataFrame
        The Pandas DataFrame to encode
    """
    dtypes = {}
    masks = {}
    lengths = {}
    columns = []
    data = Binary(b'')
    start = 0

    arrays = []
    for c in df:
        try:
            columns.append(str(c))
            arr, mask = self._convert_types(df[c].values)
            dtypes[str(c)] = arr.dtype.str
            if mask is not None:
                masks[str(c)] = Binary(compress(mask.tostring()))
            arrays.append(arr.tostring())
        except Exception as e:
            typ = infer_dtype(df[c], skipna=False)
            msg = "Column '{}' type is {}".format(str(c), typ)
            logging.info(msg)
            raise e

    arrays = compress_array(arrays)
    for index, c in enumerate(df):
        d = Binary(arrays[index])
        lengths[str(c)] = (start, start + len(d) - 1)
        start += len(d)
        data += d

    doc = SON({DATA: data, METADATA: {}})
    doc[METADATA] = {
        COLUMNS: columns,
        MASK: masks,
        LENGTHS: lengths,
        DTYPE: dtypes
    }
    return doc
def _fill_missing_values_column(column: pd.Series) -> None:
    """
    Function to standardize the missing values for a column, depending on
    its detected data type:

    1. For dates, change to pd.NaT
    2. For any other data type, change to np.nan

    Parameters
    ----------
    column
        Column of a Pandas DataFrame to be standardized.
    """
    if infer_dtype(column) == "date":
        column[column.apply(_check_null_values, 0)] = pd.NaT
        column = pd.to_datetime(column)
    else:
        column[column.apply(_check_null_values, 0)] = np.nan
def generate_tuples():
    filename = '/home/cot/hw5-overbyc-ilya-shpitser/seer_app/patient_table_data5000.csv'
    self = data_import(filename)
    x_tuple = ()
    y_tuple = ()
    x_tuples = []
    y_tuples = []
    for col in self:
        typ = ptypes.infer_dtype(self[col])
        if typ == "integer":
            y_tuple = (col, col)
            y_tuples.append(y_tuple)  # continuous variables
        elif typ == "string":
            x_tuple = (col, col)
            x_tuples.append(x_tuple)  # categorical variables
    tuples = [x_tuples, y_tuples]
    return tuples
def generate_tuples():
    self = data_import('1000.csv')
    x_tuple = ()
    y_tuple = ()
    x_tuples = []
    y_tuples = []
    for col in self:
        if col not in ['subject_id', 'icd9_code']:
            typ = ptypes.infer_dtype(self[col])
            if typ == "integer":
                x_tuple = (col, col)
                x_tuples.append(x_tuple)  # continuous variables
            elif typ == "string":
                x_tuple = (col, col)
                x_tuples.append(x_tuple)  # categorical variables
    tuples = [x_tuples]
    return tuples
def fillna(self, value=None, method=None, limit=None):
    from pandas.util._validators import validate_fillna_kwargs
    value, method = validate_fillna_kwargs(value, method)
    mask = self.isna()

    from pandas.api.types import is_array_like, infer_dtype
    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f"expected {len(self)}")
        value = value[mask]
    else:
        # pandas infer_dtype can't work on a scalar value, so wrap it in a list
        value = [value]

    if mask.any():
        if method is not None:
            from pandas.core.missing import pad_1d
            from pandas.core.missing import backfill_1d
            func = pad_1d if method == "pad" else backfill_1d
            new_values = func(self.astype(object), limit=limit, mask=mask)
            new_values = self._from_sequence(new_values, dtype=self.dtype)
        else:
            # translate value
            if not isinstance(getattr(value, "dtype", value),
                              (GeoDtype, type(None))):
                inferred_type = infer_dtype(value, skipna=True)
                if inferred_type == "string":
                    value = arctern.ST_GeomFromText(value)
                elif inferred_type == "bytes":
                    pass
                else:
                    raise ValueError(
                        "can only fillna with wkt formed string or "
                        "wkb formed bytes")
            # fill with value
            new_values = self.copy()
            new_values[mask] = value
    else:
        new_values = self.copy()
    return new_values
def infer_file_cols_dtypes(filepath, ftype='csv', skipna=True):
    """
    Returns:
        (dict): inferred dtypes of columns, {COLNAME: DTYPE}
    """
    fpath = Path(filepath)
    if not fpath.exists():
        msg = f'You must provide an existing, valid path; {fpath} does not exist.'
        ERROR['error_invalid_path'](msg)
    err = lambda x, t: ERROR['error_invalid_path'](
        f'{x} is not a {t.upper()} file.')
    if ftype == 'csv':
        if not fpath.suffix.lower() == '.csv':
            err(fpath, 'csv')
        temp_df = pd.read_csv(fpath, nrows=5)
    types = {}
    for c in temp_df.columns:
        types[c] = infer_dtype(temp_df[c], skipna=skipna)
    return types
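# Sketch of calling infer_file_cols_dtypes above on a small temporary CSV;
# only the happy path is shown, so the ERROR handler mapping from the
# original module is never triggered:
import pandas as pd

pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']}).to_csv('sample.csv', index=False)
print(infer_file_cols_dtypes('sample.csv'))  # {'id': 'integer', 'name': 'string'}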
def _infer_types(df):
    """
    Dataframe in raw representation where the columns correspond to devices.
    """
    dev_cat = []
    dev_bool = []
    dev_num = []
    dev_lst = df.columns[1:]
    for dev in dev_lst:
        inf = infer_dtype(df[dev], skipna=True)
        if inf == 'string' or inf == 'object':
            dev_cat.append(dev)
        elif inf == 'boolean':
            dev_bool.append(dev)
        elif inf == 'floating':
            dev_num.append(dev)
        else:
            raise ValueError(
                'could not infer correct dtype for device {}'.format(dev))
    return {'categorical': dev_cat, 'boolean': dev_bool, 'numerical': dev_num}
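# Minimal sketch for _infer_types above; the first column is skipped by
# df.columns[1:], so a time column is placed first to mirror the expected
# layout:
import pandas as pd
from pandas.api.types import infer_dtype

df = pd.DataFrame({'time': pd.to_datetime(['2021-01-01', '2021-01-02']),
                   'door': [True, False],
                   'temp': [20.5, 21.0]})
print(_infer_types(df))
# {'categorical': [], 'boolean': ['door'], 'numerical': ['temp']}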
def _infer_object_dtype(arr):
    # TODO: accelerate with Cython/C
    BOOLEAN, STRING = 0, 1
    state = BOOLEAN

    avalues = arr.values if isinstance(arr, pd.Series) else arr
    nulls = pd.isnull(avalues)
    if nulls.any():
        for i in compat.range(len(avalues)):
            if state == BOOLEAN:
                if not nulls[i] and not pdcom.is_bool(avalues[i]):
                    state = STRING
            elif state == STRING:
                break
        if state == BOOLEAN:
            return 'boolean'
        elif state == STRING:
            return 'string'
    else:
        return infer_dtype(avalues)
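# Quick check of the behavior _infer_object_dtype above works around: on an
# object array of booleans with a null hole, infer_dtype(..., skipna=False)
# reports 'mixed', while the manual BOOLEAN/STRING scan still classifies the
# column as boolean. (`compat.range` and `pdcom.is_bool` are internal helpers
# of the original module.)
import numpy as np
from pandas.api.types import infer_dtype

vals = np.array([True, None, False], dtype=object)
print(infer_dtype(vals, skipna=False))  # 'mixed'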
def _ensure_supported_dtypes(array):
    # We only support these types for now, as we need to read them in Java
    if array.dtype.kind == 'i':
        array = array.astype('<i8')
    elif array.dtype.kind == 'f':
        array = array.astype('<f8')
    elif array.dtype.kind in ('O', 'U', 'S'):
        if array.dtype.kind == 'O' and \
                infer_dtype(array) not in ['unicode', 'string', 'bytes']:
            # `string` in python2 and `bytes` in python3
            raise UnhandledDtypeException(
                "Casting object column to string failed")
        try:
            array = array.astype(np.unicode_)
        except (UnicodeDecodeError, SystemError):
            # `UnicodeDecodeError` in python2 and `SystemError` in python3
            array = np.array([s.decode('utf-8') for s in array])
        except Exception:
            raise UnhandledDtypeException(
                "Only unicode and utf8 strings are supported.")
    else:
        raise UnhandledDtypeException(
            "Unsupported dtype '%s' - only int64, float64 and U are supported"
            % array.dtype)
    # Everything is little endian in tickstore
    if array.dtype.byteorder != '<':
        array = array.astype(array.dtype.newbyteorder('<'))
    return array
def extract_dataframe_dtypes(df: pd.DataFrame) -> List[GenericDataType]:
    """Serialize pandas/numpy dtypes to generic types"""

    # omitting string types as those will be the default type
    inferred_type_map: Dict[str, GenericDataType] = {
        "floating": GenericDataType.NUMERIC,
        "integer": GenericDataType.NUMERIC,
        "mixed-integer-float": GenericDataType.NUMERIC,
        "decimal": GenericDataType.NUMERIC,
        "boolean": GenericDataType.BOOLEAN,
        "datetime64": GenericDataType.TEMPORAL,
        "datetime": GenericDataType.TEMPORAL,
        "date": GenericDataType.TEMPORAL,
    }

    generic_types: List[GenericDataType] = []
    for column in df.columns:
        series = df[column]
        inferred_type = infer_dtype(series)
        generic_type = inferred_type_map.get(inferred_type,
                                             GenericDataType.STRING)
        generic_types.append(generic_type)

    return generic_types
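# Usage sketch for extract_dataframe_dtypes above, with a stand-in for the
# GenericDataType enum (member names taken from the function body, values
# assumed); in a real run the enum, List/Dict from typing, pandas, and
# infer_dtype must all be defined before the function, since its annotations
# reference them:
import enum
import pandas as pd

class GenericDataType(enum.Enum):
    NUMERIC = 'numeric'
    STRING = 'string'
    TEMPORAL = 'temporal'
    BOOLEAN = 'boolean'

df = pd.DataFrame({'n': [1, 2],
                   's': ['a', 'b'],
                   't': pd.to_datetime(['2021-01-01', '2021-01-02'])})
print(extract_dataframe_dtypes(df))
# [GenericDataType.NUMERIC, GenericDataType.STRING, GenericDataType.TEMPORAL]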
def infer_dtype_bydata(data):
    d_type = DataType.UNKNOWN
    if is_scalar(data):
        d_type = infer_dtype_by_scaladata(data)
        return d_type

    if is_list_like(data) or is_array_like(data):
        failed = False
        try:
            type_str = infer_dtype(data)
        except TypeError:
            failed = True
        if not failed:
            d_type = dtype_str_map.get(type_str, DataType.UNKNOWN)
            if is_numeric_datatype(d_type):
                d_type = DataType.FLOAT_VECTOR
            else:
                d_type = DataType.UNKNOWN
            return d_type

    if d_type == DataType.UNKNOWN:
        try:
            elem = data[0]
        except Exception:
            elem = None
        if elem is not None and is_scalar(elem):
            d_type = infer_dtype_by_scaladata(elem)

    if d_type == DataType.UNKNOWN:
        _dtype = getattr(data, "dtype", None)
        if _dtype is not None:
            d_type = map_numpy_dtype_to_datatype(_dtype)

    return d_type