def test_convert_pandas_type_to_json_field_datetime( self, dt_args, extra_exp, wrapper): data = [1.0, 2.0, 3.0] data = pd.to_datetime(data, **dt_args) if wrapper is pd.Series: data = pd.Series(data, name="values") result = convert_pandas_type_to_json_field(data) expected = {"name": "values", "type": "datetime"} expected.update(extra_exp) assert result == expected
def infer_schema_from_df(
    df: pd.DataFrame,
    features,
    entities,
    timestamp_key: str = None,
    entity_columns=None,
    options: InferOptions = InferOptions.Null,
):
    """infer feature set schema from dataframe

    Mutates ``features`` and ``entities`` in place: each dataframe column is
    upserted either as an entity (when listed in ``entity_columns`` or already
    present in ``entities``) or, when the Features option is enabled, as a
    feature.  Index columns are skipped in the main loop and handled under the
    Index option below.

    Returns the ``timestamp_key`` argument unchanged.
    NOTE(review): ``timestamp_fields`` is collected but never returned or
    stored anywhere — confirm ``return timestamp_key`` is intentional and not
    a dropped ``return timestamp_fields``.
    """
    timestamp_fields = []  # non-entity columns whose inferred type is "datetime"
    current_entities = list(entities.keys())
    entity_columns = entity_columns or []

    def upsert_entity(name, value_type):
        # refresh the type of an already-known entity, or register a new one
        if name in current_entities:
            entities[name].value_type = value_type
        else:
            entities[name] = {"name": name, "value_type": value_type}

    schema = pyarrow.Schema.from_pandas(df)
    index_type = None
    for i in range(len(schema)):
        column = schema.names[i]
        value_type = pa_type_to_value_type(schema.types[i])
        if column in df.index.names:
            # index columns are processed in the Index section below;
            # NOTE(review): only the LAST index column's type survives here —
            # possibly wrong for a MultiIndex whose levels have mixed types
            index_type = value_type
            continue
        is_entity = column in entity_columns or column in current_entities
        if is_entity:
            upsert_entity(column, value_type)
        elif (InferOptions.get_common_options(options, InferOptions.Features)
              and column != timestamp_key):
            # upsert as a feature; the timestamp_key column is never a feature
            if column in features.keys():
                features[column].value_type = value_type
            else:
                features[column] = {"name": column, "value_type": value_type}
        if value_type == "datetime" and not is_entity:
            timestamp_fields.append(column)

    if InferOptions.get_common_options(options, InferOptions.Index):
        # infer types of index fields
        if df.index.name:
            if not index_type:
                field = convert_pandas_type_to_json_field(df.index)
                index_type = pd_schema_to_value_type(field["type"])
            # Workaround to infer a boolean index correctly, and not as 'str'.
            upsert_entity(df.index.name, index_type)
        elif df.index.nlevels > 1:
            for level, name in zip(df.index.levels, df.index.names):
                # NOTE(review): every level is upserted with the single
                # ``index_type`` computed above (``level`` itself is never
                # inspected) — confirm this is intended for MultiIndex frames
                # with heterogeneous level types
                upsert_entity(name, index_type)
                if index_type == "datetime":
                    timestamp_fields.append(name)

    return timestamp_key
def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered): data = ["a", "b", "c"] if kind is pd.Categorical: arr = pd.Series(kind(data, ordered=ordered), name="cats") elif kind is pd.CategoricalIndex: arr = kind(data, ordered=ordered, name="cats") result = convert_pandas_type_to_json_field(arr) expected = { "name": "cats", "type": "any", "constraints": {"enum": data}, "ordered": ordered, } assert result == expected
def test_convert_pandas_type_to_json_period_range(self):
    """A PeriodIndex maps to a 'datetime' field that records its freq string."""
    index = pd.period_range("2016", freq="A-DEC", periods=4)
    field = convert_pandas_type_to_json_field(index)
    assert field == {"name": "values", "type": "datetime", "freq": "A-DEC"}
def test_convert_pandas_type_to_json_field_float(self, index_or_series): kind = index_or_series data = [1.0, 2.0, 3.0] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "number"} assert result == expected
def test_convert_pandas_type_to_json_field_int(self, index_or_series): kind = index_or_series data = [1, 2, 3] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "integer"} assert result == expected
def _get_column_type(column):
    """Return the value-type name for a pandas column, via its table-schema field."""
    return pd_schema_to_value_type(convert_pandas_type_to_json_field(column)["type"])