def test_convert_pandas_type_to_json_field_datetime(
    self, dt_args, extra_exp, wrapper
):
    """Check the JSON field produced for values built with ``pd.to_datetime``.

    ``dt_args`` is forwarded to ``pd.to_datetime``; when ``wrapper`` is
    ``pd.Series`` the converted values are wrapped in a Series named
    ``"values"``. The field must equal the base datetime description
    extended with ``extra_exp``.
    """
    converted = pd.to_datetime([1.0, 2.0, 3.0], **dt_args)
    subject = (
        pd.Series(converted, name="values") if wrapper is pd.Series else converted
    )
    field = convert_pandas_type_to_json_field(subject)

    wanted = {"name": "values", "type": "datetime"}
    wanted.update(extra_exp)
    assert field == wanted
Exemplo n.º 2
0
def infer_schema_from_df(
    df: pd.DataFrame,
    features,
    entities,
    timestamp_key: str = None,
    entity_columns=None,
    options: InferOptions = InferOptions.Null,
):
    """Infer feature set schema from dataframe.

    Walks the pyarrow schema derived from ``df`` and records a value type for
    each column, updating ``features`` and ``entities`` in place.

    Args:
        df: dataframe whose columns and index are inspected.
        features: mapping of feature name to feature entry. Existing entries
            get their ``value_type`` attribute overwritten; unknown columns
            are added as ``{"name": ..., "value_type": ...}`` dicts. Only
            touched when ``options`` includes ``InferOptions.Features``.
        entities: mapping of entity name to entity entry, updated the same
            way as ``features``.
        timestamp_key: name of the timestamp column; a column with this name
            is never added to ``features``.
        entity_columns: column names to treat as entities in addition to the
            names already present in ``entities``.
        options: ``InferOptions`` flags selecting what is inferred.

    Returns:
        ``timestamp_key`` exactly as it was passed in.
    """
    # NOTE(review): datetime columns are collected here, but nothing in this
    # function reads the list; the caller-supplied timestamp_key is returned.
    timestamp_fields = []
    # Snapshot of the entity names known on entry; entities added during this
    # call are stored as plain dicts and are not part of this snapshot.
    current_entities = list(entities.keys())
    entity_columns = entity_columns or []

    def upsert_entity(name, value_type):
        """Update a pre-existing entity's type or insert a new entity dict."""
        if name in current_entities:
            entities[name].value_type = value_type
        else:
            entities[name] = {"name": name, "value_type": value_type}

    schema = pyarrow.Schema.from_pandas(df)
    index_names = df.index.names
    # Value type of every index level that appears as a schema column, keyed
    # by level name. Keeping one entry per level (instead of a single shared
    # variable) lets each level of a multi-index get its own type.
    index_types = {}
    for column, arrow_type in zip(schema.names, schema.types):
        value_type = pa_type_to_value_type(arrow_type)
        if column in index_names:
            index_types[column] = value_type
            continue
        is_entity = column in entity_columns or column in current_entities
        if is_entity:
            upsert_entity(column, value_type)
        elif (
            InferOptions.get_common_options(options, InferOptions.Features)
            and column != timestamp_key
        ):
            if column in features.keys():
                features[column].value_type = value_type
            else:
                features[column] = {"name": column, "value_type": value_type}
        if value_type == "datetime" and not is_entity:
            timestamp_fields.append(column)

    def resolve_index_type(name, index):
        """Return the schema-derived type for *name*, else infer it from *index*."""
        # Workaround to infer a boolean index correctly, and not as 'str':
        # the type taken from the pyarrow schema wins when it is available.
        resolved = index_types.get(name)
        if not resolved:
            field = convert_pandas_type_to_json_field(index)
            resolved = pd_schema_to_value_type(field["type"])
        return resolved

    if InferOptions.get_common_options(options, InferOptions.Index):
        # infer types of index fields
        if df.index.name:
            upsert_entity(df.index.name, resolve_index_type(df.index.name, df.index))
        elif df.index.nlevels > 1:
            for level, name in zip(df.index.levels, df.index.names):
                level_type = resolve_index_type(name, level)
                upsert_entity(name, level_type)
                if level_type == "datetime":
                    timestamp_fields.append(name)

    return timestamp_key
Exemplo n.º 3
0
    def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
        """Check the JSON field produced for categorical data named ``"cats"``.

        ``kind`` selects the container: a ``pd.Categorical`` is wrapped in a
        Series, a ``pd.CategoricalIndex`` is used directly. The field must
        list the categories as an enum constraint and carry ``ordered``.
        """
        labels = ["a", "b", "c"]
        if kind is pd.Categorical:
            categorical = kind(labels, ordered=ordered)
            subject = pd.Series(categorical, name="cats")
        elif kind is pd.CategoricalIndex:
            subject = kind(labels, ordered=ordered, name="cats")

        field = convert_pandas_type_to_json_field(subject)

        enum_constraint = {"enum": labels}
        wanted = {
            "name": "cats",
            "type": "any",
            "constraints": enum_constraint,
            "ordered": ordered,
        }
        assert field == wanted
Exemplo n.º 4
0
 def test_convert_pandas_type_to_json_period_range(self):
     """Check that an unnamed period range yields a datetime field with its freq."""
     periods = pd.period_range("2016", freq="A-DEC", periods=4)
     field = convert_pandas_type_to_json_field(periods)
     assert field == {"name": "values", "type": "datetime", "freq": "A-DEC"}
Exemplo n.º 5
0
 def test_convert_pandas_type_to_json_field_float(self, index_or_series):
     """Check that float data in the given container maps to a ``number`` field."""
     container = index_or_series([1.0, 2.0, 3.0], name="name")
     field = convert_pandas_type_to_json_field(container)
     assert field == {"name": "name", "type": "number"}
Exemplo n.º 6
0
 def test_convert_pandas_type_to_json_field_int(self, index_or_series):
     """Check that integer data in the given container maps to an ``integer`` field."""
     container = index_or_series([1, 2, 3], name="name")
     field = convert_pandas_type_to_json_field(container)
     assert field == {"name": "name", "type": "integer"}
Exemplo n.º 7
0
def _get_column_type(column):
    """Return the value type for *column*.

    The column is described with ``convert_pandas_type_to_json_field`` and the
    ``"type"`` entry of that description is translated by
    ``pd_schema_to_value_type``.
    """
    json_type = convert_pandas_type_to_json_field(column)["type"]
    return pd_schema_to_value_type(json_type)