Example #1
def _init_primitive_options(primitive_options, es):
    # Flatten all tuple keys, convert value lists into sets, check for
    # conflicting keys
    flattened_options = {}
    for primitive_key, options in primitive_options.items():
        if isinstance(options, list):
            primitive = primitives.get_aggregation_primitives().get(primitive_key) or \
                primitives.get_transform_primitives().get(primitive_key)
            assert len(primitive.input_types[0]) == len(options) if \
                isinstance(primitive.input_types[0], list) else \
                len(primitive.input_types) == len(options), \
                "Number of options does not match number of inputs for primitive %s" \
                % (primitive_key)
            options = [
                _init_option_dict(primitive_key, option, es)
                for option in options
            ]
        else:
            options = [_init_option_dict(primitive_key, options, es)]
        if not isinstance(primitive_key, tuple):
            primitive_key = (primitive_key, )
        for each_primitive in primitive_key:
            # if primitive is specified more than once, raise error
            if each_primitive in flattened_options:
                raise KeyError('Multiple options found for primitive %s' %
                               (each_primitive))
            flattened_options[each_primitive] = options
    return flattened_options
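The helper above leans on the primitive registries being name-keyed dictionaries. A minimal standalone sketch of inspecting them through the public API ('absolute' is one sample registered name; the exact contents depend on the installed featuretools version):

from featuretools import primitives

trans = primitives.get_transform_primitives()    # dict: name -> primitive class
aggs = primitives.get_aggregation_primitives()   # dict: name -> primitive class
print(sorted(trans)[:5])
print(trans["absolute"].input_types)  # input_types drives the length check above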
Example #2
def test_init_and_name(es):
    log = es['log']
    features = [Feature(v) for v in log.variables] +\
        [GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for transform_prim in get_transform_primitives():
        if issubclass(transform_prim, Compare):
            continue
        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = [
                g for s in input_types for g in match(s, features)
            ]
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" %
                            transform_prim.name)
        for s in matching_inputs:
            instance = transform_prim(*s)

            # try to get name and calculate
            instance.get_name()
            instance.head()
Example #3
def test_init_and_name(es):
    log = es['log']
    rating = ft.Feature(es["products"]["rating"], es["log"])
    features = [ft.Feature(v) for v in log.variables] +\
        [ft.Feature(rating, primitive=GreaterThanScalar(2.5))]
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    for transform_prim in get_transform_primitives().values():

        # skip automated testing for a few special cases
        if transform_prim in [NotEqual, Equal]:
            continue

        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = match(input_types[0], features)
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" %
                            transform_prim.name)
        for s in matching_inputs:
            instance = ft.Feature(s, primitive=transform_prim)

            # try to get name and calculate
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es).head(5)
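A minimal standalone sketch of the ft.Feature(..., primitive=...) pattern this test exercises; the 'value' column and the Absolute primitive are illustrative assumptions about the fixture EntitySet es:

import featuretools as ft
from featuretools.primitives import Absolute

# wrap an identity feature in a transform primitive, then compute it
f = ft.Feature(es['log']['value'], primitive=Absolute)
fm = ft.calculate_feature_matrix([f], entityset=es)
print(f.get_name())
print(fm.head())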
Example #4
def test_init_and_name(es):
    log = es['log']
    features = [Feature(v) for v in log.variables] +\
        [GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for transform_prim in get_transform_primitives().values():
        if issubclass(transform_prim, Compare):
            continue
        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = [g for s in input_types
                               for g in match(s, features)]
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception(
                "Transform Primitive %s not tested" % transform_prim.name)
        for s in matching_inputs:
            instance = transform_prim(*s)

            # try to get name and calculate
            instance.get_name()
            instance.head()
Example #5
def test_init_and_name(es):
    log = es['log']
    rating = ft.Feature(ft.IdentityFeature(es["products"].ww["rating"]), "log")
    log_features = [ft.Feature(es['log'].ww[col]) for col in log.columns] +\
        [ft.Feature(rating, primitive=GreaterThanScalar(2.5)),
         ft.Feature(rating, primitive=GreaterThanScalar(3.5))]
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    customers_features = [
        ft.Feature(es["customers"].ww[col]) for col in es["customers"].columns
    ]

    # check all transform primitives have a name
    for attribute_string in dir(ft.primitives):
        attr = getattr(ft.primitives, attribute_string)
        if isclass(attr):
            if issubclass(attr,
                          TransformPrimitive) and attr != TransformPrimitive:
                assert getattr(attr, "name") is not None

    trans_primitives = get_transform_primitives().values()
    # If Dask EntitySet use only Dask compatible primitives
    if es.dataframe_type == Library.DASK.value:
        trans_primitives = [
            prim for prim in trans_primitives
            if Library.DASK in prim.compatibility
        ]
    if es.dataframe_type == Library.KOALAS.value:
        trans_primitives = [
            prim for prim in trans_primitives
            if Library.KOALAS in prim.compatibility
        ]

    for transform_prim in trans_primitives:
        # skip automated testing for a few special cases
        features_to_use = log_features
        if transform_prim in [NotEqual, Equal]:
            continue
        if transform_prim in [Age]:
            features_to_use = customers_features

        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = match(input_types[0], features_to_use)
        else:
            matching_inputs = match(input_types, features_to_use)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" %
                            transform_prim.name)
        for prim in matching_inputs:
            instance = ft.Feature(prim, primitive=transform_prim)

            # try to get name and calculate
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es)
Example #6
def test_summarize_primitives():
    df = summarize_primitives()
    trans_prims = get_transform_primitives()
    agg_prims = get_aggregation_primitives()
    tot_trans = len(trans_prims)
    tot_agg = len(agg_prims)
    tot_prims = tot_trans + tot_agg

    assert df["Count"].iloc[0] == tot_prims
    assert df["Count"].iloc[1] == tot_agg
    assert df["Count"].iloc[2] == tot_trans
Example #7
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row['description']

    types = df['type'].values
    assert 'aggregation' in types
    assert 'transform' in types
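The same registry contents are exposed through the public list_primitives API; a short sketch using only documented calls:

import featuretools as ft

prims = ft.list_primitives()  # DataFrame with 'name', 'type', 'description', ...
trans_names = prims.loc[prims['type'] == 'transform', 'name'].tolist()
print(len(trans_names), trans_names[:5])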
Example #8
def check_trans_primitive(primitive):
    trans_prim_dict = primitives.get_transform_primitives()

    if isinstance(primitive, str):
        if primitive.lower() not in trans_prim_dict:
            raise ValueError("Unknown transform primitive {}. ".format(primitive),
                             "Call ft.primitives.list_primitives() to get",
                             " a list of available primitives")
        primitive = trans_prim_dict[primitive.lower()]
    primitive = handle_primitive(primitive)
    if not isinstance(primitive, TransformPrimitive):
        raise ValueError("Primitive {} in trans_primitives or "
                         "groupby_trans_primitives is not a transform "
                         "primitive".format(type(primitive)))
    return primitive
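A sketch of the string-to-class resolution this helper performs, using 'absolute' as a sample registered name:

from featuretools import primitives

trans_prim_dict = primitives.get_transform_primitives()
prim_cls = trans_prim_dict['absolute']  # keys are lower-cased primitive names
print(prim_cls, prim_cls.name)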
Example #9
def _init_primitive_options(primitive_options, es):
    # Flatten all tuple keys, convert value lists into sets, check for
    # conflicting keys
    flattened_options = {}
    for primitive_keys, options in primitive_options.items():
        if not isinstance(primitive_keys, tuple):
            primitive_keys = (primitive_keys, )
        if isinstance(options, list):
            for primitive_key in primitive_keys:
                if isinstance(primitive_key, str):
                    primitive = primitives.get_aggregation_primitives().get(
                        primitive_key) or primitives.get_transform_primitives(
                        ).get(primitive_key)
                    if not primitive:
                        msg = "Unknown primitive with name '{}'".format(
                            primitive_key)
                        raise ValueError(msg)
                else:
                    primitive = primitive_key
                assert (
                    len(primitive.input_types[0]) == len(options)
                    if isinstance(primitive.input_types[0], list) else len(
                        primitive.input_types) == len(options)
                ), ("Number of options does not match number of inputs for primitive %s"
                    % (primitive_key))
            options = [
                _init_option_dict(primitive_keys, option, es)
                for option in options
            ]
        else:
            options = [_init_option_dict(primitive_keys, options, es)]

        for primitive in primitive_keys:
            if isinstance(primitive, type):
                primitive = primitive.name

            # if primitive is specified more than once, raise error
            if primitive in flattened_options:
                raise KeyError("Multiple options found for primitive %s" %
                               (primitive))

            flattened_options[primitive] = options
    return flattened_options
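A hedged sketch of the input shape this version flattens: a tuple key shares one options dict across several primitives, while a list value supplies one options dict per primitive input. The primitive names and option keys below are illustrative, following the newer featuretools primitive_options naming:

primitive_options = {
    ("mean", "max"): {"ignore_dataframes": ["cohorts"]},      # shared options
    "trend": [{"ignore_columns": {"log": ["value_2"]}}, {}],  # per-input options
}
flattened = _init_primitive_options(primitive_options, es)
# flattened maps each name ('mean', 'max', 'trend') to its list of option dicts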
Example #10
def test_init_and_name(es):
    log = es['log']
    rating = ft.Feature(es["products"]["rating"], es["log"])
    log_features = [ft.Feature(v) for v in log.variables] +\
        [ft.Feature(rating, primitive=GreaterThanScalar(2.5))]
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    customers_features = [ft.Feature(v) for v in es["customers"].variables]
    trans_primitives = get_transform_primitives().values()
    # If Dask EntitySet use only Dask compatible primitives
    if isinstance(es['log'].df, dd.DataFrame):
        trans_primitives = [
            prim for prim in trans_primitives
            if Library.DASK in prim.compatibility
        ]
    if ks and isinstance(es['log'].df, ks.DataFrame):
        trans_primitives = [
            prim for prim in trans_primitives
            if Library.KOALAS in prim.compatibility
        ]
    for transform_prim in trans_primitives:
        # skip automated testing for a few special cases
        features_to_use = log_features
        if transform_prim in [NotEqual, Equal]:
            continue
        if transform_prim in [Age]:
            features_to_use = customers_features

        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = match(input_types[0], features_to_use)
        else:
            matching_inputs = match(input_types, features_to_use)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" %
                            transform_prim.name)
        for prim in matching_inputs:
            instance = ft.Feature(prim, primitive=transform_prim)

            # try to get name and calculate
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es)
Example #11
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df["name"].values
        row = df.loc[df["name"] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row["description"]
        assert row["dask_compatible"] == (Library.DASK in primitive.compatibility)
        assert row["valid_inputs"] == ", ".join(
            _get_unique_input_types(primitive.input_types)
        )
        assert row["return_type"] == getattr(primitive.return_type, "__name__", None)

    types = df["type"].values
    assert "aggregation" in types
    assert "transform" in types
Example #12
def test_list_primitives_order():
    df = list_primitives()
    all_primitives = get_transform_primitives()
    all_primitives.update(get_aggregation_primitives())

    for name, primitive in all_primitives.items():
        assert name in df['name'].values
        row = df.loc[df['name'] == name].iloc[0]
        actual_desc = _get_descriptions([primitive])[0]
        if actual_desc:
            assert actual_desc == row['description']
        assert row['dask_compatible'] == (Library.DASK
                                          in primitive.compatibility)
        assert row['valid_inputs'] == ', '.join(
            _get_names_valid_inputs(primitive.input_types))
        assert row['return_type'] == getattr(primitive.return_type, '__name__',
                                             None)

    types = df['type'].values
    assert 'aggregation' in types
    assert 'transform' in types
Example #13
def test_init_and_name(es):
    from featuretools import calculate_feature_matrix
    log = es['log']
    features = [Feature(v) for v in log.variables] +\
        [GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for transform_prim in get_transform_primitives().values():
        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = match(input_types[0], features)
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" %
                            transform_prim.name)
        for s in matching_inputs:
            instance = transform_prim(*s)

            # try to get name and calculate
            instance.get_name()
            calculate_feature_matrix([instance], entityset=es).head(5)
Example #14
def test_init_and_name(es):
    from featuretools import calculate_feature_matrix
    log = es['log']
    features = [Feature(v) for v in log.variables] +\
        [GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for transform_prim in get_transform_primitives().values():
        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if isinstance(input_types[0], list):
            matching_inputs = match(input_types[0], features)
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception(
                "Transform Primitive %s not tested" % transform_prim.name)
        for s in matching_inputs:
            instance = transform_prim(*s)

            # try to get name and calculate
            instance.get_name()
            calculate_feature_matrix([instance], entityset=es).head(5)
Example #15
import pandas as pd
import pytest

import featuretools as ft
from featuretools.primitives import get_aggregation_primitives, get_transform_primitives
from featuretools.utils.gen_utils import Library

UNSUPPORTED = [
    p.name for p in get_transform_primitives().values()
    if Library.DASK not in p.compatibility
]
UNSUPPORTED += [
    p.name for p in get_aggregation_primitives().values()
    if Library.DASK not in p.compatibility
]


def test_transform(pd_es, dask_es):
    pytest.skip(
        "TODO: Dask issue with `series.eq`. Fix once Dask Issue #7957 is closed."
    )
    primitives = ft.list_primitives()
    trans_list = primitives[primitives["type"] == "transform"]["name"].tolist()
    trans_primitives = [prim for prim in trans_list if prim not in UNSUPPORTED]
    agg_primitives = []
    cutoff_time = pd.Timestamp("2019-01-05 04:00")

    assert pd_es == dask_es

    # Run DFS using each dataframe as a target and confirm results match
    for df in pd_es.dataframes:
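        # The loop body is truncated in this example. A hypothetical
        # continuation sketch (an assumption, not the original code): run DFS
        # with each dataframe as the target on the pandas EntitySet; a
        # matching run against dask_es would mirror this call.
        feature_matrix, _ = ft.dfs(entityset=pd_es,
                                   target_dataframe_name=df.ww.name,
                                   trans_primitives=trans_primitives,
                                   agg_primitives=agg_primitives,
                                   cutoff_time=cutoff_time)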
Example #16
    def __init__(
        self,
        target_dataframe_name,
        entityset,
        agg_primitives=None,
        trans_primitives=None,
        where_primitives=None,
        groupby_trans_primitives=None,
        max_depth=2,
        max_features=-1,
        allowed_paths=None,
        ignore_dataframes=None,
        ignore_columns=None,
        primitive_options=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_stacking_limit=1,
    ):

        if target_dataframe_name not in entityset.dataframe_dict:
            es_name = entityset.id or "entity set"
            msg = "Provided target dataframe %s does not exist in %s" % (
                target_dataframe_name,
                es_name,
            )
            raise KeyError(msg)

        # need to change max_depth to None because DFS terminates when max_depth < 0
        if max_depth == -1:
            max_depth = None

        # if just one dataframe, set max depth to 1 (transform stacking rule)
        if len(entityset.dataframe_dict) == 1 and (max_depth is None
                                                   or max_depth > 1):
            warnings.warn(
                "Only one dataframe in entityset, changing max_depth to "
                "1 since deeper features cannot be created")
            max_depth = 1

        self.max_depth = max_depth

        self.max_features = max_features

        self.allowed_paths = allowed_paths
        if self.allowed_paths:
            self.allowed_paths = set()
            for path in allowed_paths:
                self.allowed_paths.add(tuple(path))

        if ignore_dataframes is None:
            self.ignore_dataframes = set()
        else:
            if not isinstance(ignore_dataframes, list):
                raise TypeError("ignore_dataframes must be a list")
            assert (target_dataframe_name
                    not in ignore_dataframes), "Can't ignore target_dataframe!"
            self.ignore_dataframes = set(ignore_dataframes)

        self.ignore_columns = defaultdict(set)
        if ignore_columns is not None:
            # check if ignore_columns is not {str: list}
            if not all(isinstance(i, str)
                       for i in ignore_columns.keys()) or not all(
                           isinstance(i, list)
                           for i in ignore_columns.values()):
                raise TypeError("ignore_columns should be dict[str -> list]")
            # check if list values are all of type str
            elif not all(
                    all(isinstance(v, str) for v in value)
                    for value in ignore_columns.values()):
                raise TypeError("list values should be of type str")
            for df_name, cols in ignore_columns.items():
                self.ignore_columns[df_name] = set(cols)
        self.target_dataframe_name = target_dataframe_name
        self.es = entityset

        for library in Library:
            if library.value == self.es.dataframe_type:
                df_library = library
                break

        aggregation_primitive_dict = primitives.get_aggregation_primitives()
        transform_primitive_dict = primitives.get_transform_primitives()
        if agg_primitives is None:
            agg_primitives = [
                p for p in primitives.get_default_aggregation_primitives()
                if df_library in p.compatibility
            ]
        self.agg_primitives = sorted([
            check_primitive(
                p,
                "aggregation",
                aggregation_primitive_dict,
                transform_primitive_dict,
            ) for p in agg_primitives
        ])

        if trans_primitives is None:
            trans_primitives = [
                p for p in primitives.get_default_transform_primitives()
                if df_library in p.compatibility
            ]
        self.trans_primitives = sorted([
            check_primitive(p, "transform", aggregation_primitive_dict,
                            transform_primitive_dict) for p in trans_primitives
        ])

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = sorted([
            check_primitive(p, "where", aggregation_primitive_dict,
                            transform_primitive_dict) for p in where_primitives
        ])

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = sorted([
            check_primitive(
                p,
                "groupby transform",
                aggregation_primitive_dict,
                transform_primitive_dict,
            ) for p in groupby_trans_primitives
        ])

        if primitive_options is None:
            primitive_options = {}
        all_primitives = (self.trans_primitives + self.agg_primitives +
                          self.where_primitives +
                          self.groupby_trans_primitives)
        bad_primitives = [
            prim.name for prim in all_primitives
            if df_library not in prim.compatibility
        ]
        if bad_primitives:
            msg = "Selected primitives are incompatible with {} EntitySets: {}"
            raise ValueError(
                msg.format(df_library.value, ", ".join(bad_primitives)))

        (
            self.primitive_options,
            self.ignore_dataframes,
            self.ignore_columns,
        ) = generate_all_primitive_options(
            all_primitives,
            primitive_options,
            self.ignore_dataframes,
            self.ignore_columns,
            self.es,
        )
        self.seed_features = sorted(seed_features or [],
                                    key=lambda f: f.unique_name())
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
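The public ft.dfs entry point forwards these same parameters to this constructor; a minimal sketch, assuming a featuretools 1.x EntitySet es with a 'customers' dataframe (the dataframe and primitive names are illustrative):

import featuretools as ft

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_dataframe_name='customers',
                                      agg_primitives=['mean', 'count'],
                                      trans_primitives=['month'],
                                      max_depth=2)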
Example #17
def test_trans_primitives_can_init_without_params():
    trans_primitives = get_transform_primitives().values()
    for trans_primitive in trans_primitives:
        trans_primitive()
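A minimal sketch of what the loop above does for a single primitive, with 'absolute' as a sample name:

from featuretools.primitives import get_transform_primitives

prim_cls = get_transform_primitives()['absolute']
instance = prim_cls()  # must be constructible with defaults only
print(instance.name)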