def test_init_and_name(es):
    log = es['log']
    rating = ft.Feature(ft.IdentityFeature(es["products"].ww["rating"]), "log")
    log_features = [ft.Feature(es['log'].ww[col]) for col in log.columns] + \
        [ft.Feature(rating, primitive=GreaterThanScalar(2.5)),
         ft.Feature(rating, primitive=GreaterThanScalar(3.5))]
    # Add Timedelta feature
    # features.append(pd.Timestamp.now() - ft.Feature(log['datetime']))
    customers_features = [
        ft.Feature(es["customers"].ww[col]) for col in es["customers"].columns
    ]

    # check all transform primitives have a name
    for attribute_string in dir(ft.primitives):
        attr = getattr(ft.primitives, attribute_string)
        if isclass(attr):
            if issubclass(attr, TransformPrimitive) and attr != TransformPrimitive:
                assert getattr(attr, "name") is not None

    trans_primitives = get_transform_primitives().values()
    # If the EntitySet is Dask-backed, use only Dask-compatible primitives
    if es.dataframe_type == Library.DASK.value:
        trans_primitives = [
            prim for prim in trans_primitives if Library.DASK in prim.compatibility
        ]
    if es.dataframe_type == Library.KOALAS.value:
        trans_primitives = [
            prim for prim in trans_primitives if Library.KOALAS in prim.compatibility
        ]

    for transform_prim in trans_primitives:
        # skip automated testing of a few special cases
        features_to_use = log_features
        if transform_prim in [NotEqual, Equal]:
            continue
        if transform_prim in [Age]:
            features_to_use = customers_features

        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if type(input_types[0]) == list:
            matching_inputs = match(input_types[0], features_to_use)
        else:
            matching_inputs = match(input_types, features_to_use)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested" % transform_prim.name)
        for prim in matching_inputs:
            instance = ft.Feature(prim, primitive=transform_prim)

            # try to get the name and calculate the feature matrix
            instance.get_name()
            ft.calculate_feature_matrix([instance], entityset=es)
def test_init_and_name(es): log = es['log'] rating = ft.Feature(es["products"]["rating"], es["log"]) features = [ft.Feature(v) for v in log.variables] +\ [ft.Feature(rating, primitive=GreaterThanScalar(2.5))] # Add Timedelta feature # features.append(pd.Timestamp.now() - ft.Feature(log['datetime'])) for transform_prim in get_transform_primitives().values(): # skip automated testing if a few special cases if transform_prim in [NotEqual, Equal]: continue # use the input_types matching function from DFS input_types = transform_prim.input_types if type(input_types[0]) == list: matching_inputs = match(input_types[0], features) else: matching_inputs = match(input_types, features) if len(matching_inputs) == 0: raise Exception("Transform Primitive %s not tested" % transform_prim.name) for s in matching_inputs: instance = ft.Feature(s, primitive=transform_prim) # try to get name and calculate instance.get_name() ft.calculate_feature_matrix([instance], entityset=es).head(5)
def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(
            entityset=pd_es,
            target_dataframe_name="régions",
            agg_primitives=["percent_true"],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                "greater_than_scalar": {"include_dataframes": ["stores"]}
            },
            features_only=True,
        )

    assert not record
def test_does_not_warn_with_stacking_feature(pd_es):
    with pytest.warns(None) as record:
        dfs(entityset=pd_es,
            target_entity='régions',
            agg_primitives=['percent_true'],
            trans_primitives=[GreaterThanScalar(5)],
            primitive_options={
                'greater_than_scalar': {
                    'include_entities': ['stores']
                }
            },
            features_only=True)

    assert not record
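# A minimal runnable sketch (separate from the pd_es fixture above) of the same
# primitive_options pattern against the mock-customer demo data: restricting
# greater_than_scalar to a single dataframe so DFS only stacks it where intended.
# The dataframe names here come from ft.demo.load_mock_customer, not the fixture.
# import featuretools as ft
# from featuretools.primitives import GreaterThanScalar
#
# es = ft.demo.load_mock_customer(return_entityset=True)
# features = ft.dfs(
#     entityset=es,
#     target_dataframe_name="customers",
#     agg_primitives=["percent_true"],
#     trans_primitives=[GreaterThanScalar(5)],
#     primitive_options={"greater_than_scalar": {"include_dataframes": ["transactions"]}},
#     features_only=True,
# )
# print(features[:5])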
def test_override_boolean(es):
    count = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)
    count_lo = ft.Feature(count, primitive=GreaterThanScalar(1))
    count_hi = ft.Feature(count, primitive=LessThanScalar(10))

    to_test = [[True, True, True],
               [True, True, False],
               [False, False, True]]

    features = []
    features.append(count_lo.OR(count_hi))
    features.append(count_lo.AND(count_hi))
    features.append(~(count_lo.AND(count_hi)))

    df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2])
    for i, test in enumerate(to_test):
        v = df[features[i].get_name()].values.tolist()
        assert v == test
def test_init_and_name(es): log = es['log'] rating = ft.Feature(es["products"]["rating"], es["log"]) log_features = [ft.Feature(v) for v in log.variables] +\ [ft.Feature(rating, primitive=GreaterThanScalar(2.5))] # Add Timedelta feature # features.append(pd.Timestamp.now() - ft.Feature(log['datetime'])) customers_features = [ft.Feature(v) for v in es["customers"].variables] trans_primitives = get_transform_primitives().values() # If Dask EntitySet use only Dask compatible primitives if isinstance(es['log'].df, dd.DataFrame): trans_primitives = [ prim for prim in trans_primitives if Library.DASK in prim.compatibility ] if ks and isinstance(es['log'].df, ks.DataFrame): trans_primitives = [ prim for prim in trans_primitives if Library.KOALAS in prim.compatibility ] for transform_prim in trans_primitives: # skip automated testing if a few special cases features_to_use = log_features if transform_prim in [NotEqual, Equal]: continue if transform_prim in [Age]: features_to_use = customers_features # use the input_types matching function from DFS input_types = transform_prim.input_types if type(input_types[0]) == list: matching_inputs = match(input_types[0], features_to_use) else: matching_inputs = match(input_types, features_to_use) if len(matching_inputs) == 0: raise Exception("Transform Primitive %s not tested" % transform_prim.name) for prim in matching_inputs: instance = ft.Feature(prim, primitive=transform_prim) # try to get name and calculate instance.get_name() ft.calculate_feature_matrix([instance], entityset=es)
def test_override_boolean(es): count = ft.Feature(es["log"].ww["id"], parent_dataframe_name="sessions", primitive=Count) count_lo = ft.Feature(count, primitive=GreaterThanScalar(1)) count_hi = ft.Feature(count, primitive=LessThanScalar(10)) to_test = [[True, True, True], [True, True, False], [False, False, True]] features = [] features.append(count_lo.OR(count_hi)) features.append(count_lo.AND(count_hi)) features.append(~(count_lo.AND(count_hi))) df = ft.calculate_feature_matrix(entityset=es, features=features, instance_ids=[0, 1, 2]) df = to_pandas(df, index="id", sort_index=True) for i, test in enumerate(to_test): v = df[features[i].get_name()].tolist() assert v == test