def test_get_default_transform_primitives():
    """The default transform primitives must be exactly the expected set."""
    actual = set(get_default_transform_primitives())
    # Order is irrelevant; only membership is compared.
    expected = {
        Age,
        Day,
        Year,
        Month,
        Weekday,
        Haversine,
        NumWords,
        NumCharacters,
    }
    assert actual == expected
def __init__(self,
             target_entity_id,
             entityset,
             agg_primitives=None,
             trans_primitives=None,
             where_primitives=None,
             groupby_trans_primitives=None,
             max_depth=2,
             max_features=-1,
             allowed_paths=None,
             ignore_entities=None,
             ignore_variables=None,
             primitive_options=None,
             seed_features=None,
             drop_contains=None,
             drop_exact=None,
             where_stacking_limit=1):
    """Validate the synthesis configuration and store it on the instance.

    Args:
        target_entity_id (str): Id of the target entity. Must be a key of
            ``entityset.entity_dict``.
        entityset: Entity set to work on; stored as ``self.es``.
        agg_primitives (list, optional): Aggregation primitives. Strings
            are looked up (lower-cased) in
            ``primitives.get_aggregation_primitives()``; every entry is
            then passed through ``handle_primitive``. Defaults to
            ``primitives.get_default_aggregation_primitives()``.
        trans_primitives (list, optional): Transform primitives, each
            passed through ``check_trans_primitive``. Defaults to
            ``primitives.get_default_transform_primitives()``.
        where_primitives (list, optional): Primitives resolved like
            ``agg_primitives`` (without the aggregation type check).
            Defaults to ``[primitives.Count]``.
        groupby_trans_primitives (list, optional): Primitives passed
            through ``check_trans_primitive``. Defaults to ``[]``.
        max_depth (int, optional): Stored as ``self.max_depth``; ``-1`` is
            translated to ``None``.
        max_features (int, optional): Stored unchanged.
        allowed_paths (iterable, optional): When truthy, each path is
            converted to a tuple and the result is stored as a set.
        ignore_entities (list[str], optional): Entity ids to ignore. Must
            be a list and must not contain ``target_entity_id``.
        ignore_variables (dict[str, list[str]], optional): Variables to
            ignore, keyed by entity id.
        primitive_options (dict, optional): Forwarded to
            ``generate_all_primitive_options``. Defaults to ``{}``.
        seed_features (list, optional): Stored as given, ``[]`` if falsy.
        drop_contains (list, optional): Stored as given, ``[]`` if falsy.
        drop_exact (list, optional): Stored as given, ``[]`` if falsy.
        where_stacking_limit (int, optional): Stored unchanged.

    Raises:
        KeyError: If ``target_entity_id`` is not in the entity set.
        TypeError: If ``ignore_entities`` is not a list, or
            ``ignore_variables`` is not a ``dict[str -> list[str]]``.
        AssertionError: If ``target_entity_id`` is in ``ignore_entities``.
        ValueError: If a primitive name is unknown, or an entry of
            ``agg_primitives`` is not an aggregation primitive.
    """
    if target_entity_id not in entityset.entity_dict:
        es_name = entityset.id or 'entity set'
        msg = 'Provided target entity %s does not exist in %s' % (
            target_entity_id, es_name)
        raise KeyError(msg)

    # need to change max_depth to None because DFs terminates when <0
    if max_depth == -1:
        max_depth = None
    self.max_depth = max_depth

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = {tuple(path) for path in allowed_paths}

    if ignore_entities is None:
        self.ignore_entities = set()
    else:
        if not isinstance(ignore_entities, list):
            raise TypeError('ignore_entities must be a list')
        # Raised explicitly (instead of via ``assert``) so the check is
        # not stripped when Python runs with -O; the exception type is
        # unchanged for existing callers.
        if target_entity_id in ignore_entities:
            raise AssertionError("Can't ignore target_entity!")
        self.ignore_entities = set(ignore_entities)

    self.ignore_variables = defaultdict(set)
    if ignore_variables is not None:
        # check if ignore_variables is not {str: list}
        keys_are_str = all(isinstance(i, str) for i in ignore_variables.keys())
        if not keys_are_str or not all(
                isinstance(i, list) for i in ignore_variables.values()):
            raise TypeError('ignore_variables should be dict[str -> list]')
        # check if list values are all of type str
        elif not all(
                all(isinstance(v, str) for v in value)
                for value in ignore_variables.values()):
            raise TypeError('list values should be of type str')
        for eid, variable_names in ignore_variables.items():
            self.ignore_variables[eid] = set(variable_names)

    self.target_entity_id = target_entity_id
    self.es = entityset

    if agg_primitives is None:
        agg_primitives = primitives.get_default_aggregation_primitives()
    self.agg_primitives = []
    agg_prim_dict = primitives.get_aggregation_primitives()
    for a in agg_primitives:
        if isinstance(a, str):
            if a.lower() not in agg_prim_dict:
                # One joined message: passing the pieces as separate
                # arguments made str(exc) render as a tuple.
                raise ValueError(
                    "Unknown aggregation primitive {}. ".format(a) +
                    "Call ft.primitives.list_primitives() to get"
                    " a list of available primitives")
            a = agg_prim_dict[a.lower()]
        a = handle_primitive(a)
        if not isinstance(a, AggregationPrimitive):
            raise ValueError("Primitive {} in agg_primitives is not an "
                             "aggregation primitive".format(type(a)))
        self.agg_primitives.append(a)

    if trans_primitives is None:
        trans_primitives = primitives.get_default_transform_primitives()
    self.trans_primitives = []
    for t in trans_primitives:
        t = check_trans_primitive(t)
        self.trans_primitives.append(t)

    if where_primitives is None:
        where_primitives = [primitives.Count]
    self.where_primitives = []
    for p in where_primitives:
        if isinstance(p, str):
            prim_obj = agg_prim_dict.get(p.lower(), None)
            if prim_obj is None:
                raise ValueError(
                    "Unknown where primitive {}. ".format(p) +
                    "Call ft.primitives.list_primitives() to get"
                    " a list of available primitives")
            p = prim_obj
        p = handle_primitive(p)
        self.where_primitives.append(p)

    if groupby_trans_primitives is None:
        groupby_trans_primitives = []
    self.groupby_trans_primitives = []
    for p in groupby_trans_primitives:
        p = check_trans_primitive(p)
        self.groupby_trans_primitives.append(p)

    if primitive_options is None:
        primitive_options = {}
    all_primitives = self.trans_primitives + self.agg_primitives + \
        self.where_primitives + self.groupby_trans_primitives
    # The helper returns updated ignore sets alongside the options, so the
    # attributes assigned above are overwritten here.
    self.primitive_options, self.ignore_entities, self.ignore_variables =\
        generate_all_primitive_options(all_primitives,
                                       primitive_options,
                                       self.ignore_entities,
                                       self.ignore_variables,
                                       self.es)

    self.seed_features = seed_features or []
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
def __init__(
    self,
    target_dataframe_name,
    entityset,
    agg_primitives=None,
    trans_primitives=None,
    where_primitives=None,
    groupby_trans_primitives=None,
    max_depth=2,
    max_features=-1,
    allowed_paths=None,
    ignore_dataframes=None,
    ignore_columns=None,
    primitive_options=None,
    seed_features=None,
    drop_contains=None,
    drop_exact=None,
    where_stacking_limit=1,
):
    """Validate the synthesis configuration and store it on the instance.

    Args:
        target_dataframe_name (str): Name of the target dataframe. Must be
            a key of ``entityset.dataframe_dict``.
        entityset: Entity set to work on; stored as ``self.es``.
        agg_primitives (list, optional): Aggregation primitives, each
            resolved through ``check_primitive`` and stored sorted.
            Defaults to the default aggregation primitives compatible with
            the entity set's dataframe library.
        trans_primitives (list, optional): Transform primitives, resolved
            and sorted the same way. Defaults to the compatible default
            transform primitives.
        where_primitives (list, optional): Resolved through
            ``check_primitive`` and sorted. Defaults to
            ``[primitives.Count]``.
        groupby_trans_primitives (list, optional): Resolved through
            ``check_primitive`` and sorted. Defaults to ``[]``.
        max_depth (int, optional): ``-1`` is translated to ``None``. When
            the entity set holds a single dataframe and the depth is
            ``None`` or greater than 1, it is reset to 1 with a warning.
        max_features (int, optional): Stored unchanged.
        allowed_paths (iterable, optional): When truthy, each path is
            converted to a tuple and the result is stored as a set.
        ignore_dataframes (list[str], optional): Dataframe names to
            ignore. Must be a list and must not contain
            ``target_dataframe_name``.
        ignore_columns (dict[str, list[str]], optional): Columns to
            ignore, keyed by dataframe name.
        primitive_options (dict, optional): Forwarded to
            ``generate_all_primitive_options``. Defaults to ``{}``.
        seed_features (list, optional): Stored sorted by
            ``unique_name()``.
        drop_contains (list, optional): Stored as given, ``[]`` if falsy.
        drop_exact (list, optional): Stored as given, ``[]`` if falsy.
        where_stacking_limit (int, optional): Stored unchanged.

    Raises:
        KeyError: If ``target_dataframe_name`` is not in the entity set.
        TypeError: If ``ignore_dataframes`` is not a list, or
            ``ignore_columns`` is not a ``dict[str -> list[str]]``.
        AssertionError: If ``target_dataframe_name`` is in
            ``ignore_dataframes``.
        ValueError: If any selected primitive is incompatible with the
            entity set's dataframe library.
    """
    if target_dataframe_name not in entityset.dataframe_dict:
        es_name = entityset.id or "entity set"
        msg = "Provided target dataframe %s does not exist in %s" % (
            target_dataframe_name,
            es_name,
        )
        raise KeyError(msg)

    # need to change max_depth to None because DFs terminates when <0
    if max_depth == -1:
        max_depth = None

    # if just one dataframe, set max depth to 1 (transform stacking rule)
    if len(entityset.dataframe_dict) == 1 and (max_depth is None or max_depth > 1):
        warnings.warn(
            "Only one dataframe in entityset, changing max_depth to "
            "1 since deeper features cannot be created")
        max_depth = 1

    self.max_depth = max_depth

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = {tuple(path) for path in allowed_paths}

    if ignore_dataframes is None:
        self.ignore_dataframes = set()
    else:
        if not isinstance(ignore_dataframes, list):
            raise TypeError("ignore_dataframes must be a list")
        # Raised explicitly (instead of via ``assert``) so the check is
        # not stripped when Python runs with -O; the exception type is
        # unchanged for existing callers.
        if target_dataframe_name in ignore_dataframes:
            raise AssertionError("Can't ignore target_dataframe!")
        self.ignore_dataframes = set(ignore_dataframes)

    self.ignore_columns = defaultdict(set)
    if ignore_columns is not None:
        # check if ignore_columns is not {str: list}
        keys_are_str = all(isinstance(i, str) for i in ignore_columns.keys())
        if not keys_are_str or not all(
                isinstance(i, list) for i in ignore_columns.values()):
            raise TypeError("ignore_columns should be dict[str -> list]")
        # check if list values are all of type str
        elif not all(
                all(isinstance(v, str) for v in value)
                for value in ignore_columns.values()):
            raise TypeError("list values should be of type str")
        for df_name, cols in ignore_columns.items():
            self.ignore_columns[df_name] = set(cols)

    self.target_dataframe_name = target_dataframe_name
    self.es = entityset

    # NOTE(review): df_library stays unbound if no Library member matches
    # self.es.dataframe_type -- confirm that cannot happen for a valid
    # entity set.
    for library in Library:
        if library.value == self.es.dataframe_type:
            df_library = library
            break

    aggregation_primitive_dict = primitives.get_aggregation_primitives()
    transform_primitive_dict = primitives.get_transform_primitives()

    if agg_primitives is None:
        agg_primitives = [
            p for p in primitives.get_default_aggregation_primitives()
            if df_library in p.compatibility
        ]
    self.agg_primitives = sorted([
        check_primitive(
            p,
            "aggregation",
            aggregation_primitive_dict,
            transform_primitive_dict,
        ) for p in agg_primitives
    ])

    if trans_primitives is None:
        trans_primitives = [
            p for p in primitives.get_default_transform_primitives()
            if df_library in p.compatibility
        ]
    self.trans_primitives = sorted([
        check_primitive(p, "transform", aggregation_primitive_dict,
                        transform_primitive_dict) for p in trans_primitives
    ])

    if where_primitives is None:
        where_primitives = [primitives.Count]
    self.where_primitives = sorted([
        check_primitive(p, "where", aggregation_primitive_dict,
                        transform_primitive_dict) for p in where_primitives
    ])

    if groupby_trans_primitives is None:
        groupby_trans_primitives = []
    self.groupby_trans_primitives = sorted([
        check_primitive(
            p,
            "groupby transform",
            aggregation_primitive_dict,
            transform_primitive_dict,
        ) for p in groupby_trans_primitives
    ])

    if primitive_options is None:
        primitive_options = {}
    all_primitives = (self.trans_primitives + self.agg_primitives +
                      self.where_primitives + self.groupby_trans_primitives)
    bad_primitives = [
        prim.name for prim in all_primitives
        if df_library not in prim.compatibility
    ]
    if bad_primitives:
        msg = "Selected primitives are incompatible with {} EntitySets: {}"
        raise ValueError(
            msg.format(df_library.value, ", ".join(bad_primitives)))

    # The helper returns updated ignore sets alongside the options, so the
    # attributes assigned above are overwritten here.
    (
        self.primitive_options,
        self.ignore_dataframes,
        self.ignore_columns,
    ) = generate_all_primitive_options(
        all_primitives,
        primitive_options,
        self.ignore_dataframes,
        self.ignore_columns,
        self.es,
    )
    self.seed_features = sorted(seed_features or [],
                                key=lambda f: f.unique_name())
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
def __init__(self,
             target_dataframe_name,
             entityset,
             agg_primitives=None,
             trans_primitives=None,
             where_primitives=None,
             groupby_trans_primitives=None,
             max_depth=2,
             max_features=-1,
             allowed_paths=None,
             ignore_dataframes=None,
             ignore_columns=None,
             primitive_options=None,
             seed_features=None,
             drop_contains=None,
             drop_exact=None,
             where_stacking_limit=1):
    """Validate the synthesis configuration and store it on the instance.

    Args:
        target_dataframe_name (str): Name of the target dataframe. Must be
            a key of ``entityset.dataframe_dict``.
        entityset: Entity set to work on; stored as ``self.es``.
        agg_primitives (list, optional): Aggregation primitives. Strings
            are looked up (lower-cased) in
            ``primitives.get_aggregation_primitives()``; every entry is
            then passed through ``handle_primitive``. The result is stored
            sorted. Defaults to the default aggregation primitives
            compatible with the entity set's dataframe library.
        trans_primitives (list, optional): Transform primitives, each
            passed through ``check_trans_primitive`` and stored sorted.
            Defaults to the compatible default transform primitives.
        where_primitives (list, optional): Resolved like
            ``agg_primitives`` (without the aggregation type check) and
            stored sorted. Defaults to ``[primitives.Count]``.
        groupby_trans_primitives (list, optional): Passed through
            ``check_trans_primitive`` and stored sorted. Defaults to
            ``[]``.
        max_depth (int, optional): ``-1`` is translated to ``None``. When
            the entity set holds a single dataframe and the depth is
            ``None`` or greater than 1, it is reset to 1 with a warning.
        max_features (int, optional): Stored unchanged.
        allowed_paths (iterable, optional): When truthy, each path is
            converted to a tuple and the result is stored as a set.
        ignore_dataframes (list[str], optional): Dataframe names to
            ignore. Must be a list and must not contain
            ``target_dataframe_name``.
        ignore_columns (dict[str, list[str]], optional): Columns to
            ignore, keyed by dataframe name.
        primitive_options (dict, optional): Forwarded to
            ``generate_all_primitive_options``. Defaults to ``{}``.
        seed_features (list, optional): Stored sorted by
            ``unique_name()``.
        drop_contains (list, optional): Stored as given, ``[]`` if falsy.
        drop_exact (list, optional): Stored as given, ``[]`` if falsy.
        where_stacking_limit (int, optional): Stored unchanged.

    Raises:
        KeyError: If ``target_dataframe_name`` is not in the entity set.
        TypeError: If ``ignore_dataframes`` is not a list, or
            ``ignore_columns`` is not a ``dict[str -> list[str]]``.
        AssertionError: If ``target_dataframe_name`` is in
            ``ignore_dataframes``.
        ValueError: If a primitive name is unknown, an entry of
            ``agg_primitives`` is not an aggregation primitive, or a
            selected primitive is incompatible with the entity set's
            dataframe library.
    """
    if target_dataframe_name not in entityset.dataframe_dict:
        es_name = entityset.id or 'entity set'
        msg = 'Provided target dataframe %s does not exist in %s' % (
            target_dataframe_name, es_name)
        raise KeyError(msg)

    # need to change max_depth to None because DFs terminates when <0
    if max_depth == -1:
        max_depth = None

    # if just one dataframe, set max depth to 1 (transform stacking rule)
    if len(entityset.dataframe_dict) == 1 and (max_depth is None or max_depth > 1):
        warnings.warn(
            "Only one dataframe in entityset, changing max_depth to "
            "1 since deeper features cannot be created")
        max_depth = 1

    self.max_depth = max_depth

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = {tuple(path) for path in allowed_paths}

    if ignore_dataframes is None:
        self.ignore_dataframes = set()
    else:
        if not isinstance(ignore_dataframes, list):
            raise TypeError('ignore_dataframes must be a list')
        # Raised explicitly (instead of via ``assert``) so the check is
        # not stripped when Python runs with -O; the exception type is
        # unchanged for existing callers.
        if target_dataframe_name in ignore_dataframes:
            raise AssertionError("Can't ignore target_dataframe!")
        self.ignore_dataframes = set(ignore_dataframes)

    self.ignore_columns = defaultdict(set)
    if ignore_columns is not None:
        # check if ignore_columns is not {str: list}
        keys_are_str = all(isinstance(i, str) for i in ignore_columns.keys())
        if not keys_are_str or not all(
                isinstance(i, list) for i in ignore_columns.values()):
            raise TypeError('ignore_columns should be dict[str -> list]')
        # check if list values are all of type str
        elif not all(
                all(isinstance(v, str) for v in value)
                for value in ignore_columns.values()):
            raise TypeError('list values should be of type str')
        for df_name, cols in ignore_columns.items():
            self.ignore_columns[df_name] = set(cols)

    self.target_dataframe_name = target_dataframe_name
    self.es = entityset

    # NOTE(review): df_library stays unbound if no Library member matches
    # self.es.dataframe_type -- confirm that cannot happen for a valid
    # entity set.
    for library in Library:
        if library.value == self.es.dataframe_type:
            df_library = library
            break

    if agg_primitives is None:
        agg_primitives = [
            p for p in primitives.get_default_aggregation_primitives()
            if df_library in p.compatibility
        ]
    self.agg_primitives = []
    agg_prim_dict = primitives.get_aggregation_primitives()
    for a in agg_primitives:
        if isinstance(a, str):
            if a.lower() not in agg_prim_dict:
                # One joined message: passing the pieces as separate
                # arguments made str(exc) render as a tuple.
                raise ValueError(
                    "Unknown aggregation primitive {}. ".format(a) +
                    "Call ft.primitives.list_primitives() to get"
                    " a list of available primitives")
            a = agg_prim_dict[a.lower()]
        a = handle_primitive(a)
        if not isinstance(a, AggregationPrimitive):
            raise ValueError("Primitive {} in agg_primitives is not an "
                             "aggregation primitive".format(type(a)))
        self.agg_primitives.append(a)
    self.agg_primitives.sort()

    if trans_primitives is None:
        trans_primitives = [
            p for p in primitives.get_default_transform_primitives()
            if df_library in p.compatibility
        ]
    self.trans_primitives = []
    for t in trans_primitives:
        t = check_trans_primitive(t)
        self.trans_primitives.append(t)
    self.trans_primitives.sort()

    if where_primitives is None:
        where_primitives = [primitives.Count]
    self.where_primitives = []
    for p in where_primitives:
        if isinstance(p, str):
            prim_obj = agg_prim_dict.get(p.lower(), None)
            if prim_obj is None:
                raise ValueError(
                    "Unknown where primitive {}. ".format(p) +
                    "Call ft.primitives.list_primitives() to get"
                    " a list of available primitives")
            p = prim_obj
        p = handle_primitive(p)
        self.where_primitives.append(p)
    self.where_primitives.sort()

    if groupby_trans_primitives is None:
        groupby_trans_primitives = []
    self.groupby_trans_primitives = []
    for p in groupby_trans_primitives:
        p = check_trans_primitive(p)
        self.groupby_trans_primitives.append(p)
    self.groupby_trans_primitives.sort()

    if primitive_options is None:
        primitive_options = {}
    all_primitives = self.trans_primitives + self.agg_primitives + \
        self.where_primitives + self.groupby_trans_primitives
    bad_primitives = [
        prim.name for prim in all_primitives
        if df_library not in prim.compatibility
    ]
    if bad_primitives:
        msg = 'Selected primitives are incompatible with {} EntitySets: {}'
        raise ValueError(
            msg.format(df_library.value, ', '.join(bad_primitives)))

    # The helper returns updated ignore sets alongside the options, so the
    # attributes assigned above are overwritten here.
    self.primitive_options, self.ignore_dataframes, self.ignore_columns =\
        generate_all_primitive_options(all_primitives,
                                       primitive_options,
                                       self.ignore_dataframes,
                                       self.ignore_columns,
                                       self.es)
    self.seed_features = sorted(seed_features or [],
                                key=lambda f: f.unique_name())
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit