def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 groupby_trans_primitives=None,
                 max_depth=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):
        """Validate the arguments and store the synthesis configuration.

        Args:
            target_entity_id: Id of the target entity; must be a key of
                ``entityset.entity_dict``.
            entityset: Entity set to work on, stored as ``self.es``.
            agg_primitives: Aggregation primitives. Strings are looked up
                by lower-cased name in
                ``primitives.get_aggregation_primitives()``; every item
                is then passed through ``handle_primitive``. ``None``
                selects a built-in default list.
            trans_primitives: Transform primitives, each passed through
                ``check_trans_primitive``. ``None`` selects a built-in
                default list.
            where_primitives: Primitives stored as
                ``self.where_primitives``; strings are looked up in the
                aggregation primitive registry. Defaults to ``[Count]``.
            groupby_trans_primitives: Primitives passed through
                ``check_trans_primitive``. Defaults to an empty list.
            max_depth: Stored as ``self.max_depth``; ``-1`` is stored as
                ``None``.
            max_features: Stored unchanged.
            allowed_paths: Iterable of paths. When truthy, every path is
                converted to a tuple and the result is stored as a set;
                otherwise the value is stored as given.
            ignore_entities: Optional list of entity ids; must not contain
                ``target_entity_id``.
            ignore_variables: Optional mapping of entity id to an iterable
                of variable names; the names are stored as sets.
            primitive_options: Passed to
                ``generate_all_primitive_options``; defaults to ``{}``.
            seed_features: Stored as given, ``[]`` when falsy.
            drop_contains: Stored as given, ``[]`` when falsy.
            drop_exact: Stored as given, ``[]`` when falsy.
            where_stacking_limit: Stored unchanged.

        Raises:
            KeyError: ``target_entity_id`` is not in the entity set.
            TypeError: ``ignore_entities`` is not a list.
            AssertionError: ``ignore_entities`` contains the target entity.
            ValueError: An aggregation or where primitive name is unknown,
                or an item of ``agg_primitives`` is not an
                ``AggregationPrimitive``.
        """
        if target_entity_id not in entityset.entity_dict:
            es_name = entityset.id or 'entity set'
            msg = 'Provided target entity %s does not exist in %s' % (
                target_entity_id, es_name)
            raise KeyError(msg)

        # -1 is stored as None because DFS terminates when the depth is < 0
        self.max_depth = None if max_depth == -1 else max_depth

        self.max_features = max_features

        # Falsy values are kept untouched; real paths become hashable tuples.
        if allowed_paths:
            self.allowed_paths = {tuple(path) for path in allowed_paths}
        else:
            self.allowed_paths = allowed_paths

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            # Raised explicitly (instead of ``assert``) so the check still
            # runs when Python is started with -O.
            if target_entity_id in ignore_entities:
                raise AssertionError("Can't ignore target_entity!")
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            for entity_id, variable_names in ignore_variables.items():
                self.ignore_variables[entity_id] = set(variable_names)
        self.target_entity_id = target_entity_id
        self.es = entityset

        # Shared tail of the "unknown primitive" messages. It is joined to
        # the prefix with ``+`` so the exception carries ONE message string
        # rather than a tuple of fragments.
        list_hint = ("Call ft.primitives.list_primitives() to get"
                     " a list of available primitives")

        if agg_primitives is None:
            agg_primitives = [
                primitives.Sum, primitives.Std, primitives.Max,
                primitives.Skew, primitives.Min, primitives.Mean,
                primitives.Count, primitives.PercentTrue, primitives.NumUnique,
                primitives.Mode
            ]
        self.agg_primitives = []
        agg_prim_dict = primitives.get_aggregation_primitives()
        for prim in agg_primitives:
            if is_string(prim):
                key = prim.lower()
                if key not in agg_prim_dict:
                    raise ValueError(
                        "Unknown aggregation primitive {}. ".format(prim) +
                        list_hint)
                prim = agg_prim_dict[key]
            prim = handle_primitive(prim)
            if not isinstance(prim, AggregationPrimitive):
                raise ValueError("Primitive {} in agg_primitives is not an "
                                 "aggregation primitive".format(type(prim)))
            self.agg_primitives.append(prim)

        if trans_primitives is None:
            trans_primitives = [
                primitives.Day, primitives.Year, primitives.Month,
                primitives.Weekday, primitives.Haversine, primitives.NumWords,
                primitives.NumCharacters
            ]  # primitives.TimeSince
        self.trans_primitives = [
            check_trans_primitive(prim) for prim in trans_primitives
        ]

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = []
        for prim in where_primitives:
            if is_string(prim):
                # Where primitives are resolved from the aggregation registry.
                found = agg_prim_dict.get(prim.lower(), None)
                if found is None:
                    raise ValueError(
                        "Unknown where primitive {}. ".format(prim) +
                        list_hint)
                prim = found
            self.where_primitives.append(handle_primitive(prim))

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = [
            check_trans_primitive(prim) for prim in groupby_trans_primitives
        ]

        if primitive_options is None:
            primitive_options = {}
        all_primitives = self.trans_primitives + self.agg_primitives + \
            self.where_primitives + self.groupby_trans_primitives
        # The helper returns the options together with a (possibly updated)
        # ignore_entities value, which replaces the one stored above.
        self.primitive_options, self.ignore_entities =\
            generate_all_primitive_options(all_primitives,
                                           primitive_options,
                                           self.ignore_entities,
                                           self.ignore_variables,
                                           self.es)

        self.seed_features = seed_features or []
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
    def __init__(
        self,
        target_dataframe_name,
        entityset,
        agg_primitives=None,
        trans_primitives=None,
        where_primitives=None,
        groupby_trans_primitives=None,
        max_depth=2,
        max_features=-1,
        allowed_paths=None,
        ignore_dataframes=None,
        ignore_columns=None,
        primitive_options=None,
        seed_features=None,
        drop_contains=None,
        drop_exact=None,
        where_stacking_limit=1,
    ):
        """Validate the arguments and store the synthesis configuration.

        Args:
            target_dataframe_name: Name of the target dataframe; must be a
                key of ``entityset.dataframe_dict``.
            entityset: Entity set to work on, stored as ``self.es``.
            agg_primitives: Aggregation primitives, each validated with
                ``check_primitive(..., "aggregation", ...)``. ``None``
                selects the default aggregation primitives whose
                ``compatibility`` contains the entity set's library.
            trans_primitives: Same as above for the ``"transform"`` kind
                and the default transform primitives.
            where_primitives: Validated with the ``"where"`` kind.
                Defaults to ``[primitives.Count]``.
            groupby_trans_primitives: Validated with the
                ``"groupby transform"`` kind. Defaults to an empty list.
            max_depth: ``-1`` is stored as ``None``. When the entity set
                holds exactly one dataframe and the depth is ``None`` or
                greater than 1, a warning is issued and 1 is stored.
            max_features: Stored unchanged.
            allowed_paths: Iterable of paths. When truthy, every path is
                converted to a tuple and the result is stored as a set;
                otherwise the value is stored as given.
            ignore_dataframes: Optional list of dataframe names; must not
                contain ``target_dataframe_name``.
            ignore_columns: Optional ``dict`` mapping ``str`` dataframe
                names to ``list`` of ``str`` column names.
            primitive_options: Passed to
                ``generate_all_primitive_options``; defaults to ``{}``.
            seed_features: Stored sorted by ``unique_name()``; ``[]`` when
                falsy.
            drop_contains: Stored as given, ``[]`` when falsy.
            drop_exact: Stored as given, ``[]`` when falsy.
            where_stacking_limit: Stored unchanged.

        Raises:
            KeyError: ``target_dataframe_name`` is not in the entity set.
            TypeError: ``ignore_dataframes`` is not a list, or
                ``ignore_columns`` is not ``dict[str -> list[str]]``.
            AssertionError: ``ignore_dataframes`` contains the target.
            ValueError: A selected primitive is incompatible with the
                entity set's library.
        """
        if target_dataframe_name not in entityset.dataframe_dict:
            es_name = entityset.id or "entity set"
            msg = "Provided target dataframe %s does not exist in %s" % (
                target_dataframe_name,
                es_name,
            )
            raise KeyError(msg)

        # -1 is stored as None because DFS terminates when the depth is < 0
        if max_depth == -1:
            max_depth = None

        # if just one dataframe, set max depth to 1 (transform stacking rule)
        single_dataframe = len(entityset.dataframe_dict) == 1
        if single_dataframe and (max_depth is None or max_depth > 1):
            warnings.warn(
                "Only one dataframe in entityset, changing max_depth to "
                "1 since deeper features cannot be created")
            max_depth = 1

        self.max_depth = max_depth

        self.max_features = max_features

        # Falsy values are kept untouched; real paths become hashable tuples.
        if allowed_paths:
            self.allowed_paths = {tuple(path) for path in allowed_paths}
        else:
            self.allowed_paths = allowed_paths

        if ignore_dataframes is None:
            self.ignore_dataframes = set()
        else:
            if not isinstance(ignore_dataframes, list):
                raise TypeError("ignore_dataframes must be a list")
            # Raised explicitly (instead of ``assert``) so the check still
            # runs when Python is started with -O.
            if target_dataframe_name in ignore_dataframes:
                raise AssertionError("Can't ignore target_dataframe!")
            self.ignore_dataframes = set(ignore_dataframes)

        self.ignore_columns = defaultdict(set)
        if ignore_columns is not None:
            # check if ignore_columns is not {str: list}
            if not all(isinstance(name, str)
                       for name in ignore_columns.keys()) or not all(
                           isinstance(cols, list)
                           for cols in ignore_columns.values()):
                raise TypeError("ignore_columns should be dict[str -> list]")
            # check if list values are all of type str
            elif not all(
                    all(isinstance(col, str) for col in cols)
                    for cols in ignore_columns.values()):
                raise TypeError("list values should be of type str")
            for df_name, cols in ignore_columns.items():
                self.ignore_columns[df_name] = set(cols)
        self.target_dataframe_name = target_dataframe_name
        self.es = entityset

        # Find the Library member describing the entity set's dataframes.
        # NOTE(review): df_library stays unbound if nothing matches --
        # presumably every EntitySet reports a known dataframe_type; confirm.
        for library in Library:
            if library.value == self.es.dataframe_type:
                df_library = library
                break

        aggregation_primitive_dict = primitives.get_aggregation_primitives()
        transform_primitive_dict = primitives.get_transform_primitives()

        def _checked(candidates, kind):
            # Validate every candidate for the given kind; return them sorted.
            return sorted(
                check_primitive(
                    candidate,
                    kind,
                    aggregation_primitive_dict,
                    transform_primitive_dict,
                ) for candidate in candidates)

        if agg_primitives is None:
            agg_primitives = [
                p for p in primitives.get_default_aggregation_primitives()
                if df_library in p.compatibility
            ]
        self.agg_primitives = _checked(agg_primitives, "aggregation")

        if trans_primitives is None:
            trans_primitives = [
                p for p in primitives.get_default_transform_primitives()
                if df_library in p.compatibility
            ]
        self.trans_primitives = _checked(trans_primitives, "transform")

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = _checked(where_primitives, "where")

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = _checked(groupby_trans_primitives,
                                                 "groupby transform")

        if primitive_options is None:
            primitive_options = {}
        all_primitives = (self.trans_primitives + self.agg_primitives +
                          self.where_primitives +
                          self.groupby_trans_primitives)
        # Reject explicitly requested primitives that do not list the entity
        # set's library in their compatibility.
        bad_primitives = [
            prim.name for prim in all_primitives
            if df_library not in prim.compatibility
        ]
        if bad_primitives:
            msg = "Selected primitives are incompatible with {} EntitySets: {}"
            raise ValueError(
                msg.format(df_library.value, ", ".join(bad_primitives)))

        # The helper returns the options together with (possibly updated)
        # ignore sets, which replace the ones stored above.
        (
            self.primitive_options,
            self.ignore_dataframes,
            self.ignore_columns,
        ) = generate_all_primitive_options(
            all_primitives,
            primitive_options,
            self.ignore_dataframes,
            self.ignore_columns,
            self.es,
        )
        self.seed_features = sorted(seed_features or [],
                                    key=lambda f: f.unique_name())
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
    def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 groupby_trans_primitives=None,
                 max_depth=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):
        """Validate the arguments and store the synthesis configuration.

        Args:
            target_entity_id: Id of the target entity; must be a key of
                ``entityset.entity_dict``.
            entityset: Entity set to work on, stored as ``self.es``.
            agg_primitives: Aggregation primitives. Strings are looked up
                by lower-cased name in
                ``primitives.get_aggregation_primitives()``; every item
                is then passed through ``handle_primitive``. ``None``
                selects ``primitives.get_default_aggregation_primitives()``.
            trans_primitives: Transform primitives, each passed through
                ``check_trans_primitive``. ``None`` selects
                ``primitives.get_default_transform_primitives()``.
            where_primitives: Primitives stored as
                ``self.where_primitives``; strings are looked up in the
                aggregation primitive registry. Defaults to ``[Count]``.
            groupby_trans_primitives: Primitives passed through
                ``check_trans_primitive``. Defaults to an empty list.
            max_depth: Stored as ``self.max_depth``; ``-1`` is stored as
                ``None``.
            max_features: Stored unchanged.
            allowed_paths: Iterable of paths. When truthy, every path is
                converted to a tuple and the result is stored as a set;
                otherwise the value is stored as given.
            ignore_entities: Optional list of entity ids; must not contain
                ``target_entity_id``.
            ignore_variables: Optional ``dict`` mapping ``str`` entity ids
                to ``list`` of ``str`` variable names.
            primitive_options: Passed to
                ``generate_all_primitive_options``; defaults to ``{}``.
            seed_features: Stored as given, ``[]`` when falsy.
            drop_contains: Stored as given, ``[]`` when falsy.
            drop_exact: Stored as given, ``[]`` when falsy.
            where_stacking_limit: Stored unchanged.

        Raises:
            KeyError: ``target_entity_id`` is not in the entity set.
            TypeError: ``ignore_entities`` is not a list, or
                ``ignore_variables`` is not ``dict[str -> list[str]]``.
            AssertionError: ``ignore_entities`` contains the target entity.
            ValueError: An aggregation or where primitive name is unknown,
                or an item of ``agg_primitives`` is not an
                ``AggregationPrimitive``.
        """
        if target_entity_id not in entityset.entity_dict:
            es_name = entityset.id or 'entity set'
            msg = 'Provided target entity %s does not exist in %s' % (
                target_entity_id, es_name)
            raise KeyError(msg)

        # -1 is stored as None because DFS terminates when the depth is < 0
        self.max_depth = None if max_depth == -1 else max_depth

        self.max_features = max_features

        # Falsy values are kept untouched; real paths become hashable tuples.
        if allowed_paths:
            self.allowed_paths = {tuple(path) for path in allowed_paths}
        else:
            self.allowed_paths = allowed_paths

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            # Raised explicitly (instead of ``assert``) so the check still
            # runs when Python is started with -O.
            if target_entity_id in ignore_entities:
                raise AssertionError("Can't ignore target_entity!")
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            # check if ignore_variables is not {str: list}
            if not all(isinstance(entity_id, str)
                       for entity_id in ignore_variables.keys()) or not all(
                           isinstance(names, list)
                           for names in ignore_variables.values()):
                raise TypeError('ignore_variables should be dict[str -> list]')
            # check if list values are all of type str
            elif not all(
                    all(isinstance(name, str) for name in names)
                    for names in ignore_variables.values()):
                raise TypeError('list values should be of type str')
            for entity_id, variable_names in ignore_variables.items():
                self.ignore_variables[entity_id] = set(variable_names)
        self.target_entity_id = target_entity_id
        self.es = entityset

        # Shared tail of the "unknown primitive" messages. It is joined to
        # the prefix with ``+`` so the exception carries ONE message string
        # rather than a tuple of fragments.
        list_hint = ("Call ft.primitives.list_primitives() to get"
                     " a list of available primitives")

        if agg_primitives is None:
            agg_primitives = primitives.get_default_aggregation_primitives()
        self.agg_primitives = []
        agg_prim_dict = primitives.get_aggregation_primitives()
        for prim in agg_primitives:
            if isinstance(prim, str):
                key = prim.lower()
                if key not in agg_prim_dict:
                    raise ValueError(
                        "Unknown aggregation primitive {}. ".format(prim) +
                        list_hint)
                prim = agg_prim_dict[key]
            prim = handle_primitive(prim)
            if not isinstance(prim, AggregationPrimitive):
                raise ValueError("Primitive {} in agg_primitives is not an "
                                 "aggregation primitive".format(type(prim)))
            self.agg_primitives.append(prim)

        if trans_primitives is None:
            trans_primitives = primitives.get_default_transform_primitives()
        self.trans_primitives = [
            check_trans_primitive(prim) for prim in trans_primitives
        ]

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = []
        for prim in where_primitives:
            if isinstance(prim, str):
                # Where primitives are resolved from the aggregation registry.
                found = agg_prim_dict.get(prim.lower(), None)
                if found is None:
                    raise ValueError(
                        "Unknown where primitive {}. ".format(prim) +
                        list_hint)
                prim = found
            self.where_primitives.append(handle_primitive(prim))

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = [
            check_trans_primitive(prim) for prim in groupby_trans_primitives
        ]

        if primitive_options is None:
            primitive_options = {}
        all_primitives = self.trans_primitives + self.agg_primitives + \
            self.where_primitives + self.groupby_trans_primitives
        # The helper returns the options together with (possibly updated)
        # ignore sets, which replace the ones stored above.
        self.primitive_options, self.ignore_entities, self.ignore_variables =\
            generate_all_primitive_options(all_primitives,
                                           primitive_options,
                                           self.ignore_entities,
                                           self.ignore_variables,
                                           self.es)
        self.seed_features = seed_features or []
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
예제 #4
0
    def __init__(self,
                 target_dataframe_name,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 groupby_trans_primitives=None,
                 max_depth=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_dataframes=None,
                 ignore_columns=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):
        """Validate the arguments and store the synthesis configuration.

        Args:
            target_dataframe_name: Name of the target dataframe; must be a
                key of ``entityset.dataframe_dict``.
            entityset: Entity set to work on, stored as ``self.es``.
            agg_primitives: Aggregation primitives. Strings are looked up
                by lower-cased name in
                ``primitives.get_aggregation_primitives()``; every item
                is then passed through ``handle_primitive``. ``None``
                selects the default aggregation primitives whose
                ``compatibility`` contains the entity set's library.
                Stored sorted.
            trans_primitives: Transform primitives, each passed through
                ``check_trans_primitive``; ``None`` selects the compatible
                default transform primitives. Stored sorted.
            where_primitives: Primitives stored (sorted) as
                ``self.where_primitives``; strings are looked up in the
                aggregation primitive registry. Defaults to ``[Count]``.
            groupby_trans_primitives: Primitives passed through
                ``check_trans_primitive``. Defaults to an empty list.
            max_depth: ``-1`` is stored as ``None``. When the entity set
                holds exactly one dataframe and the depth is ``None`` or
                greater than 1, a warning is issued and 1 is stored.
            max_features: Stored unchanged.
            allowed_paths: Iterable of paths. When truthy, every path is
                converted to a tuple and the result is stored as a set;
                otherwise the value is stored as given.
            ignore_dataframes: Optional list of dataframe names; must not
                contain ``target_dataframe_name``.
            ignore_columns: Optional ``dict`` mapping ``str`` dataframe
                names to ``list`` of ``str`` column names.
            primitive_options: Passed to
                ``generate_all_primitive_options``; defaults to ``{}``.
            seed_features: Stored sorted by ``unique_name()``; ``[]`` when
                falsy.
            drop_contains: Stored as given, ``[]`` when falsy.
            drop_exact: Stored as given, ``[]`` when falsy.
            where_stacking_limit: Stored unchanged.

        Raises:
            KeyError: ``target_dataframe_name`` is not in the entity set.
            TypeError: ``ignore_dataframes`` is not a list, or
                ``ignore_columns`` is not ``dict[str -> list[str]]``.
            AssertionError: ``ignore_dataframes`` contains the target.
            ValueError: A primitive name is unknown, an item of
                ``agg_primitives`` is not an ``AggregationPrimitive``, or a
                selected primitive is incompatible with the entity set's
                library.
        """
        if target_dataframe_name not in entityset.dataframe_dict:
            es_name = entityset.id or 'entity set'
            msg = 'Provided target dataframe %s does not exist in %s' % (
                target_dataframe_name, es_name)
            raise KeyError(msg)

        # -1 is stored as None because DFS terminates when the depth is < 0
        if max_depth == -1:
            max_depth = None

        # if just one dataframe, set max depth to 1 (transform stacking rule)
        single_dataframe = len(entityset.dataframe_dict) == 1
        if single_dataframe and (max_depth is None or max_depth > 1):
            warnings.warn(
                "Only one dataframe in entityset, changing max_depth to "
                "1 since deeper features cannot be created")
            max_depth = 1

        self.max_depth = max_depth

        self.max_features = max_features

        # Falsy values are kept untouched; real paths become hashable tuples.
        if allowed_paths:
            self.allowed_paths = {tuple(path) for path in allowed_paths}
        else:
            self.allowed_paths = allowed_paths

        if ignore_dataframes is None:
            self.ignore_dataframes = set()
        else:
            if not isinstance(ignore_dataframes, list):
                raise TypeError('ignore_dataframes must be a list')
            # Raised explicitly (instead of ``assert``) so the check still
            # runs when Python is started with -O.
            if target_dataframe_name in ignore_dataframes:
                raise AssertionError("Can't ignore target_dataframe!")
            self.ignore_dataframes = set(ignore_dataframes)

        self.ignore_columns = defaultdict(set)
        if ignore_columns is not None:
            # check if ignore_columns is not {str: list}
            if not all(isinstance(name, str)
                       for name in ignore_columns.keys()) or not all(
                           isinstance(cols, list)
                           for cols in ignore_columns.values()):
                raise TypeError('ignore_columns should be dict[str -> list]')
            # check if list values are all of type str
            elif not all(
                    all(isinstance(col, str) for col in cols)
                    for cols in ignore_columns.values()):
                raise TypeError('list values should be of type str')
            for df_name, cols in ignore_columns.items():
                self.ignore_columns[df_name] = set(cols)
        self.target_dataframe_name = target_dataframe_name
        self.es = entityset

        # Find the Library member describing the entity set's dataframes.
        # NOTE(review): df_library stays unbound if nothing matches --
        # presumably every EntitySet reports a known dataframe_type; confirm.
        for library in Library:
            if library.value == self.es.dataframe_type:
                df_library = library
                break

        # Shared tail of the "unknown primitive" messages. It is joined to
        # the prefix with ``+`` so the exception carries ONE message string
        # rather than a tuple of fragments.
        list_hint = ("Call ft.primitives.list_primitives() to get"
                     " a list of available primitives")

        if agg_primitives is None:
            agg_primitives = [
                p for p in primitives.get_default_aggregation_primitives()
                if df_library in p.compatibility
            ]
        self.agg_primitives = []
        agg_prim_dict = primitives.get_aggregation_primitives()
        for prim in agg_primitives:
            if isinstance(prim, str):
                key = prim.lower()
                if key not in agg_prim_dict:
                    raise ValueError(
                        "Unknown aggregation primitive {}. ".format(prim) +
                        list_hint)
                prim = agg_prim_dict[key]
            prim = handle_primitive(prim)
            if not isinstance(prim, AggregationPrimitive):
                raise ValueError("Primitive {} in agg_primitives is not an "
                                 "aggregation primitive".format(type(prim)))
            self.agg_primitives.append(prim)
        self.agg_primitives.sort()

        if trans_primitives is None:
            trans_primitives = [
                p for p in primitives.get_default_transform_primitives()
                if df_library in p.compatibility
            ]
        self.trans_primitives = sorted(
            check_trans_primitive(prim) for prim in trans_primitives)

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = []
        for prim in where_primitives:
            if isinstance(prim, str):
                # Where primitives are resolved from the aggregation registry.
                found = agg_prim_dict.get(prim.lower(), None)
                if found is None:
                    raise ValueError(
                        "Unknown where primitive {}. ".format(prim) +
                        list_hint)
                prim = found
            self.where_primitives.append(handle_primitive(prim))
        self.where_primitives.sort()

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = sorted(
            check_trans_primitive(prim) for prim in groupby_trans_primitives)

        if primitive_options is None:
            primitive_options = {}
        all_primitives = self.trans_primitives + self.agg_primitives + \
            self.where_primitives + self.groupby_trans_primitives
        # Reject explicitly requested primitives that do not list the entity
        # set's library in their compatibility.
        bad_primitives = [
            prim.name for prim in all_primitives
            if df_library not in prim.compatibility
        ]
        if bad_primitives:
            msg = 'Selected primitives are incompatible with {} EntitySets: {}'
            raise ValueError(
                msg.format(df_library.value, ', '.join(bad_primitives)))

        # The helper returns the options together with (possibly updated)
        # ignore sets, which replace the ones stored above.
        self.primitive_options, self.ignore_dataframes, self.ignore_columns =\
            generate_all_primitive_options(all_primitives,
                                           primitive_options,
                                           self.ignore_dataframes,
                                           self.ignore_columns,
                                           self.es)
        self.seed_features = sorted(seed_features or [],
                                    key=lambda f: f.unique_name())
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit