Example #1
def _validate_entity_params(id, df, time_index):
    '''Validation checks for Entity inputs'''
    assert is_string(id), "Entity id must be a string"
    assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
    for c in df.columns:
        if not is_string(c):
            raise ValueError("All column names must be strings (Column {} "
                             "is not a string)".format(c))
    if time_index is not None and time_index not in df.columns:
        raise LookupError('Time index not found in dataframe')
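
A minimal usage sketch for the validator above (hedged: _validate_entity_params and the is_string helper are featuretools internals assumed to be in scope, not a public API):

import pandas as pd

df = pd.DataFrame({
    "id": [1, 2, 3],
    "signup": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"]),
})

_validate_entity_params("customers", df, time_index="signup")    # passes silently
# _validate_entity_params("customers", df, time_index="missing") # would raise LookupError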
Example #2
def _validate_entity_params(id, df, time_index):
    '''Validation checks for Entity inputs'''
    assert is_string(id), "Entity id must be a string"
    assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
    for c in df.columns:
        if not is_string(c):
            raise ValueError("All column names must be strings (Column {} "
                             "is not a string)".format(c))
    if time_index is not None and time_index not in df.columns:
        raise LookupError('Time index not found in dataframe')
Example #3
def _check_timedelta(td):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'
    If a pd.Timedelta object is passed, units will be converted to seconds due to the underlying representation
        of pd.Timedelta.
    If a pd.DateOffset object is passed, it will be converted to a Featuretools Timedelta if it has one
        temporal parameter. Otherwise, it will remain a pd.DateOffset.
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        return td
    elif not (is_string(td) or isinstance(td, pd.Timedelta) or
              isinstance(td, (int, float)) or isinstance(td, pd.DateOffset)):
        raise ValueError("Unable to parse timedelta: {}".format(td))
    if isinstance(td, pd.Timedelta):
        unit = 's'
        value = td.total_seconds()
        times = {unit: value}
        return Timedelta(times, delta_obj=td)
    elif isinstance(td, pd.DateOffset):
        # DateOffsets
        if td.__class__.__name__ == "DateOffset":
            times = dict()
            for td_unit, td_value in td.kwds.items():
                times[td_unit] = td_value
            return Timedelta(times, delta_obj=td)
        # Special offsets (such as BDay)
        else:
            unit = td.__class__.__name__
            value = td.__dict__['n']
            times = dict([(unit, value)])
            return Timedelta(times, delta_obj=td)
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from ".format(value) +
                                 "timedelta string: {}".format(td))
        times = {unit: value}
        return Timedelta(times)
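
A hedged sketch of the parsing branches above, assuming _check_timedelta and the Featuretools Timedelta class are in scope as internals; the comments describe which branch each input takes:

import pandas as pd

_check_timedelta("2 Minutes")              # string branch -> Timedelta({'Minutes': 2})
_check_timedelta("2m")                     # shortform without a space, same branch
_check_timedelta(pd.Timedelta("1 day"))    # stored in seconds -> Timedelta({'s': 86400.0})
_check_timedelta(pd.DateOffset(months=3))  # generic DateOffset -> Timedelta({'months': 3})
_check_timedelta(pd.offsets.BDay(2))       # special offset, keyed by its class name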
Example #4
    def __init__(self, value, unit=None):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
        """
        # TODO: check if value is int or float
        if is_string(value):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # lets get_name know that, although the unit was converted to 'd', it was originally given in another unit (e.g. weeks)
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # weeks
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit
        self.delta_obj = self.get_unit_type()
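
A hedged construction sketch for the class whose constructor is shown above; the top-level featuretools.Timedelta export and the exact week-to-day mapping are assumptions based on the pre-1.0 API:

import featuretools as ft  # assumes Timedelta is exported at the top level (pre-1.0 API)

td = ft.Timedelta("7 days")      # string form: value and unit are parsed via _check_timedelta
week = ft.Timedelta(1, "week")   # per the constructor above, week units are stored as days
# week.unit == 'd', week.value == 7, and week._original_unit keeps the pre-conversion unit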
Example #5
    def __init__(self, id, entity, name=None):
        assert is_string(id), "Variable id must be a string"
        self.id = id
        self._name = name
        self.entity_id = entity.id
        assert entity.entityset is not None, "Entity must contain reference to EntitySet"
        self.entity = entity
        self._interesting_values = None
Example #6
    def __init__(self, id, entity, name=None):
        assert is_string(id), "Variable id must be a string"
        self.id = id
        self._name = name
        self.entity_id = entity.id
        assert entity.entityset is not None, "Entity must contain reference to EntitySet"
        self.entity = entity
        self._interesting_values = None
Example #7
def ensure_compatible_dtype(left, right):
    # Pandas converts dtype to float
    # if all nans. If the actual values are
    # strings/objects though, future features
    # that depend on these values may error
    # unless we explicitly set the dtype to object
    if isinstance(left, pd.Series) and isinstance(right, pd.Series):
        if left.dtype != object and right.dtype == object:
            left = left.astype(object)
        elif right.dtype != object and left.dtype == object:
            right = right.astype(object)
    elif isinstance(left, pd.Series):
        if left.dtype != object and is_string(right):
            left = left.astype(object)
    elif isinstance(right, pd.Series):
        if right.dtype != object and is_string(left):
            right = right.astype(object)
    return left, right
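
A hedged sketch of the dtype reconciliation above, assuming ensure_compatible_dtype (and the is_string helper it calls) are in scope:

import numpy as np
import pandas as pd

all_nan = pd.Series([np.nan, np.nan])   # float64 dtype, even if the values are conceptually strings
strings = pd.Series(["a", "b"])         # object dtype

left, right = ensure_compatible_dtype(all_nan, strings)
# left is now object dtype, so later string comparisons won't fail on a float column

left, right = ensure_compatible_dtype(all_nan, "some_id")
# Series vs. string scalar: the Series is cast to object as well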
Example #8
    def check_value(self, value, unit):
        if is_string(value):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            self.times = td.times
        elif isinstance(value, dict):
            self.times = value
        else:
            self.times = {unit: value}
Example #9
def ensure_compatible_dtype(left, right):
    # Pandas converts dtype to float
    # if all nans. If the actual values are
    # strings/objects though, future features
    # that depend on these values may error
    # unless we explicitly set the dtype to object
    if isinstance(left, pd.Series) and isinstance(right, pd.Series):
        if left.dtype != object and right.dtype == object:
            left = left.astype(object)
        elif right.dtype != object and left.dtype == object:
            right = right.astype(object)
    elif isinstance(left, pd.Series):
        if left.dtype != object and is_string(right):
            left = left.astype(object)
    elif isinstance(right, pd.Series):
        if right.dtype != object and is_string(left):
            right = right.astype(object)
    return left, right
Example #10
def _check_timedelta(td, entity_id=None, related_entity_id=None):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'
    If a pd.Timedelta object is passed, units will be converted to seconds due to the underlying representation
        of pd.Timedelta.
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        return td
    elif not (is_string(td) or isinstance(td, pd.Timedelta)
              or isinstance(td, (int, float))):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    value = None
    try:
        value = int(td)
    except Exception:
        try:
            value = float(td)
        except Exception:
            pass
    if isinstance(td, pd.Timedelta):
        unit = 's'
        value = td.total_seconds()
    elif value is not None:
        # bare int/float inputs are treated as generic units ('u'),
        # matching the entity-aware variant of this function shown later in this section
        unit = 'u'
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError(
                    "Unable to parse value {} from ".format(value) +
                    "timedelta string: {}".format(td))
    return Timedelta(value, unit)
Example #11
def check_trans_primitive(primitive):
    trans_prim_dict = primitives.get_transform_primitives()

    if is_string(primitive):
        if primitive.lower() not in trans_prim_dict:
            raise ValueError(
                "Unknown transform primitive {}. ".format(primitive),
                "Call ft.primitives.list_primitives() to get",
                " a list of available primitives")
        primitive = trans_prim_dict[primitive.lower()]
    primitive = handle_primitive(primitive)
    if not isinstance(primitive, TransformPrimitive):
        raise ValueError("Primitive {} in trans_primitives or "
                         "groupby_trans_primitives is not a transform "
                         "primitive".format(type(primitive)))
    return primitive
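
A hedged sketch of how the lookup above behaves, assuming check_trans_primitive, handle_primitive, and the primitives module are in scope as featuretools internals:

day = check_trans_primitive("day")             # case-insensitive name lookup, returns an instance
same = check_trans_primitive(primitives.Day)   # classes and instances pass through handle_primitive
# check_trans_primitive(primitives.Sum)        # would raise ValueError: not a transform primitive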
Example #12
    def __init__(self,
                 value,
                 unit=None,
                 entity=None,
                 data=None,
                 inclusive=False):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
            entity (str, optional) : Entity id to use if unit equals
                "observations".
            data (pd.Series, optional) : series of timestamps to use
                with observations. Can be calculated later.
            inclusive (bool, optional) : if True, include events that are
                exactly timedelta distance away from the original time/observation
        """
        # TODO: check if value is int or float
        if is_string(value):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # lets get_name know that, although the unit was converted to 'd', it was originally given in another unit (e.g. weeks)
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # weeks
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit

        if unit == self._Observations and entity is None:
            raise Exception("Must define entity to use %s as unit" % (unit))

        self.entity = entity
        self.data = data

        self.inclusive = inclusive
Example #13
    def __init__(self,
                 base_feature,
                 group_feature,
                 time_index=None,
                 where=None,
                 use_previous=None):
        """Summary

        Args:
            agg_feature (type): subclass of :class:`.AggregationPrimitive`;
                aggregation method being used.  This is passed by the
                constructors of the cumfeat subclasses
            base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable calculated on
            group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable used to group the rows before computation
            where (optional[:class:`.PrimitiveBase`]): condition restricting which
                rows are included in the computation
            use_previous (optional[:class:`.Timedelta`]): window of time or
                observations to look back over
        """
        self.return_type = self.agg_feature.return_type

        base_feature = self._check_feature(base_feature)

        td_entity_id = None
        if is_string(use_previous):
            td_entity_id = base_feature.entity.id
        self.use_previous = _check_timedelta(use_previous,
                                             entity_id=td_entity_id)

        group_feature = self._check_feature(group_feature)
        self.group_feature = group_feature

        self.base_features = [base_feature, group_feature]

        if time_index is None:
            entity = base_feature.entity
            time_index = IdentityFeature(entity[entity.time_index])
        self.base_features += [time_index]

        if where is not None:
            self.where = where

        super(CumFeature, self).__init__(*self.base_features)
Example #14
    def __init__(self, value, unit=None, entity=None, data=None, inclusive=False):
        """
        Args:
            value (float, str) : Value of timedelta, or string providing
                both unit and value.
            unit (str) : Unit of time delta.
            entity (str, optional) : Entity id to use if unit equals
                "observations".
            data (pd.Series, optional) : series of timestamps to use
                with observations. Can be calculated later.
            inclusive (bool, optional) : if True, include events that are
                exactly timedelta distance away from the original time/observation
        """
        # TODO: check if value is int or float
        if is_string(value):
            from featuretools.utils.wrangle import _check_timedelta
            td = _check_timedelta(value)
            value, unit = td.value, td.unit

        self.value = value
        self._original_unit = None  # lets get_name know that, although the unit was converted to 'd', it was originally given in another unit (e.g. weeks)
        unit = self._check_unit_plural(unit)
        assert unit in self._readable_units or unit in self._readable_to_unit
        if unit in self._readable_to_unit:
            unit = self._readable_to_unit[unit]

        # weeks
        if unit in self._convert_to_days:
            self._original_unit = unit
            self.value = self.value * self._convert_to_days[unit]
            unit = 'd'

        self.unit = unit

        if unit == self._Observations and entity is None:
            raise Exception("Must define entity to use %s as unit" % (unit))

        self.entity = entity
        self.data = data

        self.inclusive = inclusive
Example #15
    def __init__(self, base_feature, group_feature, time_index=None,
                 where=None, use_previous=None):
        """Summary

        Args:
            agg_feature (type): subclass of :class:`.AggregationPrimitive`;
                aggregation method being used.  This is passed by the
                constructors of the cumfeat subclasses
            base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable calculated on
            group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
                or variable used to group the rows before computation
            where (optional[:class:`.PrimitiveBase`]): condition restricting which
                rows are included in the computation
            use_previous (optional[:class:`.Timedelta`]): window of time or
                observations to look back over
        """
        self.return_type = self.agg_feature.return_type

        base_feature = self._check_feature(base_feature)

        td_entity_id = None
        if is_string(use_previous):
            td_entity_id = base_feature.entity.id
        self.use_previous = _check_timedelta(
            use_previous, entity_id=td_entity_id)

        group_feature = self._check_feature(group_feature)
        self.group_feature = group_feature

        self.base_features = [base_feature, group_feature]

        if time_index is None:
            entity = base_feature.entity
            time_index = IdentityFeature(entity[entity.time_index])
        self.base_features += [time_index]

        if where is not None:
            self.where = where

        super(CumFeature, self).__init__(*self.base_features)
Example #16
    def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 max_depth=2,
                 max_hlevel=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):
        # need to change max_depth and max_hlevel to None because DFS terminates when they drop below 0
        if max_depth == -1:
            max_depth = None
        self.max_depth = max_depth

        if max_hlevel == -1:
            max_hlevel = None
        self.max_hlevel = max_hlevel

        self.max_features = max_features

        self.allowed_paths = allowed_paths
        if self.allowed_paths:
            self.allowed_paths = set()
            for path in allowed_paths:
                self.allowed_paths.add(tuple(path))

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            assert target_entity_id not in ignore_entities,\
                "Can't ignore target_entity!"
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            for eid, vars in ignore_variables.items():
                self.ignore_variables[eid] = set(vars)
        self.target_entity_id = target_entity_id
        self.es = entityset

        if agg_primitives is None:
            agg_primitives = [
                ftypes.Sum, ftypes.Std, ftypes.Max, ftypes.Skew, ftypes.Min,
                ftypes.Mean, ftypes.Count, ftypes.PercentTrue, ftypes.NUnique,
                ftypes.Mode
            ]
        self.agg_primitives = []
        agg_prim_dict = ftypes.get_aggregation_primitives()
        for a in agg_primitives:
            if is_string(a):
                if a.lower() not in agg_prim_dict:
                    raise ValueError(
                        "Unknown aggregation primitive {}. ".format(a),
                        "Call ft.primitives.list_primitives() to get",
                        " a list of available primitives")
                a = agg_prim_dict[a.lower()]
            a = handle_primitive(a)
            self.agg_primitives.append(a)

        if trans_primitives is None:
            trans_primitives = [
                ftypes.Day, ftypes.Year, ftypes.Month, ftypes.Weekday,
                ftypes.Haversine, ftypes.NumWords, ftypes.NumCharacters
            ]  # ftypes.TimeSince
        self.trans_primitives = []
        trans_prim_dict = ftypes.get_transform_primitives()
        for t in trans_primitives:
            if is_string(t):
                if t.lower() not in trans_prim_dict:
                    raise ValueError(
                        "Unknown transform primitive {}. ".format(t),
                        "Call ft.primitives.list_primitives() to get",
                        " a list of available primitives")
                t = trans_prim_dict[t.lower()]
            t = handle_primitive(t)
            self.trans_primitives.append(t)

        if where_primitives is None:
            where_primitives = [ftypes.Count]
        self.where_primitives = []
        for p in where_primitives:
            if is_string(p):
                prim_obj = agg_prim_dict.get(p.lower(), None)
                if prim_obj is None:
                    raise ValueError(
                        "Unknown where primitive {}. ".format(p),
                        "Call ft.primitives.list_primitives() to get",
                        " a list of available primitives")
                p = prim_obj
            p = handle_primitive(p)
            self.where_primitives.append(p)

        self.seed_features = seed_features or []
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
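
A hedged construction sketch for the class whose __init__ is shown above (presumably an early DeepFeatureSynthesis); the class name and the pre-existing EntitySet es are assumptions:

dfs_builder = DeepFeatureSynthesis(
    target_entity_id="customers",
    entityset=es,                             # assumes es contains a "customers" entity
    agg_primitives=["sum", "mean"],           # strings resolved via get_aggregation_primitives()
    trans_primitives=[ftypes.Day, "month"],   # classes and strings can be mixed
    max_depth=2,
)
# agg_primitives=["not_a_primitive"] would raise ValueError("Unknown aggregation primitive ...")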
Example #17
    def normalize_entity(self,
                         base_entity_id,
                         new_entity_id,
                         index,
                         additional_variables=None,
                         copy_variables=None,
                         make_time_index=None,
                         make_secondary_time_index=None,
                         new_entity_time_index=None,
                         new_entity_secondary_time_index=None):
        """Create a new entity and relationship from unique values of an existing variable.

        Args:
            base_entity_id (str) : Entity id from which to split.

            new_entity_id (str): Id of the new entity.

            index (str): Variable in old entity
                that will become index of new entity. Relationship
                will be created across this variable.

            additional_variables (list[str]):
                List of variable ids to remove from
                base_entity and move to new entity.

            copy_variables (list[str]): List of
                variable ids to copy from old entity
                and move to new entity.

            make_time_index (bool or str, optional): Create time index for new entity based
                on time index in base_entity, optionally specifying which variable in base_entity
                to use for time_index. If specified as True without a specific variable,
                uses the primary time index. Defaults to True if base entity has a time index.

            make_secondary_time_index (dict[str -> list[str]], optional): Create a secondary time index
                from key. Values of dictionary
                are the variables to associate with the secondary time index. Only one
                secondary time index is allowed. If None, only associate the time index.

            new_entity_time_index (str, optional): Rename new entity time index.

            new_entity_secondary_time_index (str, optional): Rename new entity secondary time index.

        """
        base_entity = self.entity_dict[base_entity_id]
        additional_variables = additional_variables or []
        copy_variables = copy_variables or []

        if not isinstance(additional_variables, list):
            raise TypeError(
                "'additional_variables' must be a list, but received type {}".
                format(type(additional_variables)))

        if len(additional_variables) != len(set(additional_variables)):
            raise ValueError(
                "'additional_variables' contains duplicate variables. All variables must be unique."
            )

        if not isinstance(copy_variables, list):
            raise TypeError(
                "'copy_variables' must be a list, but received type {}".format(
                    type(copy_variables)))

        if len(copy_variables) != len(set(copy_variables)):
            raise ValueError(
                "'copy_variables' contains duplicate variables. All variables must be unique."
            )

        for v in additional_variables + copy_variables:
            if v == index:
                raise ValueError(
                    "Not copying {} as both index and variable".format(v))
        if is_string(make_time_index):
            if make_time_index not in base_entity.df.columns:
                raise ValueError(
                    "'make_time_index' must be a variable in the base entity")
            elif make_time_index not in additional_variables + copy_variables:
                raise ValueError(
                    "'make_time_index' must be specified in 'additional_variables' or 'copy_variables'"
                )
        if index == base_entity.index:
            raise ValueError(
                "'index' must be different from the index column of the base entity"
            )

        transfer_types = {}
        transfer_types[index] = type(base_entity[index])
        for v in additional_variables + copy_variables:
            transfer_types[v] = type(base_entity[v])

        # create and add new entity
        new_entity_df = self[base_entity_id].df.copy()

        if make_time_index is None and base_entity.time_index is not None:
            make_time_index = True

        if isinstance(make_time_index, str):
            # Set the new time index to make_time_index.
            base_time_index = make_time_index
            new_entity_time_index = make_time_index
            already_sorted = (new_entity_time_index == base_entity.time_index)
        elif make_time_index:
            # Create a new time index based on the base entity time index.
            base_time_index = base_entity.time_index
            if new_entity_time_index is None:
                new_entity_time_index = "first_%s_time" % (base_entity.id)

            already_sorted = True

            assert base_entity.time_index is not None, \
                "Base entity doesn't have time_index defined"

            if base_time_index not in additional_variables:
                copy_variables.append(base_time_index)

            transfer_types[new_entity_time_index] = type(
                base_entity[base_entity.time_index])
        else:
            new_entity_time_index = None
            already_sorted = False

        if new_entity_time_index is not None and new_entity_time_index == index:
            raise ValueError(
                "time_index and index cannot be the same value, %s" %
                (new_entity_time_index))

        selected_variables = [index] + additional_variables + copy_variables

        new_entity_df2 = new_entity_df. \
            drop_duplicates(index, keep='first')[selected_variables]

        if make_time_index:
            new_entity_df2.rename(
                columns={base_time_index: new_entity_time_index}, inplace=True)
        if make_secondary_time_index:
            assert len(make_secondary_time_index) == 1, \
                "Can only provide 1 secondary time index"
            secondary_time_index = list(make_secondary_time_index.keys())[0]

            secondary_variables = [index, secondary_time_index] + list(
                make_secondary_time_index.values())[0]
            secondary_df = new_entity_df. \
                drop_duplicates(index, keep='last')[secondary_variables]
            if new_entity_secondary_time_index:
                secondary_df.rename(columns={
                    secondary_time_index:
                    new_entity_secondary_time_index
                },
                                    inplace=True)
                secondary_time_index = new_entity_secondary_time_index
            else:
                new_entity_secondary_time_index = secondary_time_index
            secondary_df.set_index(index, inplace=True)
            new_entity_df = new_entity_df2.join(secondary_df, on=index)
        else:
            new_entity_df = new_entity_df2

        base_entity_index = index

        transfer_types[index] = vtypes.Categorical
        if make_secondary_time_index:
            old_ti_name = list(make_secondary_time_index.keys())[0]
            ti_cols = list(make_secondary_time_index.values())[0]
            ti_cols = [
                c if c != old_ti_name else secondary_time_index
                for c in ti_cols
            ]
            make_secondary_time_index = {secondary_time_index: ti_cols}

        self.entity_from_dataframe(
            new_entity_id,
            new_entity_df,
            index,
            already_sorted=already_sorted,
            time_index=new_entity_time_index,
            secondary_time_index=make_secondary_time_index,
            variable_types=transfer_types)

        self.entity_dict[base_entity_id].delete_variables(additional_variables)

        new_entity = self.entity_dict[new_entity_id]
        base_entity.convert_variable_type(base_entity_index,
                                          vtypes.Id,
                                          convert_data=False)
        self.add_relationship(
            Relationship(new_entity[index], base_entity[base_entity_index]))
        self.reset_data_description()
        return self
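
A hedged usage sketch for normalize_entity above, using the pre-1.0 featuretools EntitySet API; the entity and variable names are illustrative:

import pandas as pd
import featuretools as ft

events_df = pd.DataFrame({
    "event_id": [0, 1, 2, 3],
    "session_id": [1, 1, 2, 2],
    "device": ["mobile", "mobile", "desktop", "desktop"],
    "timestamp": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"]),
})

es = ft.EntitySet(id="clicks")
es.entity_from_dataframe(entity_id="events", dataframe=events_df,
                         index="event_id", time_index="timestamp")

# Split out a "sessions" entity keyed on session_id; "device" moves to the new entity,
# and a "first_events_time" time index is created because make_time_index defaults to True
# when the base entity has a time index.
es.normalize_entity(base_entity_id="events",
                    new_entity_id="sessions",
                    index="session_id",
                    additional_variables=["device"])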
Example #18
    def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 groupby_trans_primitives=None,
                 max_depth=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):

        if target_entity_id not in entityset.entity_dict:
            es_name = entityset.id or 'entity set'
            msg = 'Provided target entity %s does not exist in %s' % (
                target_entity_id, es_name)
            raise KeyError(msg)

        # need to change max_depth to None because DFS terminates when it drops below 0
        if max_depth == -1:
            max_depth = None
        self.max_depth = max_depth

        self.max_features = max_features

        self.allowed_paths = allowed_paths
        if self.allowed_paths:
            self.allowed_paths = set()
            for path in allowed_paths:
                self.allowed_paths.add(tuple(path))

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            assert target_entity_id not in ignore_entities,\
                "Can't ignore target_entity!"
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            for eid, vars in ignore_variables.items():
                self.ignore_variables[eid] = set(vars)
        self.target_entity_id = target_entity_id
        self.es = entityset

        if agg_primitives is None:
            agg_primitives = [
                primitives.Sum, primitives.Std, primitives.Max,
                primitives.Skew, primitives.Min, primitives.Mean,
                primitives.Count, primitives.PercentTrue, primitives.NumUnique,
                primitives.Mode
            ]
        self.agg_primitives = []
        agg_prim_dict = primitives.get_aggregation_primitives()
        for a in agg_primitives:
            if is_string(a):
                if a.lower() not in agg_prim_dict:
                    raise ValueError(
                        "Unknown aggregation primitive {}. ".format(a),
                        "Call ft.primitives.list_primitives() to get",
                        " a list of available primitives")
                a = agg_prim_dict[a.lower()]
            a = handle_primitive(a)
            if not isinstance(a, AggregationPrimitive):
                raise ValueError("Primitive {} in agg_primitives is not an "
                                 "aggregation primitive".format(type(a)))
            self.agg_primitives.append(a)

        if trans_primitives is None:
            trans_primitives = [
                primitives.Day, primitives.Year, primitives.Month,
                primitives.Weekday, primitives.Haversine, primitives.NumWords,
                primitives.NumCharacters
            ]  # primitives.TimeSince
        self.trans_primitives = []
        for t in trans_primitives:
            t = check_trans_primitive(t)
            self.trans_primitives.append(t)

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = []
        for p in where_primitives:
            if is_string(p):
                prim_obj = agg_prim_dict.get(p.lower(), None)
                if prim_obj is None:
                    raise ValueError(
                        "Unknown where primitive {}. ".format(p),
                        "Call ft.primitives.list_primitives() to get",
                        " a list of available primitives")
                p = prim_obj
            p = handle_primitive(p)
            self.where_primitives.append(p)

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = []
        for p in groupby_trans_primitives:
            p = check_trans_primitive(p)
            self.groupby_trans_primitives.append(p)

        if primitive_options is None:
            primitive_options = {}
        all_primitives = self.trans_primitives + self.agg_primitives + \
            self.where_primitives + self.groupby_trans_primitives
        self.primitive_options, self.ignore_entities =\
            generate_all_primitive_options(all_primitives,
                                           primitive_options,
                                           self.ignore_entities,
                                           self.ignore_variables,
                                           self.es)

        self.seed_features = seed_features or []
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
Example #19
    def __init__(self, id, df, entityset, variable_types=None,
                 index=None, time_index=None, secondary_time_index=None,
                 last_time_index=None, encoding=None,
                 already_sorted=False, created_index=None, verbose=False):
        """ Create Entity

        Args:
            id (str): Id of Entity.
            df (pd.DataFrame): Dataframe providing the data for the
                entity.
            entityset (EntitySet): Entityset for this Entity.
            variable_types (dict[str -> type or tuple]) : Optional mapping of
                variable ids to variable types (:class:`.Variable` subclasses),
                or to a (type, kwargs) tuple, used to override inferred types.
            index (str): Name of id column in the dataframe.
            time_index (str): Name of time column in the dataframe.
            secondary_time_index (dict[str -> str]): Dictionary mapping columns
                in the dataframe to the time index column they are associated with.
            last_time_index (pd.Series): Time index of the last event for each
                instance across all child entities.
            encoding (str, optional) : If None, will use 'ascii'. Another option is 'utf-8',
                or any encoding supported by pandas.

        """
        assert is_string(id), "Entity id must be a string"
        assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
        self.data = {"df": df,
                     "last_time_index": last_time_index,
                     }
        self.encoding = encoding
        self._verbose = verbose
        self.created_index = created_index
        self.convert_all_variable_data(variable_types)
        self.id = id
        self.entityset = entityset
        variable_types = variable_types or {}
        self.index = index
        self.time_index = time_index
        self.secondary_time_index = secondary_time_index or {}
        # make sure time index is actually in the columns
        for ti, cols in self.secondary_time_index.items():
            if ti not in cols:
                cols.append(ti)

        relationships = [r for r in entityset.relationships
                         if r.parent_entity.id == id or
                         r.child_entity.id == id]

        link_vars = [v.id for rel in relationships for v in [rel.parent_variable, rel.child_variable]
                     if v.entity.id == self.id]

        inferred_variable_types = self.infer_variable_types(ignore=list(variable_types.keys()),
                                                            link_vars=link_vars)
        for var_id, desired_type in variable_types.items():
            if isinstance(desired_type, tuple):
                desired_type = desired_type[0]
            inferred_variable_types.update({var_id: desired_type})

        self.variables = []
        for v in inferred_variable_types:
            # TODO document how vtype can be tuple
            vtype = inferred_variable_types[v]
            if isinstance(vtype, tuple):
                # vtype is (ft.Variable, dict_of_kwargs)
                _v = vtype[0](v, self, **vtype[1])
            else:
                _v = inferred_variable_types[v](v, self)
            self.variables += [_v]

        # do one last conversion of data once we've inferred
        self.convert_all_variable_data(inferred_variable_types)

        # make sure index is at the beginning
        index_variable = [v for v in self.variables
                          if v.id == self.index][0]
        self.variables = [index_variable] + [v for v in self.variables
                                             if v.id != self.index]
        self.update_data(df=self.df,
                         already_sorted=already_sorted,
                         recalculate_last_time_indexes=False)
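
A hedged sketch of the two variable_types forms the constructor above accepts (a plain Variable subclass, or a (class, kwargs) tuple per the TODO comment); name is the only keyword grounded in the Variable constructor shown earlier, and entity_from_dataframe is the usual entry point rather than constructing Entity directly:

import featuretools.variable_types as vtypes

variable_types = {
    "region": vtypes.Categorical,                                    # plain Variable subclass
    "region_label": (vtypes.Categorical, {"name": "Sales region"}),  # (class, kwargs) tuple form
}
# Typically passed via EntitySet.entity_from_dataframe(..., variable_types=variable_types),
# which forwards it to the Entity constructor shown above.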
Example #20
    def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 max_depth=2,
                 max_hlevel=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):
        # need to change max_depth and max_hlevel to None because DFS terminates when they drop below 0
        if max_depth == -1:
            max_depth = None
        self.max_depth = max_depth

        if max_hlevel == -1:
            max_hlevel = None
        self.max_hlevel = max_hlevel

        self.max_features = max_features

        self.allowed_paths = allowed_paths
        if self.allowed_paths:
            self.allowed_paths = set()
            for path in allowed_paths:
                self.allowed_paths.add(tuple(path))

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            assert target_entity_id not in ignore_entities,\
                "Can't ignore target_entity!"
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            for eid, vars in ignore_variables.items():
                self.ignore_variables[eid] = set(vars)
        self.target_entity_id = target_entity_id
        self.es = entityset

        if agg_primitives is None:
            agg_primitives = [ftypes.Sum, ftypes.Std, ftypes.Max, ftypes.Skew,
                              ftypes.Min, ftypes.Mean, ftypes.Count,
                              ftypes.PercentTrue, ftypes.NUnique, ftypes.Mode]
        self.agg_primitives = []
        agg_prim_dict = ftypes.get_aggregation_primitives()
        for a in agg_primitives:
            if is_string(a):
                if a.lower() not in agg_prim_dict:
                    raise ValueError("Unknown aggregation primitive {}. ".format(a),
                                     "Call ft.primitives.list_primitives() to get",
                                     " a list of available primitives")
                a = agg_prim_dict[a.lower()]

            self.agg_primitives.append(a)

        if trans_primitives is None:
            trans_primitives = [ftypes.Day, ftypes.Year, ftypes.Month,
                                ftypes.Weekday, ftypes.Haversine,
                                ftypes.NumWords, ftypes.NumCharacters]  # ftypes.TimeSince
        self.trans_primitives = []
        trans_prim_dict = ftypes.get_transform_primitives()
        for t in trans_primitives:
            if is_string(t):
                if t.lower() not in trans_prim_dict:
                    raise ValueError("Unknown transform primitive {}. ".format(t),
                                     "Call ft.primitives.list_primitives() to get",
                                     " a list of available primitives")
                t = trans_prim_dict[t.lower()]

            self.trans_primitives.append(t)

        if where_primitives is None:
            where_primitives = [ftypes.Count]
        self.where_primitives = []
        for p in where_primitives:
            if is_string(p):
                prim_obj = agg_prim_dict.get(p.lower(), None)
                if prim_obj is None:
                    raise ValueError("Unknown where primitive {}. ".format(p),
                                     "Call ft.primitives.list_primitives() to get",
                                     " a list of available primitives")
                p = prim_obj

            self.where_primitives.append(p)

        self.seed_features = seed_features or []
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
Example #21
def _check_timedelta(td, entity_id=None, related_entity_id=None):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'
    When using generic units, can drop the unit
    1
    2
    '1'
    '2'
    When using observations, need to provide an entity as either a tuple or a separate arg
    ('2o', 'logs')
    ('2 o', 'logs')
    ('2 Observations', 'logs')
    ('2 observations', 'logs')
    ('2 observation', 'logs')
    If an entity is provided and no unit is provided, assume observations (instead of generic units)
    (2, 'logs')
    ('2', 'logs')
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        if td.entity is not None and entity_id is not None and td.entity != entity_id:
            raise ValueError("Timedelta entity {} different from passed entity {}".format(td.entity, entity_id))
        if td.entity is not None and related_entity_id is not None and td.entity == related_entity_id:
            raise ValueError("Timedelta entity {} same as passed related entity {}".format(td.entity, related_entity_id))
        return td
    elif not (is_string(td) or isinstance(td, (tuple, int, float))):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    # TODO: allow observations from an entity in string

    if isinstance(td, tuple):
        if entity_id is None:
            entity_id = td[1]
        td = td[0]

    value = None
    try:
        value = int(td)
    except Exception:
        try:
            value = float(td)
        except Exception:
            pass
    if value is not None and entity_id is not None:
        unit = 'o'
    elif value is not None:
        unit = 'u'
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from ".format(value) +
                                 "timedelta string: {}".format(td))
    return Timedelta(value, unit, entity=entity_id)
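
A hedged sketch of the observation and generic-unit forms documented above, assuming _check_timedelta is in scope as a featuretools internal:

_check_timedelta("2 Minutes")               # -> Timedelta(2, 'Minutes')
_check_timedelta(2)                         # bare number, no entity -> generic units ('u')
_check_timedelta(("2o", "logs"))            # tuple form: 2 observations of the "logs" entity
_check_timedelta((2, "logs"))               # number plus entity -> observations assumed
_check_timedelta("2 m", entity_id="logs")   # entity passed as a separate argument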
Example #22
def _check_timedelta(td, entity_id=None, related_entity_id=None):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'
    When using generic units, can drop the unit
    1
    2
    '1'
    '2'
    When using observations, need to provide an entity as either a tuple or a separate arg
    ('2o', 'logs')
    ('2 o', 'logs')
    ('2 Observations', 'logs')
    ('2 observations', 'logs')
    ('2 observation', 'logs')
    If an entity is provided and no unit is provided, assume observations (instead of generic units)
    (2, 'logs')
    ('2', 'logs')
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        if td.entity is not None and entity_id is not None and td.entity != entity_id:
            raise ValueError("Timedelta entity {} different from passed entity {}".format(td.entity, entity_id))
        if td.entity is not None and related_entity_id is not None and td.entity == related_entity_id:
            raise ValueError("Timedelta entity {} same as passed related entity {}".format(td.entity, related_entity_id))
        return td
    elif not (is_string(td) or isinstance(td, (tuple, int, float))):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    # TODO: allow observations from an entity in string

    if isinstance(td, tuple):
        if entity_id is None:
            entity_id = td[1]
        td = td[0]

    value = None
    try:
        value = int(td)
    except Exception:
        try:
            value = float(td)
        except Exception:
            pass
    if value is not None and entity_id is not None:
        unit = 'o'
    elif value is not None:
        unit = 'u'
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from ".format(value) +
                                 "timedelta string: {}".format(td))
    return Timedelta(value, unit, entity=entity_id)