def _validate_entity_params(id, df, time_index):
    '''Validation checks for Entity inputs'''
    assert is_string(id), "Entity id must be a string"
    assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
    for c in df.columns:
        if not is_string(c):
            raise ValueError("All column names must be strings (Column {} "
                             "is not a string)".format(c))
    if time_index is not None and time_index not in df.columns:
        raise LookupError('Time index not found in dataframe')
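# Usage sketch (illustrative, not part of the library): a well-formed
# dataframe passes silently; a time_index missing from the columns raises.
import pandas as pd

_example_df = pd.DataFrame({"id": [1, 2], "time": [0, 1]})
_validate_entity_params("events", _example_df, time_index="time")  # ok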
def _check_timedelta(td):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of
    capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'

    If a pd.Timedelta object is passed, units will be converted to seconds
    due to the underlying representation of pd.Timedelta.

    If a pd.DateOffset object is passed, it will be converted to a
    Featuretools Timedelta if it has one temporal parameter. Otherwise, it
    will remain a pd.DateOffset.
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        return td
    elif not (is_string(td) or
              isinstance(td, pd.Timedelta) or
              isinstance(td, (int, float)) or
              isinstance(td, pd.DateOffset)):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    if isinstance(td, pd.Timedelta):
        unit = 's'
        value = td.total_seconds()
        times = {unit: value}
        return Timedelta(times, delta_obj=td)
    elif isinstance(td, pd.DateOffset):
        # DateOffsets
        if td.__class__.__name__ == "DateOffset":
            times = dict()
            for td_unit, td_value in td.kwds.items():
                times[td_unit] = td_value
            return Timedelta(times, delta_obj=td)
        # Special offsets (such as BDay)
        else:
            unit = td.__class__.__name__
            value = td.__dict__['n']
            times = {unit: value}
            return Timedelta(times, delta_obj=td)
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        if match is None:
            raise ValueError("Unable to parse timedelta: {}".format(td))
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from "
                                 "timedelta string: {}".format(value, td))
        times = {unit: value}
        return Timedelta(times)
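# Usage sketch for the parsing rules documented above (illustrative; assumes
# this module's `re`, `pd`, and `Timedelta` imports are in scope):
_check_timedelta("2 Minutes")              # longform, any capitalization
_check_timedelta("2m")                     # shortform with the space dropped
_check_timedelta(pd.Timedelta("1 day"))    # stored as 86400.0 seconds
_check_timedelta(pd.DateOffset(months=3))  # single temporal parameter kept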
def __init__(self, value, unit=None):
    """
    Args:
        value (float, str) : Value of timedelta, or string providing
            both unit and value.
        unit (str) : Unit of time delta.
    """
    # TODO: check if value is int or float
    if is_string(value):
        from featuretools.utils.wrangle import _check_timedelta
        td = _check_timedelta(value)
        value, unit = td.value, td.unit

    self.value = value
    # alerts get_name that although the unit was converted to 'd',
    # it was initially a different unit (e.g. weeks)
    self._original_unit = None

    unit = self._check_unit_plural(unit)
    assert unit in self._readable_units or unit in self._readable_to_unit
    if unit in self._readable_to_unit:
        unit = self._readable_to_unit[unit]
    # convert weeks to days
    if unit in self._convert_to_days:
        self._original_unit = unit
        self.value = self.value * self._convert_to_days[unit]
        unit = 'd'

    self.unit = unit
    self.delta_obj = self.get_unit_type()
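# Usage sketch (illustrative): unit strings are normalized, and units in
# _convert_to_days (e.g. weeks) are rewritten as days while the original
# unit is remembered for get_name.
td = Timedelta("2 weeks")
# Assuming weeks convert at 7 days each: td.unit == 'd', td.value == 14,
# and td._original_unit records the pre-conversion unit.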
def __init__(self, id, entity, name=None):
    assert is_string(id), "Variable id must be a string"
    self.id = id
    self._name = name
    self.entity_id = entity.id
    assert entity.entityset is not None, "Entity must contain reference to EntitySet"
    self.entity = entity
    self._interesting_values = None
def ensure_compatible_dtype(left, right):
    # Pandas converts a column's dtype to float if it is all NaNs. If the
    # actual values are strings/objects, future features that depend on
    # these values may error unless we explicitly set the dtype to object.
    if isinstance(left, pd.Series) and isinstance(right, pd.Series):
        if left.dtype != object and right.dtype == object:
            left = left.astype(object)
        elif right.dtype != object and left.dtype == object:
            right = right.astype(object)
    elif isinstance(left, pd.Series):
        if left.dtype != object and is_string(right):
            left = left.astype(object)
    elif isinstance(right, pd.Series):
        if right.dtype != object and is_string(left):
            right = right.astype(object)
    return left, right
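# Usage sketch (illustrative): pandas stores an all-NaN column as float64,
# so comparing it against string data needs an explicit object dtype.
import numpy as np
import pandas as pd

left = pd.Series([np.nan, np.nan])   # dtype float64, values are "missing strings"
right = pd.Series(["a", "b"])        # dtype object
left, right = ensure_compatible_dtype(left, right)
assert left.dtype == object and right.dtype == object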
def check_value(self, value, unit):
    if is_string(value):
        from featuretools.utils.wrangle import _check_timedelta
        td = _check_timedelta(value)
        self.times = td.times
    elif isinstance(value, dict):
        self.times = value
    else:
        self.times = {unit: value}
def _check_timedelta(td, entity_id=None, related_entity_id=None):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of
    capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'

    If a pd.Timedelta object is passed, units will be converted to seconds
    due to the underlying representation of pd.Timedelta.
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        return td
    elif not (is_string(td) or
              isinstance(td, pd.Timedelta) or
              isinstance(td, (int, float))):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    value = None
    try:
        value = int(td)
    except Exception:
        try:
            value = float(td)
        except Exception:
            pass

    if isinstance(td, pd.Timedelta):
        unit = 's'
        value = td.total_seconds()
    elif value is not None:
        # bare numbers are treated as generic units
        unit = 'u'
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        if match is None:
            raise ValueError("Unable to parse timedelta: {}".format(td))
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from "
                                 "timedelta string: {}".format(value, td))
    return Timedelta(value, unit)
def check_trans_primitive(primitive):
    trans_prim_dict = primitives.get_transform_primitives()

    if is_string(primitive):
        if primitive.lower() not in trans_prim_dict:
            raise ValueError("Unknown transform primitive {}. "
                             "Call ft.primitives.list_primitives() to get "
                             "a list of available primitives".format(primitive))
        primitive = trans_prim_dict[primitive.lower()]
    primitive = handle_primitive(primitive)
    if not isinstance(primitive, TransformPrimitive):
        raise ValueError("Primitive {} in trans_primitives or "
                         "groupby_trans_primitives is not a transform "
                         "primitive".format(type(primitive)))
    return primitive
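# Usage sketch (illustrative): string names resolve case-insensitively
# through the transform-primitive registry; a non-transform primitive
# raises ValueError.
prim = check_trans_primitive("absolute")  # assumes "absolute" is registered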
def __init__(self, value, unit=None, entity=None, data=None, inclusive=False):
    """
    Args:
        value (float, str) : Value of timedelta, or string providing
            both unit and value.
        unit (str) : Unit of time delta.
        entity (str, optional) : Entity id to use if unit equals
            "observations".
        data (pd.Series, optional) : Series of timestamps to use with
            observations. Can be calculated later.
        inclusive (bool, optional) : If True, include events that are
            exactly timedelta distance away from the original
            time/observation.
    """
    # TODO: check if value is int or float
    if is_string(value):
        from featuretools.utils.wrangle import _check_timedelta
        td = _check_timedelta(value)
        value, unit = td.value, td.unit

    self.value = value
    # alerts get_name that although the unit was converted to 'd',
    # it was initially a different unit (e.g. weeks)
    self._original_unit = None

    unit = self._check_unit_plural(unit)
    assert unit in self._readable_units or unit in self._readable_to_unit
    if unit in self._readable_to_unit:
        unit = self._readable_to_unit[unit]
    # convert weeks to days
    if unit in self._convert_to_days:
        self._original_unit = unit
        self.value = self.value * self._convert_to_days[unit]
        unit = 'd'

    self.unit = unit
    if unit == self._Observations and entity is None:
        raise Exception("Must define entity to use %s as unit" % (unit))
    self.entity = entity
    self.data = data
    self.inclusive = inclusive
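# Usage sketch (illustrative): observation-based windows must name the
# entity whose rows are being counted (the entity id here is hypothetical).
window = Timedelta(2, "observations", entity="logs")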
def __init__(self, base_feature, group_feature, time_index=None,
             where=None, use_previous=None):
    """Summary

    Args:
        agg_feature (type): Subclass of :class:`.AggregationPrimitive`;
            the aggregation method being used. This is passed by the
            constructors of the CumFeature subclasses.
        base_feature (:class:`.PrimitiveBase` or :class:`.Variable`):
            Feature or variable calculated on.
        group_feature (:class:`.PrimitiveBase` or :class:`.Variable`):
            Feature or variable used to group the rows before computation.
        where (optional[:class:`.PrimitiveBase`]):
        use_previous (optional[:class:`.Timedelta`]):
    """
    self.return_type = self.agg_feature.return_type

    base_feature = self._check_feature(base_feature)

    td_entity_id = None
    if is_string(use_previous):
        td_entity_id = base_feature.entity.id
    self.use_previous = _check_timedelta(use_previous,
                                         entity_id=td_entity_id)

    group_feature = self._check_feature(group_feature)
    self.group_feature = group_feature

    self.base_features = [base_feature, group_feature]

    if time_index is None:
        entity = base_feature.entity
        time_index = IdentityFeature(entity[entity.time_index])
    self.base_features += [time_index]

    if where is not None:
        self.where = where

    super(CumFeature, self).__init__(*self.base_features)
def __init__(self, target_entity_id, entityset,
             agg_primitives=None,
             trans_primitives=None,
             where_primitives=None,
             max_depth=2,
             max_hlevel=2,
             max_features=-1,
             allowed_paths=None,
             ignore_entities=None,
             ignore_variables=None,
             seed_features=None,
             drop_contains=None,
             drop_exact=None,
             where_stacking_limit=1):
    # change max_depth and max_hlevel to None because DFS terminates when < 0
    if max_depth == -1:
        max_depth = None
    self.max_depth = max_depth
    if max_hlevel == -1:
        max_hlevel = None
    self.max_hlevel = max_hlevel

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = set()
        for path in allowed_paths:
            self.allowed_paths.add(tuple(path))

    if ignore_entities is None:
        self.ignore_entities = set()
    else:
        if not isinstance(ignore_entities, list):
            raise TypeError('ignore_entities must be a list')
        assert target_entity_id not in ignore_entities, \
            "Can't ignore target_entity!"
        self.ignore_entities = set(ignore_entities)

    self.ignore_variables = defaultdict(set)
    if ignore_variables is not None:
        for eid, vars in ignore_variables.items():
            self.ignore_variables[eid] = set(vars)
    self.target_entity_id = target_entity_id
    self.es = entityset

    if agg_primitives is None:
        agg_primitives = [ftypes.Sum, ftypes.Std, ftypes.Max, ftypes.Skew,
                          ftypes.Min, ftypes.Mean, ftypes.Count,
                          ftypes.PercentTrue, ftypes.NUnique, ftypes.Mode]
    self.agg_primitives = []
    agg_prim_dict = ftypes.get_aggregation_primitives()
    for a in agg_primitives:
        if is_string(a):
            if a.lower() not in agg_prim_dict:
                raise ValueError("Unknown aggregation primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(a))
            a = agg_prim_dict[a.lower()]
        a = handle_primitive(a)
        self.agg_primitives.append(a)

    if trans_primitives is None:
        trans_primitives = [ftypes.Day, ftypes.Year, ftypes.Month,
                            ftypes.Weekday, ftypes.Haversine,
                            ftypes.NumWords, ftypes.NumCharacters]  # ftypes.TimeSince
    self.trans_primitives = []
    trans_prim_dict = ftypes.get_transform_primitives()
    for t in trans_primitives:
        if is_string(t):
            if t.lower() not in trans_prim_dict:
                raise ValueError("Unknown transform primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(t))
            t = trans_prim_dict[t.lower()]
        t = handle_primitive(t)
        self.trans_primitives.append(t)

    if where_primitives is None:
        where_primitives = [ftypes.Count]
    self.where_primitives = []
    for p in where_primitives:
        if is_string(p):
            prim_obj = agg_prim_dict.get(p.lower(), None)
            if prim_obj is None:
                raise ValueError("Unknown where primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(p))
            p = prim_obj
        p = handle_primitive(p)
        self.where_primitives.append(p)

    self.seed_features = seed_features or []
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
def normalize_entity(self, base_entity_id, new_entity_id, index,
                     additional_variables=None, copy_variables=None,
                     make_time_index=None,
                     make_secondary_time_index=None,
                     new_entity_time_index=None,
                     new_entity_secondary_time_index=None):
    """Create a new entity and relationship from unique values of an existing variable.

    Args:
        base_entity_id (str) : Entity id from which to split.
        new_entity_id (str): Id of the new entity.
        index (str): Variable in old entity that will become index of new
            entity. Relationship will be created across this variable.
        additional_variables (list[str]): List of variable ids to remove
            from base_entity and move to new entity.
        copy_variables (list[str]): List of variable ids to copy from old
            entity and move to new entity.
        make_time_index (bool or str, optional): Create time index for new
            entity based on time index in base_entity, optionally specifying
            which variable in base_entity to use for time_index. If specified
            as True without a specific variable, uses the primary time index.
            Defaults to True if base entity has a time index.
        make_secondary_time_index (dict[str -> list[str]], optional): Create
            a secondary time index from key. Values of dictionary are the
            variables to associate with the secondary time index. Only one
            secondary time index is allowed. If None, only associate the
            time index.
        new_entity_time_index (str, optional): Rename new entity time index.
        new_entity_secondary_time_index (str, optional): Rename new entity
            secondary time index.
    """
    base_entity = self.entity_dict[base_entity_id]
    additional_variables = additional_variables or []
    copy_variables = copy_variables or []

    if not isinstance(additional_variables, list):
        raise TypeError("'additional_variables' must be a list, but received type {}"
                        .format(type(additional_variables)))

    if len(additional_variables) != len(set(additional_variables)):
        raise ValueError("'additional_variables' contains duplicate variables. "
                         "All variables must be unique.")

    if not isinstance(copy_variables, list):
        raise TypeError("'copy_variables' must be a list, but received type {}"
                        .format(type(copy_variables)))

    if len(copy_variables) != len(set(copy_variables)):
        raise ValueError("'copy_variables' contains duplicate variables. "
                         "All variables must be unique.")

    for v in additional_variables + copy_variables:
        if v == index:
            raise ValueError("Not copying {} as both index and variable".format(v))

    if is_string(make_time_index):
        if make_time_index not in base_entity.df.columns:
            raise ValueError("'make_time_index' must be a variable in the base entity")
        elif make_time_index not in additional_variables + copy_variables:
            raise ValueError("'make_time_index' must be specified in "
                             "'additional_variables' or 'copy_variables'")
    if index == base_entity.index:
        raise ValueError("'index' must be different from the index column of the base entity")

    transfer_types = {}
    transfer_types[index] = type(base_entity[index])
    for v in additional_variables + copy_variables:
        transfer_types[v] = type(base_entity[v])

    # create and add new entity
    new_entity_df = self[base_entity_id].df.copy()

    if make_time_index is None and base_entity.time_index is not None:
        make_time_index = True

    if isinstance(make_time_index, str):
        # Set the new time index to make_time_index.
        base_time_index = make_time_index
        new_entity_time_index = make_time_index
        already_sorted = (new_entity_time_index == base_entity.time_index)
    elif make_time_index:
        # Create a new time index based on the base entity time index.
        base_time_index = base_entity.time_index
        if new_entity_time_index is None:
            new_entity_time_index = "first_%s_time" % (base_entity.id)

        already_sorted = True

        assert base_entity.time_index is not None, \
            "Base entity doesn't have time_index defined"

        if base_time_index not in [v for v in additional_variables]:
            copy_variables.append(base_time_index)

        transfer_types[new_entity_time_index] = type(base_entity[base_entity.time_index])
    else:
        new_entity_time_index = None
        already_sorted = False

    if new_entity_time_index is not None and new_entity_time_index == index:
        raise ValueError("time_index and index cannot be the same value, %s"
                         % (new_entity_time_index))

    selected_variables = [index] + \
        [v for v in additional_variables] + \
        [v for v in copy_variables]

    new_entity_df2 = new_entity_df \
        .drop_duplicates(index, keep='first')[selected_variables]

    if make_time_index:
        new_entity_df2.rename(columns={base_time_index: new_entity_time_index},
                              inplace=True)
    if make_secondary_time_index:
        assert len(make_secondary_time_index) == 1, \
            "Can only provide 1 secondary time index"
        secondary_time_index = list(make_secondary_time_index.keys())[0]

        secondary_variables = [index, secondary_time_index] + \
            list(make_secondary_time_index.values())[0]
        secondary_df = new_entity_df \
            .drop_duplicates(index, keep='last')[secondary_variables]
        if new_entity_secondary_time_index:
            secondary_df.rename(columns={secondary_time_index: new_entity_secondary_time_index},
                                inplace=True)
            secondary_time_index = new_entity_secondary_time_index
        else:
            new_entity_secondary_time_index = secondary_time_index
        secondary_df.set_index(index, inplace=True)
        new_entity_df = new_entity_df2.join(secondary_df, on=index)
    else:
        new_entity_df = new_entity_df2

    base_entity_index = index

    transfer_types[index] = vtypes.Categorical
    if make_secondary_time_index:
        old_ti_name = list(make_secondary_time_index.keys())[0]
        ti_cols = list(make_secondary_time_index.values())[0]
        ti_cols = [c if c != old_ti_name else secondary_time_index
                   for c in ti_cols]
        make_secondary_time_index = {secondary_time_index: ti_cols}

    self.entity_from_dataframe(
        new_entity_id,
        new_entity_df,
        index,
        already_sorted=already_sorted,
        time_index=new_entity_time_index,
        secondary_time_index=make_secondary_time_index,
        variable_types=transfer_types)

    self.entity_dict[base_entity_id].delete_variables(additional_variables)

    new_entity = self.entity_dict[new_entity_id]
    base_entity.convert_variable_type(base_entity_index, vtypes.Id, convert_data=False)
    self.add_relationship(Relationship(new_entity[index], base_entity[base_entity_index]))
    self.reset_data_description()
    return self
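# Usage sketch (illustrative; entity and column names are hypothetical):
# split a "sessions" entity on customer_id, creating a "customers" parent
# entity plus the relationship back to "sessions".
es = es.normalize_entity(base_entity_id="sessions",
                         new_entity_id="customers",
                         index="customer_id",
                         make_time_index=True)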
def __init__(self, target_entity_id, entityset,
             agg_primitives=None,
             trans_primitives=None,
             where_primitives=None,
             groupby_trans_primitives=None,
             max_depth=2,
             max_features=-1,
             allowed_paths=None,
             ignore_entities=None,
             ignore_variables=None,
             primitive_options=None,
             seed_features=None,
             drop_contains=None,
             drop_exact=None,
             where_stacking_limit=1):
    if target_entity_id not in entityset.entity_dict:
        es_name = entityset.id or 'entity set'
        msg = 'Provided target entity %s does not exist in %s' % (target_entity_id, es_name)
        raise KeyError(msg)

    # change max_depth to None because DFS terminates when < 0
    if max_depth == -1:
        max_depth = None
    self.max_depth = max_depth

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = set()
        for path in allowed_paths:
            self.allowed_paths.add(tuple(path))

    if ignore_entities is None:
        self.ignore_entities = set()
    else:
        if not isinstance(ignore_entities, list):
            raise TypeError('ignore_entities must be a list')
        assert target_entity_id not in ignore_entities, \
            "Can't ignore target_entity!"
        self.ignore_entities = set(ignore_entities)

    self.ignore_variables = defaultdict(set)
    if ignore_variables is not None:
        for eid, vars in ignore_variables.items():
            self.ignore_variables[eid] = set(vars)
    self.target_entity_id = target_entity_id
    self.es = entityset

    if agg_primitives is None:
        agg_primitives = [primitives.Sum, primitives.Std, primitives.Max,
                          primitives.Skew, primitives.Min, primitives.Mean,
                          primitives.Count, primitives.PercentTrue,
                          primitives.NumUnique, primitives.Mode]
    self.agg_primitives = []
    agg_prim_dict = primitives.get_aggregation_primitives()
    for a in agg_primitives:
        if is_string(a):
            if a.lower() not in agg_prim_dict:
                raise ValueError("Unknown aggregation primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(a))
            a = agg_prim_dict[a.lower()]
        a = handle_primitive(a)
        if not isinstance(a, AggregationPrimitive):
            raise ValueError("Primitive {} in agg_primitives is not an "
                             "aggregation primitive".format(type(a)))
        self.agg_primitives.append(a)

    if trans_primitives is None:
        trans_primitives = [primitives.Day, primitives.Year, primitives.Month,
                            primitives.Weekday, primitives.Haversine,
                            primitives.NumWords, primitives.NumCharacters]  # primitives.TimeSince
    self.trans_primitives = []
    for t in trans_primitives:
        t = check_trans_primitive(t)
        self.trans_primitives.append(t)

    if where_primitives is None:
        where_primitives = [primitives.Count]
    self.where_primitives = []
    for p in where_primitives:
        if is_string(p):
            prim_obj = agg_prim_dict.get(p.lower(), None)
            if prim_obj is None:
                raise ValueError("Unknown where primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(p))
            p = prim_obj
        p = handle_primitive(p)
        self.where_primitives.append(p)

    if groupby_trans_primitives is None:
        groupby_trans_primitives = []
    self.groupby_trans_primitives = []
    for p in groupby_trans_primitives:
        p = check_trans_primitive(p)
        self.groupby_trans_primitives.append(p)

    if primitive_options is None:
        primitive_options = {}
    all_primitives = self.trans_primitives + self.agg_primitives + \
        self.where_primitives + self.groupby_trans_primitives
    self.primitive_options, self.ignore_entities = \
        generate_all_primitive_options(all_primitives,
                                       primitive_options,
                                       self.ignore_entities,
                                       self.ignore_variables,
                                       self.es)
    self.seed_features = seed_features or []
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
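# Usage sketch (illustrative; `es` is an assumed EntitySet): primitives may
# be passed as classes or as lowercase string names, which are resolved via
# the registries checked above.
dfs = DeepFeatureSynthesis(target_entity_id="customers",
                           entityset=es,
                           agg_primitives=["sum", "mean"],
                           trans_primitives=["day"],
                           where_primitives=["count"])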
def __init__(self, id, df, entityset, variable_types=None,
             index=None, time_index=None, secondary_time_index=None,
             last_time_index=None, encoding=None, already_sorted=False,
             created_index=None, verbose=False):
    """ Create Entity

    Args:
        id (str): Id of Entity.
        df (pd.DataFrame): Dataframe providing the data for the
            entity.
        entityset (EntitySet): Entityset for this Entity.
        variable_types (dict[str -> dict[str -> type]]) : Optional mapping of
            entity_id to variable_types dict with which to initialize an
            entity's store. An entity's variable_types dict maps string
            variable ids to types (:class:`.Variable`).
        index (str): Name of id column in the dataframe.
        time_index (str): Name of time column in the dataframe.
        secondary_time_index (dict[str -> str]): Dictionary mapping columns
            in the dataframe to the time index column they are associated with.
        last_time_index (pd.Series): Time index of the last event for each
            instance across all child entities.
        encoding (str, optional) : If None, will use 'ascii'. Another option
            is 'utf-8', or any encoding supported by pandas.
    """
    assert is_string(id), "Entity id must be a string"
    assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
    self.data = {"df": df, "last_time_index": last_time_index}
    self.encoding = encoding
    self._verbose = verbose
    self.created_index = created_index
    self.convert_all_variable_data(variable_types)
    self.id = id
    self.entityset = entityset
    variable_types = variable_types or {}
    self.index = index
    self.time_index = time_index
    self.secondary_time_index = secondary_time_index or {}
    # make sure time index is actually in the columns
    for ti, cols in self.secondary_time_index.items():
        if ti not in cols:
            cols.append(ti)

    relationships = [r for r in entityset.relationships
                     if r.parent_entity.id == id or r.child_entity.id == id]
    link_vars = [v.id for rel in relationships
                 for v in [rel.parent_variable, rel.child_variable]
                 if v.entity.id == self.id]

    inferred_variable_types = self.infer_variable_types(ignore=list(variable_types.keys()),
                                                        link_vars=link_vars)
    for var_id, desired_type in variable_types.items():
        if isinstance(desired_type, tuple):
            desired_type = desired_type[0]
        inferred_variable_types.update({var_id: desired_type})

    self.variables = []
    for v in inferred_variable_types:
        # TODO document how vtype can be tuple
        vtype = inferred_variable_types[v]
        if isinstance(vtype, tuple):
            # vtype is (ft.Variable, dict_of_kwargs)
            _v = vtype[0](v, self, **vtype[1])
        else:
            _v = inferred_variable_types[v](v, self)
        self.variables += [_v]

    # do one last conversion of data once we've inferred
    self.convert_all_variable_data(inferred_variable_types)

    # make sure index is at the beginning
    index_variable = [v for v in self.variables if v.id == self.index][0]
    self.variables = [index_variable] + [v for v in self.variables
                                         if v.id != self.index]

    self.update_data(df=self.df,
                     already_sorted=already_sorted,
                     recalculate_last_time_indexes=False)
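# Usage sketch (illustrative): entities are normally built through the
# EntitySet API, which forwards these arguments to Entity.__init__
# (entity and column names here are hypothetical; `es` is an assumed
# EntitySet).
import pandas as pd

df = pd.DataFrame({"id": [1, 2],
                   "time": pd.to_datetime(["2019-01-01", "2019-01-02"])})
es = es.entity_from_dataframe(entity_id="events", dataframe=df,
                              index="id", time_index="time")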
def __init__(self, target_entity_id, entityset,
             agg_primitives=None,
             trans_primitives=None,
             where_primitives=None,
             max_depth=2,
             max_hlevel=2,
             max_features=-1,
             allowed_paths=None,
             ignore_entities=None,
             ignore_variables=None,
             seed_features=None,
             drop_contains=None,
             drop_exact=None,
             where_stacking_limit=1):
    # change max_depth and max_hlevel to None because DFS terminates when < 0
    if max_depth == -1:
        max_depth = None
    self.max_depth = max_depth
    if max_hlevel == -1:
        max_hlevel = None
    self.max_hlevel = max_hlevel

    self.max_features = max_features

    self.allowed_paths = allowed_paths
    if self.allowed_paths:
        self.allowed_paths = set()
        for path in allowed_paths:
            self.allowed_paths.add(tuple(path))

    if ignore_entities is None:
        self.ignore_entities = set()
    else:
        if not isinstance(ignore_entities, list):
            raise TypeError('ignore_entities must be a list')
        assert target_entity_id not in ignore_entities, \
            "Can't ignore target_entity!"
        self.ignore_entities = set(ignore_entities)

    self.ignore_variables = defaultdict(set)
    if ignore_variables is not None:
        for eid, vars in ignore_variables.items():
            self.ignore_variables[eid] = set(vars)
    self.target_entity_id = target_entity_id
    self.es = entityset

    if agg_primitives is None:
        agg_primitives = [ftypes.Sum, ftypes.Std, ftypes.Max, ftypes.Skew,
                          ftypes.Min, ftypes.Mean, ftypes.Count,
                          ftypes.PercentTrue, ftypes.NUnique, ftypes.Mode]
    self.agg_primitives = []
    agg_prim_dict = ftypes.get_aggregation_primitives()
    for a in agg_primitives:
        if is_string(a):
            if a.lower() not in agg_prim_dict:
                raise ValueError("Unknown aggregation primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(a))
            a = agg_prim_dict[a.lower()]
        self.agg_primitives.append(a)

    if trans_primitives is None:
        trans_primitives = [ftypes.Day, ftypes.Year, ftypes.Month,
                            ftypes.Weekday, ftypes.Haversine,
                            ftypes.NumWords, ftypes.NumCharacters]  # ftypes.TimeSince
    self.trans_primitives = []
    trans_prim_dict = ftypes.get_transform_primitives()
    for t in trans_primitives:
        if is_string(t):
            if t.lower() not in trans_prim_dict:
                raise ValueError("Unknown transform primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(t))
            t = trans_prim_dict[t.lower()]
        self.trans_primitives.append(t)

    if where_primitives is None:
        where_primitives = [ftypes.Count]
    self.where_primitives = []
    for p in where_primitives:
        if is_string(p):
            prim_obj = agg_prim_dict.get(p.lower(), None)
            if prim_obj is None:
                raise ValueError("Unknown where primitive {}. "
                                 "Call ft.primitives.list_primitives() to get "
                                 "a list of available primitives".format(p))
            p = prim_obj
        self.where_primitives.append(p)

    self.seed_features = seed_features or []
    self.drop_exact = drop_exact or []
    self.drop_contains = drop_contains or []
    self.where_stacking_limit = where_stacking_limit
def _check_timedelta(td, entity_id=None, related_entity_id=None):
    """
    Convert strings to Timedelta objects
    Allows for both shortform and longform units, as well as any form of
    capitalization
    '2 Minutes'
    '2 minutes'
    '2 m'
    '1 Minute'
    '1 minute'
    '1 m'
    '1 units'
    '1 Units'
    '1 u'
    Shortform is fine if space is dropped
    '2m'
    '1u'
    When using generic units, can drop the unit
    1
    2
    '1'
    '2'
    When using observations, need to provide an entity as either a tuple or
    a separate arg
    ('2o', 'logs')
    ('2 o', 'logs')
    ('2 Observations', 'logs')
    ('2 observations', 'logs')
    ('2 observation', 'logs')
    If an entity is provided and no unit is provided, assume observations
    (instead of generic units)
    (2, 'logs')
    ('2', 'logs')
    """
    if td is None:
        return td
    if isinstance(td, Timedelta):
        if td.entity is not None and entity_id is not None and td.entity != entity_id:
            raise ValueError("Timedelta entity {} different from passed entity {}"
                             .format(td.entity, entity_id))
        if td.entity is not None and related_entity_id is not None and td.entity == related_entity_id:
            raise ValueError("Timedelta entity {} same as passed related entity {}"
                             .format(td.entity, related_entity_id))
        return td
    elif not (is_string(td) or isinstance(td, (tuple, int, float))):
        raise ValueError("Unable to parse timedelta: {}".format(td))

    # TODO: allow observations from an entity in string
    if isinstance(td, tuple):
        if entity_id is None:
            entity_id = td[1]
        td = td[0]

    value = None
    try:
        value = int(td)
    except Exception:
        try:
            value = float(td)
        except Exception:
            pass

    if value is not None and entity_id is not None:
        unit = 'o'
    elif value is not None:
        unit = 'u'
    else:
        pattern = '([0-9]+) *([a-zA-Z]+)$'
        match = re.match(pattern, td)
        if match is None:
            raise ValueError("Unable to parse timedelta: {}".format(td))
        value, unit = match.groups()
        try:
            value = int(value)
        except Exception:
            try:
                value = float(value)
            except Exception:
                raise ValueError("Unable to parse value {} from "
                                 "timedelta string: {}".format(value, td))
    return Timedelta(value, unit, entity=entity_id)
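# Usage sketch for the tuple and bare-number forms documented above
# (illustrative; the entity id is hypothetical):
_check_timedelta(('2o', 'logs'))   # 2 observations of the "logs" entity
_check_timedelta((2, 'logs'))      # bare number + entity -> observations
_check_timedelta('2')              # bare number alone -> generic units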