Example #1
    def _handle_time(self, entity_id, df, time_last=None, training_window=None, include_cutoff_time=True):
        """
        Filter a dataframe for all instances before time_last.
        If the DataTable does not have a time index, return the original
        dataframe.
        """
        dt = self[entity_id]
        if is_instance(df, ks, 'DataFrame') and isinstance(time_last, np.datetime64):
            time_last = pd.to_datetime(time_last)
        if dt.time_index:
            df_empty = df.empty if isinstance(df, pd.DataFrame) else False
            if time_last is not None and not df_empty:
                if include_cutoff_time:
                    df = df[df[dt.time_index] <= time_last]
                else:
                    df = df[df[dt.time_index] < time_last]
                if training_window is not None:
                    training_window = _check_timedelta(training_window)
                    if include_cutoff_time:
                        mask = df[dt.time_index] > time_last - training_window
                    else:
                        mask = df[dt.time_index] >= time_last - training_window
                    if dt.last_time_index is not None:
                        lti_slice = dt.last_time_index.reindex(df.index)
                        if include_cutoff_time:
                            lti_mask = lti_slice > time_last - training_window
                        else:
                            lti_mask = lti_slice >= time_last - training_window
                        mask = mask | lti_mask
                    else:
                        warnings.warn(
                            "Using training_window but last_time_index is "
                            "not set on entity %s" % (dt.id)
                        )

                    df = df[mask]

        for secondary_time_index, columns in dt.secondary_time_index.items():
            # should we use ignore time last here?
            df_empty = df.empty if isinstance(df, pd.DataFrame) else False
            if time_last is not None and not df_empty:
                mask = df[secondary_time_index] >= time_last
                if isinstance(df, dd.DataFrame):
                    for col in columns:
                        df[col] = df[col].mask(mask, np.nan)
                elif is_instance(df, ks, 'DataFrame'):
                    df.loc[mask, columns] = None
                else:
                    df.loc[mask, columns] = np.nan

        return df
Example #2
def convert_variable_data(df, column_id, new_type, **kwargs):
    """Convert dataframe's variable to different type.
    """
    empty = df[column_id].empty if isinstance(df, pd.DataFrame) else False
    if empty:
        return df
    if new_type == vtypes.Numeric:
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_numeric(df[column_id], errors='coerce')
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_numeric(df[column_id])
        else:
            orig_nonnull = df[column_id].dropna().shape[0]
            df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
            # to_numeric with errors='coerce' turns non-numeric strings into NaN.
            # If the column contained only strings (i.e. everything became NaN),
            # raise an error instead of silently returning an all-NaN column.
            nonnull = df[column_id].dropna().shape[0]
            if nonnull == 0 and orig_nonnull != 0:
                raise TypeError(
                    "Attempted to convert all string column {} to numeric".
                    format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        format = kwargs.get("format", None)
        # TODO: if float convert to int?
        if isinstance(df, dd.DataFrame):
            df[column_id] = dd.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
        elif is_instance(df, ks, 'DataFrame'):
            df[column_id] = ks.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
        else:
            df[column_id] = pd.to_datetime(df[column_id],
                                           format=format,
                                           infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {
            kwargs.get("true_val", True): True,
            kwargs.get("false_val", False): False,
            True: True,
            False: False
        }
        # TODO: what happens to nans?
        df[column_id] = df[column_id].map(map_dict).astype(bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" %
                        (column_id, new_type))
    return df
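A minimal usage sketch (illustrative only; it assumes convert_variable_data above is in scope and that the old featuretools variable_types module is importable as vtypes): coercing a mixed string column to numeric turns unparsable values into NaN, while an all-string column raises TypeError.

import pandas as pd
import featuretools.variable_types as vtypes  # assumed import path for the vtypes used above

df = pd.DataFrame({"amount": ["1.5", "2", "oops"]})
# "oops" is coerced to NaN by pd.to_numeric(errors='coerce'); if every value
# were a non-numeric string, a TypeError would be raised instead.
df = convert_variable_data(df, "amount", vtypes.Numeric)
print(df["amount"].tolist())  # [1.5, 2.0, nan]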
Example #3
def to_pandas(df, index=None, sort_index=False, int_index=False):
    '''
    Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe.

    Args:
        df (DataFrame or Series) : pandas, Dask, or Koalas object to convert
        index (str, optional): column name to set as index, defaults to None
        sort_index (bool, optional): whether to sort the dataframe on the index after setting it, defaults to False
        int_index (bool, optional): converts a computed Dask index to Int64Index to avoid comparison errors, defaults to False

    Returns:
        Pandas DataFrame
    '''
    if isinstance(df, (pd.DataFrame, pd.Series)):
        return df

    if isinstance(df, (dd.DataFrame, dd.Series)):
        pd_df = df.compute()
    if is_instance(df, (ks, ks), ('DataFrame', 'Series')):
        pd_df = df.to_pandas()

    if index:
        pd_df = pd_df.set_index(index)
    if sort_index:
        pd_df = pd_df.sort_index()
    if int_index and isinstance(df, dd.DataFrame):
        pd_df.index = pd.Int64Index(pd_df.index)

    return pd_df
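A quick usage sketch (hypothetical data; assumes to_pandas above is importable): round-tripping a Dask frame back to pandas so test assertions can compare it with the original.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"id": [2, 0, 1], "value": [20.0, 0.0, 10.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Compute the Dask frame, set "id" as the index and sort it so the result is
# comparable to the original regardless of how the partitions come back.
result = to_pandas(ddf, index="id", sort_index=True)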
Example #4
    def set_time_index(self, variable_id, already_sorted=False):
        # check time type
        if not isinstance(self.df, pd.DataFrame) or self.df.empty:
            time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype]
        else:
            time_to_check = self.df[variable_id].iloc[0]
        time_type = _check_time_type(time_to_check)

        if time_type is None:
            raise TypeError("%s time index not recognized as numeric or"
                            " datetime" % (self.id))

        if self.entityset.time_type is None:
            self.entityset.time_type = time_type
        elif self.entityset.time_type != time_type:
            raise TypeError("%s time index is %s type which differs from"
                            " other entityset time indexes" %
                            (self.id, time_type))

        if is_instance(self.df, (dd, ks), 'DataFrame'):
            t = time_type  # skip checking values
            already_sorted = True  # skip sorting
        else:
            t = vtypes.NumericTimeIndex
            if col_is_datetime(self.df[variable_id]):
                t = vtypes.DatetimeTimeIndex

        # use stable sort
        if not already_sorted:
            # sort by time variable, then by index
            self.df = self.df.sort_values([variable_id, self.index])

        self.convert_variable_type(variable_id, t, convert_data=False)

        self.time_index = variable_id
Example #5
def _create_index(index, make_index, df):
    '''Handles index creation logic based on user input'''
    created_index = None

    if index is None:
        # Case 1: user wanted to make index but did not specify column name
        assert not make_index, "Must specify an index name if make_index is True"
        # Case 2: make_index not specified but no index supplied, use first column
        warnings.warn(("Using first column as index. "
                       "To change this, specify the index parameter"))
        index = df.columns[0]
    elif make_index and index in df.columns:
        # Case 3: user wanted to make index but column already exists
        raise RuntimeError("Cannot make index: index variable already present")
    elif index not in df.columns:
        if not make_index:
            # Case 4: user named an index that is not in df and did not
            # specify make_index. Make a new integer index column and warn
            warnings.warn("index {} not found in dataframe, creating new "
                          "integer column".format(index))
        # Case 5: make_index with no errors or warnings
        # (Case 4 also uses this code path)
        if isinstance(df, dd.DataFrame):
            df[index] = 1
            df[index] = df[index].cumsum() - 1
        elif is_instance(df, ks, 'DataFrame'):
            df = df.koalas.attach_id_column('distributed-sequence', index)
        else:
            df.insert(0, index, range(len(df)))
        created_index = index
    # Case 6: user specified index, which is already in df. No action needed.
    return created_index, index, df
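A brief sketch of the case handling (hypothetical frame; assumes _create_index above is in scope): with no index named and make_index left False, the first column is reused as the index and a warning is emitted; naming a missing column creates a fresh integer column.

import pandas as pd

df = pd.DataFrame({"value": [10, 20, 30]})

# Case 2: falls back to the first column ("value") and warns.
created, index, df = _create_index(index=None, make_index=False, df=df)
assert index == "value" and created is None

# Case 5: "row_id" is not in df, so an integer column 0..len(df)-1 is inserted.
created, index, df = _create_index(index="row_id", make_index=True, df=df)
assert created == "row_id" and list(df["row_id"]) == [0, 1, 2]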
Example #6
def _vals_to_series(instance_vals, variable_id):
    """
    instance_vals may be a pd.DataFrame, a pd.Series, a list, a single
    value, or None. This function always returns a Series or None.
    """
    if instance_vals is None:
        return None

    # If this is a single value, make it a list
    if not hasattr(instance_vals, '__iter__'):
        instance_vals = [instance_vals]

    # convert iterable to pd.Series
    if isinstance(instance_vals, pd.DataFrame):
        out_vals = instance_vals[variable_id]
    elif is_instance(instance_vals, (pd, dd, ks), 'Series'):
        out_vals = instance_vals.rename(variable_id)
    else:
        out_vals = pd.Series(instance_vals)

    # no duplicates or NaN values
    out_vals = out_vals.drop_duplicates().dropna()

    # want index to have no name for the merge in query_by_values
    out_vals.index.name = None

    return out_vals
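A short standalone illustration (assumes _vals_to_series above is in scope): a plain list is converted to a Series with duplicates and missing values dropped.

vals = _vals_to_series([3, 1, 1, None, 2], "customer_id")
# drop_duplicates().dropna() leaves [3.0, 1.0, 2.0]; the index is left unnamed,
# which is what the later merge in query_by_values relies on.
print(vals.tolist())  # [3.0, 1.0, 2.0]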
    def _calculate_direct_features(self, features, child_df, df_trie,
                                   progress_callback):
        path = features[0].relationship_path
        assert len(
            path) == 1, "Error calculating DirectFeatures, len(path) != 1"

        parent_df = df_trie.get_node([path[0]]).value
        _is_forward, relationship = path[0]
        merge_col = relationship._child_column_name

        # generate a mapping of old column names (in the parent dataframe) to
        # new column names (in the child dataframe) for the merge
        col_map = {relationship._parent_column_name: merge_col}
        index_as_feature = None

        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
                if not pd.isna(f.default_value)
            }
            fillna_dict.update(feature_defaults)
            if f.base_features[0].get_name(
            ) == relationship._parent_column_name:
                index_as_feature = f
            base_names = f.base_features[0].get_feature_names()
            for name, base_name in zip(f.get_feature_names(), base_names):
                if name in child_df.columns:
                    continue
                col_map[base_name] = name

        # merge the identity feature from the parent dataframe into the child
        merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)
        if is_instance(merge_df, (dd, ps), "DataFrame"):
            new_df = child_df.merge(merge_df,
                                    left_on=merge_col,
                                    right_on=merge_col,
                                    how="left")
        else:
            if index_as_feature is not None:
                merge_df.set_index(index_as_feature.get_name(),
                                   inplace=True,
                                   drop=False)
            else:
                merge_df.set_index(merge_col, inplace=True)

            new_df = child_df.merge(merge_df,
                                    left_on=merge_col,
                                    right_index=True,
                                    how="left")

        progress_callback(len(features) / float(self.num_features))

        return new_df.fillna(fillna_dict)
Example #8
def write_entity_data(entity, path, format='csv', **kwargs):
    '''Write entity data to disk or S3 path.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.
        path (str) : Location on disk to write entity data.
        format (str) : Format to use for writing entity data. Defaults to csv.
        kwargs (keywords) : Additional keyword arguments to pass to the underlying serialization method.

    Returns:
        loading_info (dict) : Information on storage location and format of entity data.
    '''
    format = format.lower()
    if isinstance(entity.df, dd.DataFrame) and format == 'csv':
        basename = "{}-*.{}".format(entity.id, format)
    else:
        basename = '.'.join([entity.id, format])
    location = os.path.join('data', basename)
    file = os.path.join(path, location)
    df = entity.df

    if format == 'csv':
        if is_instance(df, ks, 'DataFrame'):
            df = df.copy()
            columns = list(df.select_dtypes('object').columns)
            df[columns] = df[columns].astype(str)
        df.to_csv(
            file,
            index=kwargs['index'],
            sep=kwargs['sep'],
            encoding=kwargs['encoding'],
            compression=kwargs['compression'],
        )
    elif format == 'parquet':
        # Serializing to parquet format raises an error when columns contain tuples.
        # Columns containing tuples are mapped as dtype object.
        # Issue is resolved by casting columns of dtype object to string.
        df = df.copy()
        columns = list(df.select_dtypes('object').columns)
        df[columns] = df[columns].astype(str)
        df.to_parquet(file, **kwargs)
    elif format == 'pickle':
        # Dask currently does not support to_pickle
        if isinstance(df, dd.DataFrame):
            msg = 'Cannot serialize Dask EntitySet to pickle'
            raise ValueError(msg)
        else:
            df.to_pickle(file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    return {'location': location, 'type': format, 'params': kwargs}
Example #9
def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
    for fm in feature_matrix:
        fm.ww.init(**ww_init_kwargs)

    if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
        feature_matrix = dd.concat(feature_matrix)
    elif any(is_instance(fm, ks, 'DataFrame') for fm in feature_matrix):
        feature_matrix = ks.concat(feature_matrix)
    else:
        feature_matrix = pd.concat(feature_matrix)

    feature_matrix.ww.init(**ww_init_kwargs)
    return feature_matrix
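A small illustrative driver for the version above (the column name and logical type are made up; assumes woodwork is installed and init_ww_and_concat_fm is in scope): each pandas chunk gets a Woodwork schema, the chunks are concatenated, and the combined frame is re-typed.

import pandas as pd
from woodwork.logical_types import Integer

chunks = [pd.DataFrame({"count": [1, 2]}), pd.DataFrame({"count": [3]})]
# Each chunk is typed individually, concatenated with pd.concat, and the
# combined frame is re-initialized so the schema survives the concat.
fm = init_ww_and_concat_fm(chunks, {"logical_types": {"count": Integer}})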
def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
    cols_to_check = {
        col
        for col, ltype in ww_init_kwargs["logical_types"].items()
        if isinstance(ltype, (Age, Boolean, Integer))
    }
    replacement_type = {
        "age": AgeNullable(),
        "boolean": BooleanNullable(),
        "integer": IntegerNullable(),
    }
    for fm in feature_matrix:
        updated_cols = set()
        for col in cols_to_check:
            # Only convert types for pandas if null values are present
            # Always convert for Dask/Spark to avoid pulling data into memory for null check
            is_pandas_df_with_null = (isinstance(fm, pd.DataFrame)
                                      and fm[col].isnull().any())
            is_dask_df = isinstance(fm, dd.DataFrame)
            is_spark_df = is_instance(fm, ps, "DataFrame")
            if is_pandas_df_with_null or is_dask_df or is_spark_df:
                current_type = ww_init_kwargs["logical_types"][col].type_string
                ww_init_kwargs["logical_types"][col] = replacement_type[
                    current_type]
                updated_cols.add(col)
        cols_to_check = cols_to_check - updated_cols
        fm.ww.init(**ww_init_kwargs)

    if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
        feature_matrix = dd.concat(feature_matrix)
    elif any(is_instance(fm, ps, "DataFrame") for fm in feature_matrix):
        feature_matrix = ps.concat(feature_matrix)
    else:
        feature_matrix = pd.concat(feature_matrix)

    feature_matrix.ww.init(**ww_init_kwargs)
    return feature_matrix
Example #11
def entity_to_description(entity):
    '''Serialize entity to data description.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.

    Returns:
        dictionary (dict) : Description of :class:`.Entity`.
    '''
    index = entity.df.columns.isin([variable.id for variable in entity.variables])
    indexer = entity.df.columns[index].to_list() if is_instance(entity.df, ks, 'DataFrame') else entity.df.columns[index]
    dtypes = entity.df[indexer].dtypes.astype(str).to_dict()
    if isinstance(entity.df, dd.DataFrame):
        entity_type = 'dask'
    elif is_instance(entity.df, ks, 'DataFrame'):
        entity_type = 'koalas'
    else:
        entity_type = 'pandas'
    description = {
        "id": entity.id,
        "index": entity.index,
        "time_index": entity.time_index,
        "properties": {
            'secondary_time_index': entity.secondary_time_index,
            'last_time_index': entity.last_time_index is not None,
        },
        "variables": [variable.to_data_description() for variable in entity.variables],
        "loading_info": {
            'entity_type': entity_type,
            'params': {},
            'properties': {
                'dtypes': dtypes
            }
        }
    }

    return description
    def _calculate_direct_features(self, features, child_df, df_trie,
                                   progress_callback):
        path = features[0].relationship_path
        assert len(path) == 1, \
            "Error calculating DirectFeatures, len(path) != 1"

        parent_df = df_trie.get_node([path[0]]).value
        _is_forward, relationship = path[0]
        merge_var = relationship.child_variable.id

        # generate a mapping of old column names (in the parent entity) to
        # new column names (in the child entity) for the merge
        col_map = {relationship.parent_variable.id: merge_var}
        index_as_feature = None
        for f in features:
            if f.base_features[0].get_name(
            ) == relationship.parent_variable.id:
                index_as_feature = f
            base_names = f.base_features[0].get_feature_names()
            for name, base_name in zip(f.get_feature_names(), base_names):
                if name in child_df.columns:
                    continue
                col_map[base_name] = name

        # merge the identity feature from the parent entity into the child
        merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)
        if is_instance(merge_df, (dd, ks), 'DataFrame'):
            new_df = child_df.merge(merge_df,
                                    left_on=merge_var,
                                    right_on=merge_var,
                                    how='left')
        else:
            if index_as_feature is not None:
                merge_df.set_index(index_as_feature.get_name(),
                                   inplace=True,
                                   drop=False)
            else:
                merge_df.set_index(merge_var, inplace=True)

            new_df = child_df.merge(merge_df,
                                    left_on=merge_var,
                                    right_index=True,
                                    how='left')

        progress_callback(len(features) / float(self.num_features))

        return new_df
Example #13
    def to_csv(self, path, sep=',', encoding='utf-8', engine='python', compression=None, profile_name=None):
        '''Write entityset to disk in the csv format, location specified by `path`.
            Path can be a local path or an S3 path.
            If writing to S3, a tar archive of files will be written.

            Args:
                path (str) : Location on disk to write to (will be created as a directory)
                sep (str) : String of length 1. Field delimiter for the output file.
                encoding (str) : A string representing the encoding to use in the output file, defaults to 'utf-8'.
                engine (str) : Name of the engine to use. Possible values are: {'c', 'python'}.
                compression (str) : Name of the compression to use. Possible values are: {'gzip', 'bz2', 'zip', 'xz', None}.
                profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
        '''
        if is_instance(self.entities[0].df, ks, 'DataFrame'):
            compression = str(compression)
        serialize.write_data_description(self, path, format='csv', index=False, sep=sep, encoding=encoding, engine=engine, compression=compression, profile_name=profile_name)
        return self
Example #14
    def set_secondary_time_index(self, secondary_time_index):
        for time_index, columns in secondary_time_index.items():
            if is_instance(self.df, (dd, ks), 'DataFrame') or self.df.empty:
                time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype]
            else:
                time_to_check = self.df[time_index].head(1).iloc[0]
            time_type = _check_time_type(time_to_check)
            if time_type is None:
                raise TypeError("%s time index not recognized as numeric or"
                                " datetime" % (self.id))
            if self.entityset.time_type != time_type:
                raise TypeError("%s time index is %s type which differs from"
                                " other entityset time indexes" %
                                (self.id, time_type))
            if time_index not in columns:
                columns.append(time_index)

        self.secondary_time_index = secondary_time_index
Example #15
def test_is_instance_multiple_modules(df):
    df2 = dd.from_pandas(df, npartitions=2)
    assert is_instance(df, (dd, pd), 'DataFrame')
    assert is_instance(df2, (dd, pd), 'DataFrame')
    assert is_instance(df2['id'], (dd, pd), ('Series', 'DataFrame'))
    assert not is_instance(df2['id'], (dd, pd), ('DataFrame', 'Series'))
Example #16
def test_is_instance_none_module(df):
    assert not is_instance(df, None, "DataFrame")
    assert is_instance(df, (None, pd), "DataFrame")
    assert is_instance(df, (None, pd), ("Series", "DataFrame"))
Example #17
def test_is_instance_errors_mismatch():
    msg = "Number of modules does not match number of classnames"
    with pytest.raises(ValueError, match=msg):
        is_instance("abc", pd, ("DataFrame", "Series"))
Example #18
def test_is_instance_multiple_modules(df):
    df2 = dd.from_pandas(df, npartitions=2)
    assert is_instance(df, (dd, pd), "DataFrame")
    assert is_instance(df2, (dd, pd), "DataFrame")
    assert is_instance(df2["id"], (dd, pd), ("Series", "DataFrame"))
    assert not is_instance(df2["id"], (dd, pd), ("DataFrame", "Series"))
Example #19
def test_is_instance_single_module(df):
    assert is_instance(df, pd, "DataFrame")
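For reference, a minimal sketch of an is_instance helper consistent with the tests above; this is an assumed reconstruction for illustration, not the library's actual source.

def is_instance(obj, modules, classnames):
    """Check obj against classes looked up by name on optional modules.

    A module may be None (e.g. an optional dependency that is not installed);
    such entries are skipped, so the check simply degrades to False.
    """
    if not isinstance(modules, tuple):
        modules = (modules,)
    if not isinstance(classnames, tuple):
        classnames = (classnames,) * len(modules)
    if len(modules) != len(classnames):
        raise ValueError("Number of modules does not match number of classnames")
    to_check = tuple(
        getattr(module, classname)
        for module, classname in zip(modules, classnames)
        if module is not None and hasattr(module, classname)
    )
    return isinstance(obj, to_check)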
Example #20
    def __init__(self,
                 target_entity_id,
                 entityset,
                 agg_primitives=None,
                 trans_primitives=None,
                 where_primitives=None,
                 groupby_trans_primitives=None,
                 max_depth=2,
                 max_features=-1,
                 allowed_paths=None,
                 ignore_entities=None,
                 ignore_variables=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_stacking_limit=1):

        if target_entity_id not in entityset.entity_dict:
            es_name = entityset.id or 'entity set'
            msg = 'Provided target entity %s does not exist in %s' % (target_entity_id, es_name)
            raise KeyError(msg)

        # need to change max_depth to None because DFS terminates when max_depth < 0
        if max_depth == -1:
            max_depth = None

        # if just one entity, set max depth to 1 (transform stacking rule)
        if len(entityset.entity_dict) == 1 and (max_depth is None or max_depth > 1):
            warnings.warn("Only one entity in entityset, changing max_depth to "
                          "1 since deeper features cannot be created")
            max_depth = 1

        self.max_depth = max_depth

        self.max_features = max_features

        self.allowed_paths = allowed_paths
        if self.allowed_paths:
            self.allowed_paths = set()
            for path in allowed_paths:
                self.allowed_paths.add(tuple(path))

        if ignore_entities is None:
            self.ignore_entities = set()
        else:
            if not isinstance(ignore_entities, list):
                raise TypeError('ignore_entities must be a list')
            assert target_entity_id not in ignore_entities,\
                "Can't ignore target_entity!"
            self.ignore_entities = set(ignore_entities)

        self.ignore_variables = defaultdict(set)
        if ignore_variables is not None:
            # check if ignore_variables is not {str: list}
            if not all(isinstance(i, str) for i in ignore_variables.keys()) or not all(isinstance(i, list) for i in ignore_variables.values()):
                raise TypeError('ignore_variables should be dict[str -> list]')
            # check if list values are all of type str
            elif not all(all(isinstance(v, str) for v in value) for value in ignore_variables.values()):
                raise TypeError('list values should be of type str')
            for eid, vars in ignore_variables.items():
                self.ignore_variables[eid] = set(vars)
        self.target_entity_id = target_entity_id
        self.es = entityset

        if any(isinstance(entity.df, dd.DataFrame) for entity in self.es.entities):
            entityset_type = Library.DASK
        elif any(is_instance(entity.df, ks, 'DataFrame') for entity in self.es.entities):
            entityset_type = Library.KOALAS
        else:
            entityset_type = Library.PANDAS

        if agg_primitives is None:
            agg_primitives = [p for p in primitives.get_default_aggregation_primitives() if entityset_type in p.compatibility]
        self.agg_primitives = []
        agg_prim_dict = primitives.get_aggregation_primitives()
        for a in agg_primitives:
            if isinstance(a, str):
                if a.lower() not in agg_prim_dict:
                    raise ValueError("Unknown aggregation primitive {}. ".format(a),
                                     "Call ft.primitives.list_primitives() to get",
                                     " a list of available primitives")
                a = agg_prim_dict[a.lower()]
            a = handle_primitive(a)
            if not isinstance(a, AggregationPrimitive):
                raise ValueError("Primitive {} in agg_primitives is not an "
                                 "aggregation primitive".format(type(a)))
            self.agg_primitives.append(a)
        self.agg_primitives.sort()

        if trans_primitives is None:
            trans_primitives = [p for p in primitives.get_default_transform_primitives() if entityset_type in p.compatibility]
        self.trans_primitives = []
        for t in trans_primitives:
            t = check_trans_primitive(t)
            self.trans_primitives.append(t)
        self.trans_primitives.sort()

        if where_primitives is None:
            where_primitives = [primitives.Count]
        self.where_primitives = []
        for p in where_primitives:
            if isinstance(p, str):
                prim_obj = agg_prim_dict.get(p.lower(), None)
                if prim_obj is None:
                    raise ValueError("Unknown where primitive {}. ".format(p),
                                     "Call ft.primitives.list_primitives() to get",
                                     " a list of available primitives")
                p = prim_obj
            p = handle_primitive(p)
            self.where_primitives.append(p)
        self.where_primitives.sort()

        if groupby_trans_primitives is None:
            groupby_trans_primitives = []
        self.groupby_trans_primitives = []
        for p in groupby_trans_primitives:
            p = check_trans_primitive(p)
            self.groupby_trans_primitives.append(p)
        self.groupby_trans_primitives.sort()

        if primitive_options is None:
            primitive_options = {}
        all_primitives = self.trans_primitives + self.agg_primitives + \
            self.where_primitives + self.groupby_trans_primitives
        bad_primitives = [prim.name for prim in all_primitives if entityset_type not in prim.compatibility]
        if bad_primitives:
            msg = 'Selected primitives are incompatible with {} EntitySets: {}'
            raise ValueError(msg.format(entityset_type.value, ', '.join(bad_primitives)))

        self.primitive_options, self.ignore_entities, self.ignore_variables =\
            generate_all_primitive_options(all_primitives,
                                           primitive_options,
                                           self.ignore_entities,
                                           self.ignore_variables,
                                           self.es)
        self.seed_features = sorted(seed_features or [], key=lambda f: f.unique_name())
        self.drop_exact = drop_exact or []
        self.drop_contains = drop_contains or []
        self.where_stacking_limit = where_stacking_limit
def calculate_chunk(cutoff_time,
                    chunk_size,
                    feature_set,
                    entityset,
                    approximate,
                    training_window,
                    save_progress,
                    no_unapproximated_aggs,
                    cutoff_df_time_var,
                    target_time,
                    pass_columns,
                    progress_bar=None,
                    progress_callback=None,
                    include_cutoff_time=True):

    if not isinstance(feature_set, FeatureSet):
        feature_set = cloudpickle.loads(feature_set)

    feature_matrix = []
    if no_unapproximated_aggs and approximate is not None:
        if entityset.time_type == NumericTimeIndex:
            group_time = np.inf
        else:
            group_time = datetime.now()

    if isinstance(cutoff_time, tuple):
        update_progress_callback = None
        if progress_bar is not None:

            def update_progress_callback(done):
                previous_progress = progress_bar.n
                progress_bar.update(done * len(cutoff_time[1]))
                if progress_callback is not None:
                    update, progress_percent, time_elapsed = update_progress_callback_parameters(
                        progress_bar, previous_progress)
                    progress_callback(update, progress_percent, time_elapsed)

        time_last = cutoff_time[0]
        ids = cutoff_time[1]
        calculator = FeatureSetCalculator(entityset,
                                          feature_set,
                                          time_last,
                                          training_window=training_window)
        _feature_matrix = calculator.run(
            ids,
            progress_callback=update_progress_callback,
            include_cutoff_time=include_cutoff_time)
        if isinstance(_feature_matrix, pd.DataFrame):
            time_index = pd.Index([time_last] * len(ids), name='time')
            _feature_matrix = _feature_matrix.set_index(time_index,
                                                        append=True)
        feature_matrix.append(_feature_matrix)

    else:
        for _, group in cutoff_time.groupby(cutoff_df_time_var):
            # if approximating, calculate the approximate features
            if approximate is not None:
                precalculated_features_trie = approximate_features(
                    feature_set,
                    group,
                    window=approximate,
                    entityset=entityset,
                    training_window=training_window,
                    include_cutoff_time=include_cutoff_time,
                )
            else:
                precalculated_features_trie = None

            @save_csv_decorator(save_progress)
            def calc_results(time_last,
                             ids,
                             precalculated_features=None,
                             training_window=None,
                             include_cutoff_time=True):
                update_progress_callback = None

                if progress_bar is not None:

                    def update_progress_callback(done):
                        previous_progress = progress_bar.n
                        progress_bar.update(done * group.shape[0])
                        if progress_callback is not None:
                            update, progress_percent, time_elapsed = update_progress_callback_parameters(
                                progress_bar, previous_progress)
                            progress_callback(update, progress_percent,
                                              time_elapsed)

                calculator = FeatureSetCalculator(
                    entityset,
                    feature_set,
                    time_last,
                    training_window=training_window,
                    precalculated_features=precalculated_features)
                matrix = calculator.run(
                    ids,
                    progress_callback=update_progress_callback,
                    include_cutoff_time=include_cutoff_time)

                return matrix

            # if all aggregations have been approximated, can calculate all together
            if no_unapproximated_aggs and approximate is not None:
                inner_grouped = [[group_time, group]]
            else:
                # if approximated features, set cutoff_time to unbinned time
                if precalculated_features_trie is not None:
                    group[cutoff_df_time_var] = group[target_time]

                inner_grouped = group.groupby(cutoff_df_time_var, sort=True)

            if chunk_size is not None:
                inner_grouped = _chunk_dataframe_groups(
                    inner_grouped, chunk_size)

            for time_last, group in inner_grouped:
                # sort group by instance id
                ids = group['instance_id'].sort_values().values
                if no_unapproximated_aggs and approximate is not None:
                    window = None
                else:
                    window = training_window

                # calculate values for those instances at time time_last
                _feature_matrix = calc_results(
                    time_last,
                    ids,
                    precalculated_features=precalculated_features_trie,
                    training_window=window,
                    include_cutoff_time=include_cutoff_time)

                if is_instance(_feature_matrix, (dd, ks), 'DataFrame'):
                    id_name = _feature_matrix.columns[-1]
                else:
                    id_name = _feature_matrix.index.name

                # if approximate, merge feature matrix with group frame to get original
                # cutoff times and passed columns
                if approximate:
                    cols = [
                        c for c in _feature_matrix.columns
                        if c not in pass_columns
                    ]
                    indexer = group[['instance_id', target_time] +
                                    pass_columns]
                    _feature_matrix = _feature_matrix[cols].merge(
                        indexer,
                        right_on=['instance_id'],
                        left_index=True,
                        how='right')
                    _feature_matrix.set_index(['instance_id', target_time],
                                              inplace=True)
                    _feature_matrix.index.set_names([id_name, 'time'],
                                                    inplace=True)
                    _feature_matrix.sort_index(level=1,
                                               kind='mergesort',
                                               inplace=True)
                else:
                    # all rows have same cutoff time. set time and add passed columns
                    num_rows = len(ids)
                    if len(pass_columns) > 0:
                        pass_through = group[
                            ['instance_id', cutoff_df_time_var] + pass_columns]
                        pass_through.rename(columns={
                            'instance_id': id_name,
                            cutoff_df_time_var: 'time'
                        },
                                            inplace=True)
                    if isinstance(_feature_matrix, pd.DataFrame):
                        time_index = pd.Index([time_last] * num_rows,
                                              name='time')
                        _feature_matrix = _feature_matrix.set_index(
                            time_index, append=True)
                        if len(pass_columns) > 0:
                            pass_through.set_index([id_name, 'time'],
                                                   inplace=True)
                            for col in pass_columns:
                                _feature_matrix[col] = pass_through[col]
                    elif isinstance(_feature_matrix,
                                    dd.DataFrame) and (len(pass_columns) > 0):
                        _feature_matrix['time'] = time_last
                        for col in pass_columns:
                            pass_df = dd.from_pandas(
                                pass_through[[id_name, 'time', col]],
                                npartitions=_feature_matrix.npartitions)
                            _feature_matrix = _feature_matrix.merge(
                                pass_df, how="outer")
                        _feature_matrix = _feature_matrix.drop(
                            columns=['time'])
                    elif is_instance(_feature_matrix, ks,
                                     'DataFrame') and (len(pass_columns) > 0):
                        _feature_matrix['time'] = time_last
                        for col in pass_columns:
                            pass_df = ks.from_pandas(
                                pass_through[[id_name, 'time', col]])
                            _feature_matrix = _feature_matrix.merge(
                                pass_df, how="outer")
                        _feature_matrix = _feature_matrix.drop(
                            columns=['time'])
                feature_matrix.append(_feature_matrix)

    if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
        feature_matrix = dd.concat(feature_matrix)
    elif any(is_instance(fm, ks, 'DataFrame') for fm in feature_matrix):
        feature_matrix = ks.concat(feature_matrix)
    else:
        feature_matrix = pd.concat(feature_matrix)

    return feature_matrix
Example #22
def test_is_instance_none_module(df):
    assert not is_instance(df, None, 'DataFrame')
    assert is_instance(df, (None, pd), 'DataFrame')
    assert is_instance(df, (None, pd), ('Series', 'DataFrame'))
Example #23
    def add_last_time_indexes(self, updated_entities=None):
        """
        Calculates the last time index values for each entity (the last time
        an instance or children of that instance were observed). Used when
        calculating features using training windows.
        Args:
            updated_entities (list[str]): List of entity ids to update last_time_index for
                (will update all parents of those entities as well)
        """
        # Generate graph of entities to find leaf entities
        children = defaultdict(list)  # parent --> child mapping
        child_vars = defaultdict(dict)
        for r in self.relationships:
            children[r.parent_entity.id].append(r.child_entity)
            child_vars[r.parent_entity.id][r.child_entity.id] = r.child_variable

        updated_entities = updated_entities or []
        if updated_entities:
            # find parents of updated_entities
            parent_queue = updated_entities[:]
            parents = set()
            while len(parent_queue):
                e = parent_queue.pop(0)
                if e in parents:
                    continue
                parents.add(e)

                for parent_id, _ in self.get_forward_entities(e):
                    parent_queue.append(parent_id)

            queue = [self[p] for p in parents]
            to_explore = parents
        else:
            to_explore = set([e.id for e in self.entities[:]])
            queue = self.entities[:]

        explored = set()

        for e in queue:
            e.last_time_index = None

        # We will explore children of entities on the queue,
        # which may not be in the to_explore set. Therefore,
        # we check whether all elements of to_explore are in
        # explored, rather than just comparing length
        while not to_explore.issubset(explored):
            entity = queue.pop(0)

            if entity.last_time_index is None:
                if entity.time_index is not None:
                    lti = entity.df[entity.time_index].copy()
                    if isinstance(entity.df, dd.DataFrame):
                        # The current Dask implementation doesn't set the index of the dataframe
                        # to the entity's index, so we have to do it manually here
                        lti.index = entity.df[entity.index].copy()
                else:
                    lti = entity.df[entity.index].copy()
                    if isinstance(entity.df, dd.DataFrame):
                        lti.index = entity.df[entity.index].copy()
                        lti = lti.apply(lambda x: None)
                    elif is_instance(entity.df, ks, 'DataFrame'):
                        lti = ks.Series(pd.Series(index=lti.to_list(), name=lti.name))
                    else:
                        lti[:] = None
                entity.last_time_index = lti

            if entity.id in children:
                child_entities = children[entity.id]

                # if all children not explored, skip for now
                if not set([e.id for e in child_entities]).issubset(explored):
                    # Now there is a possibility that a child entity
                    # was not explicitly provided in updated_entities,
                    # and never made it onto the queue. If updated_entities
                    # is None then we just load all entities onto the queue
                    # so we didn't need this logic
                    for e in child_entities:
                        if e.id not in explored and e.id not in [q.id for q in queue]:
                            queue.append(e)
                    queue.append(entity)
                    continue

                # updated last time from all children
                for child_e in child_entities:
                    # TODO: Figure out if Dask code related to indexes is important for Koalas
                    if child_e.last_time_index is None:
                        continue
                    link_var = child_vars[entity.id][child_e.id].id

                    lti_is_dask = isinstance(child_e.last_time_index, dd.Series)
                    lti_is_koalas = is_instance(child_e.last_time_index, ks, 'Series')
                    if lti_is_dask or lti_is_koalas:
                        to_join = child_e.df[link_var]
                        if lti_is_dask:
                            to_join.index = child_e.df[child_e.index]

                        lti_df = child_e.last_time_index.to_frame(name='last_time').join(
                            to_join.to_frame(name=entity.index)
                        )

                        if lti_is_dask:
                            new_index = lti_df.index.copy()
                            new_index.name = None
                            lti_df.index = new_index
                        lti_df = lti_df.groupby(lti_df[entity.index]).agg('max')

                        lti_df = entity.last_time_index.to_frame(name='last_time_old').join(lti_df)

                    else:
                        lti_df = pd.DataFrame({'last_time': child_e.last_time_index,
                                               entity.index: child_e.df[link_var]})

                        # sort by time and keep only the most recent
                        lti_df.sort_values(['last_time', entity.index],
                                           kind="mergesort", inplace=True)

                        lti_df.drop_duplicates(entity.index,
                                               keep='last',
                                               inplace=True)

                        lti_df.set_index(entity.index, inplace=True)
                        lti_df = lti_df.reindex(entity.last_time_index.index)
                        lti_df['last_time_old'] = entity.last_time_index
                    if not (lti_is_dask or lti_is_koalas) and lti_df.empty:
                        # Pandas errors out if it tries to do fillna and then max on an empty dataframe
                        lti_df = pd.Series()
                    else:
                        if lti_is_koalas:
                            lti_df['last_time'] = ks.to_datetime(lti_df['last_time'])
                            lti_df['last_time_old'] = ks.to_datetime(lti_df['last_time_old'])
                            # TODO: Figure out a workaround for fillna and replace
                            lti_df = lti_df.max(axis=1)
                        else:
                            lti_df['last_time'] = lti_df['last_time'].astype('datetime64[ns]')
                            lti_df['last_time_old'] = lti_df['last_time_old'].astype('datetime64[ns]')
                            lti_df = lti_df.fillna(pd.to_datetime('1800-01-01 00:00')).max(axis=1)
                            lti_df = lti_df.replace(pd.to_datetime('1800-01-01 00:00'), pd.NaT)
                    # lti_df = lti_df.apply(lambda x: x.dropna().max(), axis=1)

                    entity.last_time_index = lti_df
                    entity.last_time_index.name = 'last_time'

            explored.add(entity.id)
        self.reset_data_description()
    def _calculate_agg_features(self, features, frame, df_trie,
                                progress_callback):
        test_feature = features[0]
        child_dataframe = test_feature.base_features[0].dataframe
        base_frame = df_trie.get_node(test_feature.relationship_path).value
        parent_merge_col = test_feature.relationship_path[0][
            1]._parent_column_name
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        fl = []
        for f in features:
            for ind in f.get_feature_names():
                if ind not in frame.columns:
                    fl.append(f)
                    break
        features = fl
        if not len(features):
            progress_callback(len(features) / float(self.num_features))
            return frame

        # handle where
        base_frame_empty = (base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False)
        where = test_feature.where
        if where is not None and not base_frame_empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        base_frame_empty = (base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False)
        if base_frame_empty:
            feature_values = []
            for f in features:
                feature_values.append(
                    (f, np.full(f.number_output_features, np.nan)))
                progress_callback(1 / float(self.num_features))
            frame = update_feature_columns(feature_values, frame)
        else:
            relationship_path = test_feature.relationship_path

            groupby_col = get_relationship_column_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child dataframe included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.has_no_observations():
                    time_first = time_last - use_previous
                    ti = child_dataframe.ww.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.get_value("o")

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_col,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multi-column and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):

                    column_id = f.base_features[0].get_name()
                    if column_id not in to_agg:
                        to_agg[column_id] = []
                    if isinstance(base_frame, dd.DataFrame):
                        func = f.get_function(agg_type=Library.DASK)
                    elif is_instance(base_frame, ps, "DataFrame"):
                        func = f.get_function(agg_type=Library.SPARK)
                    else:
                        func = f.get_function()

                    # for some reason, using the string count is significantly
                    # faster than any method a primitive can return
                    # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                    if func == pd.Series.count:
                        func = "count"

                    funcname = func
                    if callable(func):
                        # if the same function is being applied to the same
                        # column twice, wrap it in a partial to avoid
                        # duplicate functions
                        funcname = str(id(func))
                        if "{}-{}".format(column_id, funcname) in agg_rename:
                            func = partial(func)
                            funcname = str(id(func))

                        func.__name__ = funcname

                    if isinstance(func, dd.Aggregation):
                        # TODO: handle aggregation being applied to same column twice
                        # (see above partial wrapping of functions)
                        funcname = func.__name__

                    to_agg[column_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename["{}-{}".format(column_id,
                                              funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_col can be both the name of the index and a column;
                # to silence the pandas warning about ambiguity we explicitly pass
                # the column (grouping by both the index and the column would
                # also work)
                to_merge = base_frame.groupby(base_frame[groupby_col],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(
                    left=frame,
                    right=to_merge,
                    left_index=True,
                    right_index=True,
                    how="left",
                )

                progress_callback(len(to_apply) / float(self.num_features))

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_col can be both the name of the index and a column;
                # to silence the pandas warning about ambiguity we explicitly pass
                # the column (grouping by both the index and the column would
                # also work)
                if is_instance(base_frame, (dd, ps), "DataFrame"):
                    to_merge = base_frame.groupby(groupby_col).agg(to_agg)

                else:
                    to_merge = base_frame.groupby(base_frame[groupby_col],
                                                  observed=True,
                                                  sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # Workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                #
                # Pandas claims that bug is fixed but it still shows up in some
                # cases.  More investigation needed.
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                if is_instance(frame, (dd, ps), "DataFrame"):
                    frame = frame.merge(to_merge,
                                        left_on=parent_merge_col,
                                        right_index=True,
                                        how="left")
                else:
                    frame = pd.merge(
                        left=frame,
                        right=to_merge,
                        left_index=True,
                        right_index=True,
                        how="left",
                    )

                # determine number of features that were just merged
                progress_callback(
                    len(to_merge.columns) / float(self.num_features))

        # Handle default values
        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
            }
            fillna_dict.update(feature_defaults)

        frame = frame.fillna(fillna_dict)

        return frame
Example #25
    def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None, include_cutoff_time=True):
        """Query instances that have variable with given value

        Args:
            entity_id (str): The id of the entity to query
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Window defining how much time before the cutoff time data
                can be used when calculating features. If None, all data before cutoff time is used.
            include_cutoff_time (bool):
                If True, data at cutoff time are included in calculating features

        Returns:
            pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
        """
        entity = self[entity_id]
        if not variable_id:
            variable_id = entity.index

        instance_vals = _vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)

        if training_window is not None:
            assert training_window.has_no_observations(), "Training window cannot be in observations"

        if instance_vals is None:
            df = entity.df.copy()

        elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
            df = entity.df.head(0)

        else:
            if is_instance(instance_vals, (dd, ks), 'Series'):
                df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
            elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'):
                df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
            else:
                df = entity.df[entity.df[variable_id].isin(instance_vals)]

            if isinstance(entity.df, pd.DataFrame):
                df = df.set_index(entity.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(entity.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(entity_id=entity_id,
                               df=df,
                               time_last=time_last,
                               training_window=training_window,
                               include_cutoff_time=include_cutoff_time)

        if columns is not None:
            df = df[columns]

        return df
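A hedged usage sketch of query_by_values, assuming featuretools 0.x and its bundled mock-customer demo EntitySet (the entity and variable names come from that demo):

import pandas as pd
import featuretools as ft

# demo EntitySet with customers, sessions, transactions and products entities
es = ft.demo.load_mock_customer(return_entityset=True)

# sessions belonging to customers 1 and 2, using only data at or before the cutoff time
sessions = es.query_by_values(
    entity_id="sessions",
    instance_vals=[1, 2],
    variable_id="customer_id",
    time_last=pd.Timestamp("2014-01-01 04:00"),
    include_cutoff_time=True,
)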
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None,
                             progress_callback=None,
                             include_cutoff_time=True):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame or a single
            value. If a DataFrame is passed the instance ids for which to calculate features
            must be in a column with the same name as the target entity index or a column
            named `instance_id`. The cutoff time values in the DataFrame must be in a column with
            the same name as the target entity time index or a column named `time`. If the
            DataFrame has more than two columns, any additional columns will be added to the
            resulting feature matrix. If a single value is passed, this value will be used for
            all instances.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]): dictionary of
            entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column), (variable_types))}.
            Note that time_column and variable_types are optional.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None): maximum number of rows of
            output feature matrix to calculate at a time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1, sets the chunk size to that
            percentage of all rows. If None and n_jobs > 1, it will be set to 1/n_jobs.

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics port (int):
                port number to use for the web dashboard. If left unspecified,
                the web interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

        progress_callback (callable): function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

        include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``.

    Returns:
        pd.DataFrame: The feature matrix.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if any(isinstance(es.df, dd.DataFrame) for es in entityset.entities):
        if approximate:
            msg = "Using approximate is not supported with Dask Entities"
            raise ValueError(msg)
        if training_window:
            msg = "Using training_window is not supported with Dask Entities"
            raise ValueError(msg)

    target_entity = entityset[features[0].entity.id]

    cutoff_time = _validate_cutoff_time(cutoff_time, target_entity)

    if isinstance(cutoff_time, pd.DataFrame):
        if instance_ids:
            msg = "Passing 'instance_ids' is valid only if 'cutoff_time' is a single value or None - ignoring"
            warnings.warn(msg)
        pass_columns = [
            col for col in cutoff_time.columns
            if col not in ['instance_id', 'time']
        ]
        # make sure dtype of instance_id in cutoff time
        # is same as column it references
        target_entity = features[0].entity
        dtype = entityset[target_entity.id].df[target_entity.index].dtype
        cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)
    else:
        pass_columns = []
        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(
                target_entity.df,
                time_last=cutoff_time,
                training_window=training_window,
                include_cutoff_time=include_cutoff_time)
            instance_ids = df[index_var]

        if isinstance(instance_ids, dd.Series):
            instance_ids = instance_ids.compute()
        elif is_instance(instance_ids, ks, 'Series'):
            instance_ids = instance_ids.to_pandas()

        # convert list or range object into series
        if not isinstance(instance_ids, pd.Series):
            instance_ids = pd.Series(instance_ids)

        cutoff_time = (cutoff_time, instance_ids)

    _check_cutoff_time_type(cutoff_time, entityset.time_type)

    # Approximate provides no benefit with a single cutoff time, so ignore it
    if isinstance(cutoff_time, tuple) and approximate is not None:
        msg = "Using approximate with a single cutoff_time value or no cutoff_time " \
            "provides no computational efficiency benefit"
        warnings.warn(msg)
        cutoff_time = pd.DataFrame({
            "instance_id":
            cutoff_time[1],
            "time": [cutoff_time[0]] * len(cutoff_time[1])
        })

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        if approximate is not None:
            all_approx_features = {
                f
                for _, feats in feature_set.approximate_feature_trie
                for f in feats
            }
        else:
            all_approx_features = set()
        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        for dependency in deps:
            if isinstance(dependency, AggregationFeature):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time, approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if isinstance(cutoff_time, pd.DataFrame):
        cutoff_time_len = cutoff_time.shape[0]
    else:
        cutoff_time_len = len(cutoff_time[1])

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time_len)
    tqdm_options = {
        'total': (cutoff_time_len / FEATURE_CALCULATION_PERCENTAGE),
        'bar_format': PBAR_FORMAT,
        'disable': True
    }

    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to use progress_bar updates without printing anywhere
        tqdm_options.update({'file': open(os.devnull, 'w'), 'disable': False})

    with make_tqdm_iterator(**tqdm_options) as progress_bar:
        if n_jobs != 1 or dask_kwargs is not None:
            feature_matrix = parallel_calculate_chunks(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                n_jobs=n_jobs,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                dask_kwargs=dask_kwargs or {},
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)
        else:
            feature_matrix = calculate_chunk(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)

        # ensure rows are sorted by input order
        if isinstance(feature_matrix, pd.DataFrame):
            if isinstance(cutoff_time, pd.DataFrame):
                feature_matrix = feature_matrix.reindex(
                    pd.MultiIndex.from_frame(
                        cutoff_time[["instance_id", "time"]],
                        names=feature_matrix.index.names))
            else:
                # Maintain index dtype
                index_dtype = feature_matrix.index.get_level_values(0).dtype
                feature_matrix = feature_matrix.reindex(
                    cutoff_time[1].astype(index_dtype), level=0)
            if not cutoff_time_in_index:
                feature_matrix.reset_index(level='time',
                                           drop=True,
                                           inplace=True)

        if save_progress and os.path.exists(os.path.join(
                save_progress, 'temp')):
            shutil.rmtree(os.path.join(save_progress, 'temp'))

        # force to 100% since we saved last 5 percent
        previous_progress = progress_bar.n
        progress_bar.update(progress_bar.total - progress_bar.n)

        if progress_callback is not None:
            update, progress_percent, time_elapsed = update_progress_callback_parameters(
                progress_bar, previous_progress)
            progress_callback(update, progress_percent, time_elapsed)

        progress_bar.refresh()

    return feature_matrix
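A hedged end-to-end sketch of calculate_feature_matrix with a per-instance cutoff_time DataFrame, again assuming featuretools 0.x and the mock-customer demo EntitySet:

import pandas as pd
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)

# build feature definitions first, then compute the matrix at per-instance cutoff times
features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

cutoff_time = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "time": pd.to_datetime(["2014-01-01 01:00",
                            "2014-01-01 02:00",
                            "2014-01-01 03:00"]),
})

fm = ft.calculate_feature_matrix(
    features=features,
    entityset=es,
    cutoff_time=cutoff_time,
    cutoff_time_in_index=True,   # index becomes (customer_id, time)
    training_window="1 hour",    # only use data from the hour before each cutoff
)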
Example #27
def infer_variable_types(df, link_vars, variable_types, time_index,
                         secondary_time_index):
    '''Infer variable types from dataframe

    Args:
        df (DataFrame): Input DataFrame
        link_vars (list[str]): Linked variable ids
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or (type, kwargs) to pass keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str -> list[str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time index
    '''
    # TODO: set pk and pk types here
    inferred_types = {}
    vids_to_assume_datetime = [time_index]
    if len(list(secondary_time_index.keys())):
        vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
    inferred_type = vtypes.Unknown
    for variable in df.columns:
        if variable in variable_types:
            continue
        elif isinstance(df, dd.DataFrame):
            msg = 'Variable types cannot be inferred from Dask DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif is_instance(df, ks, 'DataFrame'):
            msg = 'Variable types cannot be inferred from Koalas DataFrames, ' \
                  'use variable_types to provide type metadata for entity'
            raise ValueError(msg)
        elif variable in vids_to_assume_datetime:
            if col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric

        elif variable in link_vars:
            inferred_type = vtypes.Categorical

        elif df[variable].dtype == "object":
            if not len(df[variable]):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # heuristics to predict that this is some type other than categorical
                sample = df[variable].sample(min(10000, len(df[variable])))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.NaturalLanguage
                except AttributeError:
                    pass

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif len(df[variable]):
            n = min(10000, len(df[variable]))
            sample = df[variable].sample(n=n)
            n_unique = len(sample.unique())
            percent_unique = n_unique / sample.size

            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type

    return inferred_types
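A standalone, pandas-only approximation of the object-dtype heuristic above (a long average string length suggests natural language, otherwise categorical); the function name and thresholds are illustrative, not part of the library:

import pandas as pd

def sketch_infer_object_type(series, sample_size=10000, avg_length_threshold=50):
    """Rough approximation of the object-dtype heuristic in infer_variable_types;
    illustrative only."""
    if not len(series):
        return "Categorical"
    sample = series.sample(min(sample_size, len(series)))
    try:
        # long strings on average look like free text
        if sample.str.len().mean() > avg_length_threshold:
            return "NaturalLanguage"
    except AttributeError:
        # object column whose values are not strings
        pass
    return "Categorical"

reviews = pd.Series(["This product exceeded all of my expectations in every possible way."] * 5)
codes = pd.Series(["ok", "error", "ok", "retry"])
print(sketch_infer_object_type(reviews))  # NaturalLanguage
print(sketch_infer_object_type(codes))    # Categorical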
Example #28
def test_is_instance_errors_mismatch():
    msg = 'Number of modules does not match number of classnames'
    with pytest.raises(ValueError, match=msg):
        is_instance('abc', pd, ('DataFrame', 'Series'))
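For context, a minimal sketch of what an is_instance helper with this error could look like: it resolves class names on optional modules (which may be missing) and validates that modules and classnames line up. This is an assumption about the shape of the helper, not featuretools' actual implementation:

import pandas as pd

def is_instance_sketch(obj, modules, classnames):
    # normalize to tuples so a single module/classname can be passed directly
    if not isinstance(modules, tuple):
        modules = (modules,)
    if not isinstance(classnames, tuple):
        classnames = (classnames,)
    if len(modules) != len(classnames):
        raise ValueError('Number of modules does not match number of classnames')
    # resolve each classname on its module, skipping modules that are not installed
    to_check = tuple(getattr(module, classname)
                     for module, classname in zip(modules, classnames)
                     if module is not None and hasattr(module, classname))
    return isinstance(obj, to_check)

print(is_instance_sketch([1, 2], pd, 'Series'))             # False
print(is_instance_sketch(pd.Series([1, 2]), pd, 'Series'))  # True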
Example #29
    def normalize_entity(self, base_entity_id, new_entity_id, index,
                         additional_variables=None, copy_variables=None,
                         make_time_index=None,
                         make_secondary_time_index=None,
                         new_entity_time_index=None,
                         new_entity_secondary_time_index=None):
        """Create a new entity and relationship from unique values of an existing variable.

        Args:
            base_entity_id (str) : Entity id from which to split.

            new_entity_id (str): Id of the new entity.

            index (str): Variable in old entity
                that will become index of new entity. Relationship
                will be created across this variable.

            additional_variables (list[str]):
                List of variable ids to remove from
                base_entity and move to new entity.

            copy_variables (list[str]): List of
                variable ids to copy from old entity
                and move to new entity.

            make_time_index (bool or str, optional): Create time index for new entity based
                on time index in base_entity, optionally specifying which variable in base_entity
                to use for time_index. If specified as True without a specific variable,
                uses the primary time index. Defaults to True if base entity has a time index.

            make_secondary_time_index (dict[str -> list[str]], optional): Create a secondary time index
                from the dictionary key. The values of the dictionary
                are the variables to associate with that secondary time index. Only one
                secondary time index is allowed. If None, only associate the time index.

            new_entity_time_index (str, optional): Rename new entity time index.

            new_entity_secondary_time_index (str, optional): Rename new entity secondary time index.

        """
        base_entity = self.entity_dict[base_entity_id]
        additional_variables = additional_variables or []
        copy_variables = copy_variables or []

        # Check base entity to make sure time index is valid
        if base_entity.time_index is not None:
            t_index = base_entity[base_entity.time_index]
            if not isinstance(t_index, (vtypes.NumericTimeIndex, vtypes.DatetimeTimeIndex)):
                base_error = "Time index '{0}' is not a NumericTimeIndex or DatetimeTimeIndex, but type {1}. Use set_time_index on entity '{2}' to set the time_index."
                raise TypeError(base_error.format(base_entity.time_index, type(t_index), str(base_entity.id)))

        if not isinstance(additional_variables, list):
            raise TypeError("'additional_variables' must be a list, but received type {}"
                            .format(type(additional_variables)))

        if len(additional_variables) != len(set(additional_variables)):
            raise ValueError("'additional_variables' contains duplicate variables. All variables must be unique.")

        if not isinstance(copy_variables, list):
            raise TypeError("'copy_variables' must be a list, but received type {}"
                            .format(type(copy_variables)))

        if len(copy_variables) != len(set(copy_variables)):
            raise ValueError("'copy_variables' contains duplicate variables. All variables must be unique.")

        for v in additional_variables + copy_variables:
            if v == index:
                raise ValueError("Not copying {} as both index and variable".format(v))

        for v in additional_variables:
            if v == base_entity.time_index:
                raise ValueError("Not moving {} as it is the base time index variable. Perhaps, move the variable to the copy_variables.".format(v))

        if isinstance(make_time_index, str):
            if make_time_index not in base_entity.df.columns:
                raise ValueError("'make_time_index' must be a variable in the base entity")
            elif make_time_index not in additional_variables + copy_variables:
                raise ValueError("'make_time_index' must be specified in 'additional_variables' or 'copy_variables'")
        if index == base_entity.index:
            raise ValueError("'index' must be different from the index column of the base entity")

        transfer_types = {}
        transfer_types[index] = type(base_entity[index])
        for v in additional_variables + copy_variables:
            if type(base_entity[v]) == vtypes.DatetimeTimeIndex:
                transfer_types[v] = vtypes.Datetime
            elif type(base_entity[v]) == vtypes.NumericTimeIndex:
                transfer_types[v] = vtypes.Numeric
            else:
                transfer_types[v] = type(base_entity[v])

        # create and add new entity
        new_entity_df = self[base_entity_id].df.copy()

        if make_time_index is None and base_entity.time_index is not None:
            make_time_index = True

        if isinstance(make_time_index, str):
            # Set the new time index to make_time_index.
            base_time_index = make_time_index
            new_entity_time_index = make_time_index
            already_sorted = (new_entity_time_index == base_entity.time_index)
        elif make_time_index:
            # Create a new time index based on the base entity time index.
            base_time_index = base_entity.time_index
            if new_entity_time_index is None:
                new_entity_time_index = "first_%s_time" % (base_entity.id)

            already_sorted = True

            assert base_entity.time_index is not None, \
                "Base entity doesn't have time_index defined"

            if base_time_index not in [v for v in additional_variables]:
                copy_variables.append(base_time_index)

            transfer_types[new_entity_time_index] = type(base_entity[base_entity.time_index])
        else:
            new_entity_time_index = None
            already_sorted = False

        if new_entity_time_index is not None and new_entity_time_index == index:
            raise ValueError("time_index and index cannot be the same value, %s" % (new_entity_time_index))

        selected_variables = [index] +\
            [v for v in additional_variables] +\
            [v for v in copy_variables]

        new_entity_df2 = new_entity_df. \
            drop_duplicates(index, keep='first')[selected_variables]

        if make_time_index:
            new_entity_df2 = new_entity_df2.rename(columns={base_time_index: new_entity_time_index})
        if make_secondary_time_index:
            assert len(make_secondary_time_index) == 1, "Can only provide 1 secondary time index"
            secondary_time_index = list(make_secondary_time_index.keys())[0]

            secondary_variables = [index, secondary_time_index] + list(make_secondary_time_index.values())[0]
            secondary_df = new_entity_df. \
                drop_duplicates(index, keep='last')[secondary_variables]
            if new_entity_secondary_time_index:
                secondary_df = secondary_df.rename(columns={secondary_time_index: new_entity_secondary_time_index})
                secondary_time_index = new_entity_secondary_time_index
            else:
                new_entity_secondary_time_index = secondary_time_index
            secondary_df = secondary_df.set_index(index)
            new_entity_df = new_entity_df2.join(secondary_df, on=index)
        else:
            new_entity_df = new_entity_df2

        base_entity_index = index

        transfer_types[index] = vtypes.Categorical
        if make_secondary_time_index:
            old_ti_name = list(make_secondary_time_index.keys())[0]
            ti_cols = list(make_secondary_time_index.values())[0]
            ti_cols = [c if c != old_ti_name else secondary_time_index for c in ti_cols]
            make_secondary_time_index = {secondary_time_index: ti_cols}

        if is_instance(new_entity_df, ks, 'DataFrame'):
            already_sorted = False

        self.entity_from_dataframe(
            new_entity_id,
            new_entity_df,
            index,
            already_sorted=already_sorted,
            time_index=new_entity_time_index,
            secondary_time_index=make_secondary_time_index,
            variable_types=transfer_types)

        self.entity_dict[base_entity_id].delete_variables(additional_variables)

        new_entity = self.entity_dict[new_entity_id]
        base_entity.convert_variable_type(base_entity_index, vtypes.Id, convert_data=False)
        self.add_relationship(Relationship(new_entity[index], base_entity[base_entity_index]))
        self.reset_data_description()
        return self
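A hedged usage sketch of normalize_entity on a small hand-built EntitySet (featuretools 0.x API assumed; the table and column names are made up for illustration):

import pandas as pd
import featuretools as ft

transactions = pd.DataFrame({
    "transaction_id": [1, 2, 3, 4],
    "session_id": [1, 1, 2, 2],
    "transaction_time": pd.to_datetime(["2020-01-01 00:00", "2020-01-01 00:05",
                                        "2020-01-02 00:00", "2020-01-02 00:10"]),
    "amount": [10.0, 20.0, 5.0, 7.5],
})

es = ft.EntitySet(id="store")
es = es.entity_from_dataframe(entity_id="transactions",
                              dataframe=transactions,
                              index="transaction_id",
                              time_index="transaction_time")

# split out a "sessions" entity keyed by session_id; because the base entity has a
# time index, a "first_transactions_time" time index is created on the new entity
es = es.normalize_entity(base_entity_id="transactions",
                         new_entity_id="sessions",
                         index="session_id")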
    def run(self,
            instance_ids,
            progress_callback=None,
            include_cutoff_time=True):
        """
        Calculate values of features for the given instances of the target
        dataframe.

        Summary of algorithm:
        1. Construct a trie where the edges are relationships and each node
            contains a set of features for a single dataframe. See
            FeatureSet._build_feature_trie.
        2. Initialize a trie for storing dataframes.
        3. Traverse the trie using depth first search. At each node calculate
            the features and store the resulting dataframe in the dataframe
            trie (so that its values can be used by features which depend on
            these features). See _calculate_features_for_dataframe.
        4. Get the dataframe at the root of the trie (for the target dataframe) and
            return the columns corresponding to the requested features.

        Args:
            instance_ids (np.ndarray or pd.Categorical): Instance ids for which
                to build features.

            progress_callback (callable): function to be called with incremental progress updates

            include_cutoff_time (bool): If True, data at cutoff time are included
                in calculating features.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.
        """
        assert len(instance_ids) > 0, "0 instance ids provided"

        if progress_callback is None:
            # do nothing for the progress callback if not provided
            def progress_callback(*args):
                pass

        feature_trie = self.feature_set.feature_trie

        df_trie = Trie(path_constructor=RelationshipPath)
        full_dataframe_trie = Trie(path_constructor=RelationshipPath)

        target_dataframe = self.entityset[self.feature_set.target_df_name]

        self._calculate_features_for_dataframe(
            dataframe_name=self.feature_set.target_df_name,
            feature_trie=feature_trie,
            df_trie=df_trie,
            full_dataframe_trie=full_dataframe_trie,
            precalculated_trie=self.precalculated_features,
            filter_column=target_dataframe.ww.index,
            filter_values=instance_ids,
            progress_callback=progress_callback,
            include_cutoff_time=include_cutoff_time,
        )

        # The dataframe for the target dataframe should be stored at the root of
        # df_trie.
        df = df_trie.value

        # Fill in empty rows with default values. This only works for pandas dataframes
        # and is not currently supported for Dask dataframes.
        if isinstance(df, pd.DataFrame):
            index_dtype = df.index.dtype.name
            if df.empty:
                return self.generate_default_df(instance_ids=instance_ids)

            missing_ids = [
                i for i in instance_ids
                if i not in df[target_dataframe.ww.index]
            ]
            if missing_ids:
                default_df = self.generate_default_df(instance_ids=missing_ids,
                                                      extra_columns=df.columns)

                df = default_df.append(df, sort=True)

            df.index.name = self.entityset[
                self.feature_set.target_df_name].ww.index

            # Order by instance_ids
            unique_instance_ids = pd.unique(instance_ids)
            unique_instance_ids = unique_instance_ids.astype(
                instance_ids.dtype)
            df = df.reindex(unique_instance_ids)

            # Keep categorical index if original index was categorical
            if index_dtype == "category":
                df.index = df.index.astype("category")

        column_list = []

        for feat in self.feature_set.target_features:
            column_list.extend(feat.get_feature_names())

        if is_instance(df, (dd, ps), "DataFrame"):
            column_list.extend([target_dataframe.ww.index])

        return df[column_list]
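A minimal, standalone sketch of the path-keyed trie idea described in the run docstring (one stored value per relationship path, with the target result read back from the root); this is illustrative and not the library's Trie or RelationshipPath classes:

class PathTrie:
    """Each node stores a value (e.g. a calculated dataframe) plus children
    keyed by a relationship edge; illustrative only."""
    def __init__(self):
        self.value = None
        self._children = {}

    def get_node(self, path):
        # walk (and create) nodes along the given path of edges
        node = self
        for edge in path:
            node = node._children.setdefault(edge, PathTrie())
        return node

# store per-path results during traversal, then read the target result at the root
trie = PathTrie()
trie.get_node(("sessions", "transactions")).value = "child feature frame"
trie.value = "target feature frame"
assert trie.get_node(()).value == "target feature frame"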