Example #1
    def to_timeseries(self):
        '''
        Merge the collection of groups in the GroupedTimeSeries into a single
        TimeSeries.

        Returns
        -------
        out : TimeSeries


        Examples
        --------
        .. sourcecode:: python

          >>> data = grouped_ts.to_timeseries()
          >>> data
          +---------------------+-------+
          |        index        | value |
          +---------------------+-------+
          | 2015-01-01 00:00:00 |   0   |
          | 2015-01-02 00:00:00 |   1   |
          | 2015-01-03 00:00:00 |   0   |
          | 2015-01-04 00:00:00 |   1   |
          | 2015-01-05 00:00:00 |   0   |
          | 2015-01-06 00:00:00 |   1   |
          | 2015-01-07 00:00:00 |   0   |
          | 2015-01-08 00:00:00 |   1   |
          | 2015-01-09 00:00:00 |   0   |
          | 2015-01-10 00:00:00 |   1   |
          +---------------------+-------+
          [366 rows x 2 columns]
        '''
        _mt._get_metric_tracker().track('grouped_timeseries.to_timeseries')
        return _graphlab.TimeSeries(self._grouped_ts.sframe,
                                    index=self.index_col_name)

    def get(self, field):
        """
        Return the value contained in the model's ``field``.

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field.

        See Also
        --------
        list_fields
        """
        _mt._get_metric_tracker().track(
            'toolkits.anomaly_detection.bayesian_changepoints.get')
        if field == "scores" and self.__proxy__.get(
                'dataset_type') == 'TimeSeries':
            ts = self.__proxy__.get('scores')
            return _gl.TimeSeries(ts,
                                  index=self.__proxy__.get_index_col_name())
        else:
            return self.__proxy__.get(field)
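A minimal round-trip sketch of the pattern above, assuming `import graphlab as gl` and a TimeSeries `ts` with a 'words' column as built in Example #6 below; treating `groups()` as the group-listing method is my reading of the GroupedTimeSeries API:

grouped_ts = ts.group('words')        # one group per distinct word
print(grouped_ts.groups())            # e.g. ['day', 'night']
merged = grouped_ts.to_timeseries()   # merged back into one sorted TimeSeries
assert merged.index_col_name == ts.index_col_name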
Example #3
        def generator():
            elems_at_a_time = 16
            self._grouped_ts.begin_iterator()
            ret = self._grouped_ts.iterator_get_next(elems_at_a_time)
            while True:
                for j in ret:
                    try:
                        j[1].remove_columns(self._temp_col_names)
                    except KeyError:
                        pass
                    j[1] = _graphlab.TimeSeries(j[1],
                                                self.index_col_name,
                                                is_sorted=True)
                    yield tuple(j)

                if len(ret) == elems_at_a_time:
                    ret = self._grouped_ts.iterator_get_next(elems_at_a_time)
                else:
                    break
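If this generator is what backs GroupedTimeSeries iteration (an assumption on my part), usage reduces to a plain loop over (group_name, TimeSeries) pairs, with the 16-element batching hidden inside:

# Hedged usage sketch: each element is a (group_name, TimeSeries) tuple
# with the temporary bookkeeping columns already stripped.
for name, group_ts in ts.group('words'):
    print(name, len(group_ts))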
Example #4
    def _load_version(self, unpickler, version):
        """
        A function to load a previously saved MovingZScoreModel
        instance.

        Parameters
        ----------
        unpickler : GLUnpickler
            A GLUnpickler file handler.

        version : int
            Version number maintained by the class writer.
        """
        state = unpickler.load()

        if state['dataset_type'] == 'TimeSeries':
            state['scores'] = _gl.TimeSeries(state['scores'],
                                             index=state['index_col_name'])
            state.pop('index_col_name')

        if version == 0:
            state['min_observations'] = None

        return MovingZScoreModel(state)
Example #5
    def _save_impl(self, pickler):
        """
        Save the model as a directory, which can be loaded with the
        :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        pickler : GLPickler
            An opened GLPickle archive (Do not close the archive).

        See Also
        --------
        graphlab.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = graphlab.load_model('my_model_file')
        """

        ## The GL pickler does not support TimeSeries, so we need to convert
        #  and un-convert to SFrame here. Furthermore, the proxy does not
        #  support copying, so we need to change proxy itself, then change it
        #  back.
        if self.__proxy__['dataset_type'] == 'TimeSeries':
            self.__proxy__['index_col_name'] = self.__proxy__['scores'].index_col_name
            self.__proxy__['scores'] = self.__proxy__['scores'].to_sframe()

            pickler.dump(self.__proxy__)

            self.__proxy__['scores'] = _gl.TimeSeries(self.__proxy__['scores'],
                                        index=self.__proxy__['index_col_name'])
            self.__proxy__.pop('index_col_name')

        else:
            pickler.dump(self.__proxy__)
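The convert/un-convert dance above leans on a simple round trip, sketched here assuming `import graphlab as gl` and that only the index column name needs to be carried separately:

sf = ts.to_sframe()                          # index becomes a plain column
ts_again = gl.TimeSeries(sf, index=ts.index_col_name)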
Example #6
    def get_group(self, name):
        """
        Get the TimeSeries associated with the group `name`.

        The name of the group corresponds to the distinct value in the
        column(s) that the group was performed on. Check the output of
        `graphlab.timeseries.GroupedTimeSeries.groups` for all available group
        names.

        Parameters
        ----------
        name : value | list
            Name of the group(s). If the group was performed on more than one
            column, the name is a list of the group values, in the same order
            they were given to the `group` call.

        Returns
        -------
        ts : `graphlab.TimeSeries`

        Examples
        --------
        >>> import datetime as dt
        >>> start = dt.datetime(2013, 5, 7)
        >>> end = dt.datetime(2013, 5, 9, 23, 59, 59)
        >>> sa = gl.TimeSeries.date_range(start,end,dt.timedelta(hours=12))
        >>> sf = gl.SFrame({'time':sa,
        ... 'numbers':[(i % 2) for i in range(0,len(sa))],
        ... 'words':['day' if (i % 2) else 'night' for i in range(0,len(sa))]})

        # Create a timeseries.
        >>> ts = gl.TimeSeries(sf, index='time')
        >>> print ts
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-07 00:00:00 |    0    | night |
        | 2013-05-07 12:00:00 |    1    |  day  |
        | 2013-05-08 00:00:00 |    0    | night |
        | 2013-05-08 12:00:00 |    1    |  day  |
        | 2013-05-09 00:00:00 |    0    | night |
        | 2013-05-09 12:00:00 |    1    |  day  |
        +---------------------+---------+-------+
        [6 rows x 3 columns]
        The index column of the TimeSeries is: time

        # Group the timeseries by hour.
        >>> by_hour = ts.group(ts.date_part.HOUR)
        >>> by_hour.get_group(12)
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-07 12:00:00 |    1    |  day  |
        | 2013-05-08 12:00:00 |    1    |  day  |
        | 2013-05-09 12:00:00 |    1    |  day  |
        +---------------------+---------+-------+
        [3 rows x 3 columns]
        The index column of the TimeSeries is: time

        >>> by_word = ts.group('words')
        >>> by_word.get_group('night')
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-07 00:00:00 |    0    | night |
        | 2013-05-08 00:00:00 |    0    | night |
        | 2013-05-09 00:00:00 |    0    | night |
        +---------------------+---------+-------+
        [3 rows x 3 columns]
        The index column of the TimeSeries is: time

        >>> by_num = ts.group('numbers')
        >>> by_num.get_group(1)
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-07 12:00:00 |    1    |  day  |
        | 2013-05-08 12:00:00 |    1    |  day  |
        | 2013-05-09 12:00:00 |    1    |  day  |
        +---------------------+---------+-------+
        [3 rows x 3 columns]
        The index column of the TimeSeries is: time

        >>> by_both = ts.group(['numbers','words'])
        >>> by_both.get_group([1, 'day'])
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-07 12:00:00 |    1    |  day  |
        | 2013-05-08 12:00:00 |    1    |  day  |
        | 2013-05-09 12:00:00 |    1    |  day  |
        +---------------------+---------+-------+
        [3 rows x 3 columns]
        The index column of the TimeSeries is: time

        >>> by_day = ts.group([ts.date_part.YEAR,
        ...                    ts.date_part.MONTH,
        ...                    ts.date_part.DAY])
        >>> by_day.get_group([2013,5,9])
        +---------------------+---------+-------+
        |         time        | numbers | words |
        +---------------------+---------+-------+
        | 2013-05-09 00:00:00 |    0    | night |
        | 2013-05-09 12:00:00 |    1    |  day  |
        +---------------------+---------+-------+
        [2 rows x 3 columns]
        The index column of the TimeSeries is: time
        """
        if not isinstance(name, list):
            name = [name]

        # HUGE hack to prevent list of ints from converting to list of floats
        # on C++ side
        name.append(None)
        src_sf = self._grouped_ts.get_group(name)
        try:
            src_sf.remove_columns(self._temp_col_names)
        except KeyError:
            pass
        return _graphlab.TimeSeries(src_sf,
                                    self.index_col_name,
                                    is_sorted=True)
Example #7
data['ho'] = data['high'] - data['open']  # distance between Highest and Opening price
data['lo'] = data['low'] - data['open']   # distance between Lowest and Opening price
data['gain'] = data['close'] - data['open']

# feature generation
rsi_14 = RSI(14).generate(data)
rsi_5 = RSI(5).generate(data)
# rsi_14 = FeatureFactory.generate_rsi(data, n = 14)
# rsi_5 = FeatureFactory.generate_rsi(data, n = 5)
ma_20 = FeatureFactory.generate_moving_average(data, period=20)
ma_5 = FeatureFactory.generate_moving_average(data, period=5)
crossMA1_10 = crossMA(1, 10).generate(data)
crossMA5_20 = crossMA(5, 20).generate(data)

ts = gl.TimeSeries(data, index='datetime')
# add the outcome variable, 1 if the bar was positive (close>open), 0 otherwise
ts['outcome'] = ts.apply(lambda x: 1 if x['close'] > x['open'] else -1)
# ts['ma5-20'] = ts[ma_5] - ts[ma_20]
# GENERATE SOME LAGGED TIMESERIES
ts_1 = ts.shift(1)  # by 1 day
ts_2 = ts.shift(2)
ts_3 = ts.shift(3)
ts['open_above_close'] = ts['open'] > ts_1['close']
ts['dRtn'] = ts['close'] / ts_1['close']
ts['idRtn'] = ts['close'] / ts['open']
ts['jump'] = ts['open'] > ts_1['high']
ts['aboveMA'] = ts['open'] > ts_1[ma_20]
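A toy sketch of the shift semantics relied on above, per my reading of the API: shift(1) aligns each row with the previous row's values, leaving None in the first row, so elementwise division yields a one-bar return:

import datetime as dt
import graphlab as gl

toy = gl.TimeSeries(gl.SFrame({
    'datetime': [dt.datetime(2015, 1, d) for d in (1, 2, 3)],
    'close': [10.0, 11.0, 12.1]}), index='datetime')
prev = toy.shift(1)                          # close: [None, 10.0, 11.0]
toy['dRtn'] = toy['close'] / prev['close']   # [None, 1.1, 1.1]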
Example #8
import graphlab as gl
import datetime as dt
# household_data = gl.SFrame(
#       "https://static.turi.com/datasets/household_electric_sample/household_electric_sample.sf")
#
# household_data.save("household_data")

household_data = gl.SFrame("household_data")
print(household_data.head(10))

household_ts = gl.TimeSeries(household_data, index="DateTime")
print(household_ts.head(10))
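A hedged follow-up: downsample the readings to hourly means with resample. 'Global_active_power' is assumed here as a value column in the household dataset; substitute whichever column is actually present:

import graphlab.aggregate as agg

hourly = household_ts.resample(dt.timedelta(hours=1),
                               agg.MEAN('Global_active_power'))
print(hourly.head(5))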
Example #9
import datetime

import graphlab as gl
from dateutil import parser as datetime_parser


### Load Data ###

# Table of product purchases
purchases = gl.SFrame.read_csv('dataset/online_retail.csv')


### Prepare Data ###

# Convert InvoiceDate strings (e.g. "12/1/10 8:26") to datetimes
purchases['InvoiceDate'] = purchases['InvoiceDate'].apply(datetime_parser.parse)

# Create a TimeSeries
timeseries = gl.TimeSeries(purchases, 'InvoiceDate')


### Train the churn predictor model ###

# Split the data into train and validation
train, valid = gl.churn_predictor.random_split(timeseries, user_id='CustomerID',
                                               fraction=0.8, seed=1)

# A churn forecast requires a time boundary and a churn period.
# Activity before the boundary is used to train the model.
# After the boundary, activity (or lack of activity)
# during the churn period is used to define whether the
# user churned.

# Train the model using data before August
churn_boundary_oct = datetime.datetime(year = 2011, month = 8, day = 1)
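A sketch of the training call the comments build toward; the parameter names and the 30-day default churn period follow my understanding of the churn_predictor API and should be checked against its docs:

# Activity before churn_boundary_oct trains the model; (in)activity in the
# churn period after the boundary defines who counts as churned.
model = gl.churn_predictor.create(train,
                                  user_id='CustomerID',
                                  churn_period=datetime.timedelta(days=30),
                                  time_boundaries=[churn_boundary_oct])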
Example #10
def create(dataset, features=None, verbose=True):
    """
    Create an anomaly detection model. Based on the type of the input data,
    this function automatically chooses the anomaly detection model and the type
    of anomalies to search for. Generally speaking, if the input data appears
    to be a time series---if the dataset type is TimeSeries, one of the
    features is of type datetime.datetime, or there is only a single
    feature---the toolkit chooses the moving Z-score model.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input dataset. Determines the type of anomaly detection model and types
        of anomalies to search for.

    features : list[str], optional
        Names of columns in the input 'dataset' to use as features.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    model : GraphLab Create model

    See Also
    --------
    local_outlier_factor.create, graphlab.toolkits.dbscan.create

    Examples
    --------
    >>> sf = graphlab.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.],
    ...                       'x1': [2., 1., 0., 1., 2., 1.5, 2.5]})
    ...
    >>> m = graphlab.anomaly_detection.create(sf)
    >>> type(m)
    graphlab.toolkits.anomaly_detection.local_outlier_factor.LocalOutlierFactorModel
    ...
    >>> m['scores']
    +--------+----------------------+
    | row_id | local_outlier_factor |
    +--------+----------------------+
    |   2    |    0.951567102896    |
    |   0    |    0.951567102896    |
    |   5    |    1.00783754045     |
    |   4    |    0.982224576307    |
    |   3    |    1.05829898642     |
    |   1    |    1.05829898642     |
    |   6    |    2.52792223974     |
    +--------+----------------------+
    [7 rows x 2 columns]
    """
    _mt._get_metric_tracker().track('toolkit.anomaly_detection.create')

    ## Basic validation of the input dataset.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1 or len(dataset.column_names()) < 1:
        raise TypeError("Input 'dataset' is empty.")

    ## Figure out the features and do basic validation.
    if features is None:
        features = dataset.column_names()

    if (not isinstance(features, list)
            or not all([type(c) == str for c in features])):

        raise TypeError("If specified, input 'features' must be a list " +
                        "of strings.")

    if not all([c in dataset.column_names() for c in features]):
        raise _ToolkitError("The specified features could not all be found " +
                            "in the input 'dataset'.")

    ## If any valid features are datetime types LOF is not valid.
    ## If there is more than one feature Z-score is not valid.

    # Figure out if there is a datetime column.
    col_types = {
        k: v
        for k, v in zip(dataset.column_names(), dataset.column_types())
    }

    datetime_features = [c for c in features if col_types[c] == _dt.datetime]
    value_features = [c for c in features if col_types[c] != _dt.datetime]

    ## Decide which model to use.
    try_zscore = False

    if isinstance(dataset, _gl.TimeSeries):
        try_zscore = True

    else:  # dataset is an SFrame
        if len(datetime_features) > 0:
            try_zscore = True

        if len(value_features) == 1 and (col_types[value_features[0]]
                                         in (int, float)):
            try_zscore = True

    ## Create the relevant model.
    bandwidth = max(1, int(0.05 * len(dataset)))

    if try_zscore:
        if len(value_features) != 1 or len(datetime_features) > 1:
            raise _ToolkitError(
                "Cannot select an appropriate anomaly " +
                "detection model. For a " +
                "local outlier factor model, please remove " +
                "any datetime-type features. For a moving " +
                "Z-score model, please identify one data " +
                "feature (integer- or float-type) and at most " +
                "one datetime column as an index (this indexing is done " +
                "automatically for TimeSeries objects).")

        if isinstance(dataset, _gl.SFrame) and len(datetime_features) == 1:
            _dataset = _gl.TimeSeries(dataset, index=datetime_features[0])
        else:
            _dataset = dataset[:]

        if verbose:
            print("Creating a moving Z-score anomaly detection model.")

        model = _gl.moving_zscore.create(dataset=_dataset,
                                         feature=value_features[0],
                                         window_size=bandwidth,
                                         verbose=verbose)

    ## If not doing the moving z-score, do local outlier factor.
    else:
        if verbose:
            print("Creating a local outlier factor model.")

        model = _gl.local_outlier_factor.create(dataset=dataset,
                                                features=features,
                                                num_neighbors=bandwidth,
                                                verbose=verbose)

    return model
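A small sketch of the dispatch above: a TimeSeries input (or an SFrame with one numeric feature plus a datetime column) takes the try_zscore path, while the multi-column numeric SFrame in the docstring example routes to local outlier factor:

import datetime as dt
import graphlab as gl

ts = gl.TimeSeries(gl.SFrame({
    'time': [dt.datetime(2015, 1, d) for d in range(1, 6)],
    'value': [1., 1., 9., 1., 1.]}), index='time')
m = gl.anomaly_detection.create(ts)   # selects the moving Z-score model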
Example #11
# just to check that the data is sorted in ascending order
qq.head(3)

qq.save("SP500_daily.bin")
# once data is saved, we can use the following instruction to retrieve it
qq = gl.SFrame("SP500_daily.bin/")

# add the outcome variable, 1 if the trading session was positive (close>open), 0 otherwise
qq['outcome'] = qq.apply(lambda x: 1 if x['close'] > x['open'] else -1)
# we also need to add three new columns 'ho', 'lo' and 'gain'
# they will be useful to backtest the model, later
qq['ho'] = qq['high'] - qq['open'] # distance between Highest and Opening price
qq['lo'] = qq['low'] - qq['open'] # distance between Lowest and Opening price
qq['gain'] = qq['close'] - qq['open']

ts = gl.TimeSeries(qq, index='datetime')
# add the outcome variable, 1 if the bar was positive (close>open), 0 otherwise
ts['outcome'] = ts.apply(lambda x: 1 if x['close'] > x['open'] else -1)

# GENERATE SOME LAGGED TIMESERIES
ts_1 = ts.shift(1) # by 1 day
ts_2 = ts.shift(2) # by 2 days
# ...etc....
# how many days of lag are needed to create a good forecaster is an arbitrary
# choice, so feel free to experiment

# add_features is a helper function (out of the scope of this article) that
# returns a tuple with:
# ts: the timeseries, extended with lagged columns as well as the engineered
#     features used to train the model, as shown above with the feat1 and
#     feat2 examples
# l_features: a list of all features used to train classifier models
# (a minimal hypothetical version is sketched below)
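Here is a minimal, hypothetical version of that helper, consistent with the description above; the name, signature, and defaults are illustrative only:

def add_features(ts, cols=('close', 'gain'), lags=(1, 2, 3)):
    # Join lagged copies of the chosen columns onto the TimeSeries as
    # '<col>_lag<k>' and collect the new names for classifier training.
    l_features = []
    for k in lags:
        lagged = ts.shift(k)
        for col in cols:
            name = '%s_lag%d' % (col, k)
            ts[name] = lagged[col]
            l_features.append(name)
    return ts, l_features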
Example #12
# imports needed by this snippet (not shown in the original excerpt)
import datetime as dt

import graphlab as gl
import graphlab.aggregate as agg
from dateutil.parser import parse

# g1 = gn['genre1'].unique()
# g2 = gn['genre2'].unique()
# g3 = gn['genre3'].unique()

genres = {
    'genre1': gn['genre1'].unique(),
    'genre2': gn['genre2'].unique(),
    'genre3': gn['genre3'].unique()
}

joined = scrobbles.join(gn, on='songID')

joined['ts'] = joined['ts'].apply(lambda x: parse(x))

ts = gl.TimeSeries(joined, index='ts')
ts.save('ts')

total_listens = ts.resample(dt.timedelta(days=1), agg.COUNT())
total_listens.save(rootdir + '_total_listens')

for level in ('genre1', 'genre2', 'genre3'):
    n = len(genres[level])
    for i, genre in enumerate(genres[level]):

        current = ts[ts[level] == genre].resample(dt.timedelta(days=1),
                                                  agg.COUNT())
        #current.save(rootdir+level+'_'+genre)
        current.to_sframe().to_dataframe().to_pickle(rootdir + level + '_' +
                                                     genre.replace('/', '-') +
                                                     '.pkl')
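The per-genre pickles written above are plain pandas DataFrames, so they reload without graphlab; the genre name in this path is illustrative:

import pandas as pd

daily_counts = pd.read_pickle(rootdir + 'genre1_Rock.pkl')  # hypothetical genre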
Example #13
    def update(self, dataset, window_size=None, min_observations=None,
               verbose=True):
        """
        Create a new `MovingZScoreModel` with a new dataset. The `window_size`
        and `min_observations` parameters can also be updated with this method.

        The new model contains anomaly scores for each observation in the new
        `dataset`. In addition, the last `window_size` rows of the existing
        model's data and anomaly scores are prepended, for continuity and to
        show how the anomaly score is computed for the first few rows of the
        new `dataset`.

        Parameters
        ----------
        dataset : SFrame or TimeSeries
            New data to use for updating the model. The type of the input
            'dataset' must match the type of the data already in the model (if
            the model has data already).

        window_size : int, optional
            Length of the time window to use for defining the moving z-score
            value, in terms of number of observations. The window size will be
            the same as the current model's window size if a new window is not
            specified.

        min_observations : int, optional
            Minimum number of non-missing observations in the moving window
            required to compute the moving Z-score. If unspecified, the entire
            moving window preceding an observation must not contain any missing
            values in order for the observation to get an anomaly score. This
            parameter will be the same as the current model's value if not
            specified.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : MovingZScoreModel
            A *new* MovingZScoreModel, with an updated dataset and anomaly
            scores for the updated dataset. The `scores` field of the new model
            has the same schema as the `scores` field of the existing model,
            but data prepended from the existing results have a row ID of
            'None'.

        See Also
        --------
        create

        Examples
        --------
        >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
        ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
        >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
        ...                                                         window_size=3,
        ...                                                         feature='value')
        ...
        >>> sf2 = graphlab.SFrame({'year': [2010, 2011, 2012, 2013],
        ...                        'value': [18.4, 12.1, 12.0, 3.6]})
        >>> model2 = model.update(sf2)
        >>> model2['scores'].print_rows(max_column_width=20)
        +--------+----------------+-------+----------------+---------------------+
        | row_id | anomaly_score  | value | moving_average |  model_update_time  |
        +--------+----------------+-------+----------------+---------------------+
        |  None  | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:58... |
        |  None  | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:58... |
        |  None  | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:58... |
        |   0    | 0.801849542822 |  18.4 | 14.4666666667  | 2016-01-04 16:58... |
        |   1    | 0.391346818515 |  12.1 | 13.4666666667  | 2016-01-04 16:58... |
        |   2    | 0.593171014002 |  12.0 |      13.9      | 2016-01-04 16:58... |
        |   3    | 3.52963789428  |  3.6  | 14.1666666667  | 2016-01-04 16:58... |
        +--------+----------------+-------+----------------+---------------------+
        [7 rows x 5 columns]
        """
        start_time = _time.time()
        _mt._get_metric_tracker().track(
                              'toolkit.anomaly_detection.moving_zscore.update')
        logger = _logging.getLogger(__name__)


        ## Validate the new dataset
        if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
            raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

        if len(dataset) < 1:
            raise TypeError("Input 'dataset' is empty.")

        if ((self.__proxy__['dataset_type'] == 'TimeSeries' and not isinstance(dataset, _gl.TimeSeries)) or
            (self.__proxy__['dataset_type'] == 'SFrame' and not isinstance(dataset, _gl.SFrame))):

            raise TypeError("New input 'dataset' must have the same type " +
                            "as the data already in the model.")

        ## Validate the new window size (if there is one), and figure out what
        #  the new window size will be.
        if window_size is None:
            window_size = self.__proxy__['window_size']

        else:
            if not isinstance(window_size, int):
                raise TypeError("Input 'window_size' must be an integer.")

            if window_size < 1:
                raise ValueError("Input 'window_size' must greater than or " +
                                 "equal to 1.")

        ## Validate and determine the `min_observations` parameter.
        if min_observations is None:
            min_observations = self.__proxy__['min_observations']

        else:
            if not isinstance(min_observations, int):
                raise TypeError("If specified, input 'min_observations' must " +
                                "be a positive integer.")

            if min_observations < 1:
                raise ValueError("If specified, input 'min_observations' must " +
                                 "be a positive integer.")


        ## TimeSeries-specific dataset validation.
        #  Make sure the new data occurs *after* the existing data.
        scores = self.__proxy__['scores']

        if isinstance(dataset, _gl.TimeSeries):
            first_new_timestamp = dataset[0][dataset.index_col_name]
            last_old_timestamp = scores[-1][scores.index_col_name]

            if first_new_timestamp < last_old_timestamp:
                raise _ToolkitError("The new dataset has data with " +
                                    "earlier timestamps than the existing " +
                                    "dataset. Please ensure that new data " +
                                    "occurs after existing data.")


        ## Extract the feature from the new dataset and validate it.
        feature = self.__proxy__['feature']

        try:
            series = dataset[feature]
        except:
            raise _ToolkitError("The feature specified by the original " +
                                "model could not be found in the input " +
                                "'dataset'.")

        if not series.dtype() in [int, float]:
            raise ValueError("The values in the specified feature must be " +
                             "integers or floats.")


        ## Create a new model and cut the old score object to the window size.
        new_state = {k: self.__proxy__[k]
            for k in ['verbose', 'feature', 'dataset_type']}

        new_state['window_size'] = window_size
        new_state['min_observations'] = min_observations

        new_model = MovingZScoreModel(new_state)


        ## Save just the old data needed for the moving statistics on the new
        #  data.
        if len(scores) < window_size:
            old_scores = scores[:]
        else:
            old_scores = scores[-window_size:]


        ## Compute Z-scores and anomaly scores.
        series = old_scores[feature].append(series)
        moving_average, moving_zscore, sufficient_data = \
            _moving_z_score(series, window_size, min_observations)

        anomaly_score = abs(moving_zscore)

        if not sufficient_data:
            logger.warning("The number of observations is smaller than " +
                           "the minimum number needed to compute a " +
                           "moving Z-score, so all anomaly scores are 'None'. " +
                           "Consider adding more data with the model's `update` " +
                           "method, or reducing the `window_size` or " +
                           "`min_observations` parameters.")

        ## General post-processing and formatting.
        scores = _gl.SFrame({feature: series,
                             'moving_average': moving_average,
                             'anomaly_score': anomaly_score})
        scores['model_update_time'] = _dt.datetime.now()

        scores = scores[[feature,  # reorder the columns
                         'moving_average',
                         'anomaly_score',
                         'model_update_time']]


        ## Replace the new Z-scores for the *old* data with the original
        #  Z-score for that data.
        num_new_examples = len(dataset)
        new_scores = scores[-num_new_examples:]

        if isinstance(dataset, _gl.TimeSeries):
            new_scores[dataset.index_col_name] = dataset[dataset.index_col_name]
            new_scores = _gl.TimeSeries(new_scores, index=dataset.index_col_name)

            ## The index column should have the same name in the old and new
            #  data. If it doesn't, change the name in the old scores.
            if dataset.index_col_name != old_scores.index_col_name:
                old_scores = old_scores.rename(
                           {old_scores.index_col_name: dataset.index_col_name})

                if verbose:
                    logger.warning("The new dataset's index column name " +
                                   "does not match the existing index " +
                                   "column name. The new name is used in " +
                                   "the new model.")

            final_scores = old_scores.union(new_scores)

        else:
            new_scores = new_scores.add_row_number('row_id')
            old_scores['row_id'] = None
            old_scores['row_id'] = old_scores['row_id'].astype(int)
            final_scores = old_scores.append(new_scores)


        ## Finalize and return the model.
        new_model.__proxy__['num_examples'] = len(scores)
        new_model.__proxy__['scores'] = final_scores
        new_model.__proxy__['training_time'] = _time.time() - start_time

        return new_model
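A hedged TimeSeries variant of the docstring example, assuming `model` was created from a TimeSeries whose index column is 'time' and whose data ends before 2016: the new data must not predate the existing scores, and the returned model keeps a TimeSeries scores table on the same index.

import datetime as dt
import graphlab as gl

new_data = gl.TimeSeries(gl.SFrame({
    'time': [dt.datetime(2016, 1, d) for d in (1, 2, 3)],
    'value': [12.1, 12.0, 3.6]}), index='time')
model2 = model.update(new_data)           # input type must match the old model
print(model2['scores'].index_col_name)    # 'time'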
Example #14
def create(dataset, window_size, feature=None, min_observations=None,
           verbose=True):
    """
    Create a :class:`MovingZScoreModel` model. This model fits a moving average
    to a univariate time series and identifies points that are far from the
    fitted curve. The MovingZScoreModel works with either TimeSeries or SFrame
    inputs. A uniform sampling rate is assumed and the data window must be
    defined in terms of number of observations.

    This model differs from other GraphLab Create models in that it can be
    created from an existing `MovingZScoreModel`. To create a new model in this
    fashion, use the existing model's `update` method.

    The model created by this function contains a table `scores` that contains
    the computed anomaly scores. The type of `scores` matches the type of the
    input `dataset`, and the table contains 5 columns:

        - *row id/time*: ID of the corresponding row in the input `dataset`. If
          `dataset` is an SFrame, this is the row numbers of the input data; if
          `dataset` is a TimeSeries, it is the index of the time series.

        - *anomaly score*: absolute value of the moving Z-score. A score of 0
          indicates the value is identical to the moving average. The higher
          the score, the more likely a point is to be an anomaly.

        - *value*: input data. The name of this column matches the input
          `feature`.

        - *moving average*: moving average of each point's preceding
          `window_size` values.

        - *model update time*: time the model was updated. This is particularly
          useful if the `window_size` is larger than the number of rows in the
          input datasets, because the `scores` table has results from several
          updates.

    Parameters
    ----------
    dataset : SFrame or TimeSeries
        Input data. The column named by the 'feature' parameter will be
        extracted for modeling.

    window_size : int
        Length of the time window to use for defining the moving z-score value,
        in terms of number of observations.

    feature : str, optional
        Name of the column to model. Any data provided to the model with either
        the `create` or `update` functions must have a column with this name.
        The feature name is not necessary if `dataset` is an SFrame with a
        single column or a TimeSeries with a single value column; it can be
        determined automatically in this case.

    min_observations : int, optional
        Minimum number of non-missing observations in the moving window
        required to compute the moving Z-score. If unspecified, the entire
        moving window preceding an observation must not contain any missing
        values in order for the observation to get an anomaly score.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : MovingZScoreModel
        A trained :class:`MovingZScoreModel`, which contains a table called
        `scores` that includes the anomaly score for each input data point. The
        type of the `scores` table matches the type of the input `dataset`.

    See Also
    --------
    MovingZScoreModel, MovingZScoreModel.update

    Notes
    -----
    - The moving Z-score for a data point :math:`x_t` is simply the value of
      :math:`x_t` standardized by subtracting the moving mean just prior to
      time :math:`t` and dividing by the moving standard deviation just prior
      to :math:`t`. Suppose :math:`w` stands for the `window_size` in terms of
      the number of observations. Then the moving Z-score is:

      .. math:: z(x_t) = \\frac{x_t - \\bar{x}_t}{s_t}

      where the moving average is:

      .. math:: \\bar{x}_t = (1/w) \\sum_{i=t-w}^{t-1} x_i

      and the moving standard deviation is:

      .. math:: s_t = \\sqrt{(1/w) \\sum_{i=t-w}^{t-1} (x_i - \\bar{x}_t)^2}

    - The moving Z-score at points within `window_size` observations of the
      beginning of a series are not defined, because there are insufficient
      points to compute the moving average and moving standard deviation. This
      is represented by missing values.

    - Missing values in the input dataset are assigned missing values ('None')
      for their anomaly scores as well.

    - If there is no variation in the values preceding a given observation, the
      moving Z-score can be infinite or undefined. If the given observation is
      equal to the moving average, the anomaly score is coded as 'nan'; if the
      observation is *not* equal to the moving average, the anomaly score is
      'inf'.

    Examples
    --------
    >>> sf = graphlab.SFrame({'year': [2007, 2007, 2008, 2009, 2010, 2010],
    ...                       'value': [12.2, 11.7, 12.5, 21.4, 10.8, 11.2]})
    >>> model = graphlab.anomaly_detection.moving_zscore.create(sf,
    ...                                                         window_size=3,
    ...                                                         feature='value')
    >>> model['scores'].print_rows(max_column_width=20)
    +--------+----------------+-------+----------------+---------------------+
    | row_id | anomaly_score  | value | moving_average |  model_update_time  |
    +--------+----------------+-------+----------------+---------------------+
    |   0    |      None      |  12.2 |      None      | 2016-01-04 16:55... |
    |   1    |      None      |  11.7 |      None      | 2016-01-04 16:55... |
    |   2    |      None      |  12.5 |      None      | 2016-01-04 16:55... |
    |   3    | 28.0822407386  |  21.4 | 12.1333333333  | 2016-01-04 16:55... |
    |   4    | 1.00086199482  |  10.8 |      15.2      | 2016-01-04 16:55... |
    |   5    | 0.795990414837 |  11.2 |      14.9      | 2016-01-04 16:55... |
    +--------+----------------+-------+----------------+---------------------+
    [6 rows x 5 columns]
    """

    _mt._get_metric_tracker().track(
                              'toolkit.anomaly_detection.moving_zscore.create')

    start_time = _time.time()
    logger = _logging.getLogger(__name__)


    ## Validate required inputs by themselves.
    if not isinstance(dataset, (_gl.SFrame, _gl.TimeSeries)):
        raise TypeError("Input 'dataset' must be an SFrame or TimeSeries.")

    if len(dataset) < 1:
        raise _ToolkitError("Input 'dataset' is empty.")

    if not isinstance(window_size, int):
        raise TypeError("Input 'window_size' must be an integer.")

    if window_size < 1:
        raise ValueError("Input 'window_size' must greater than or " +
                         "equal to 1.")

    if feature is not None and not isinstance(feature, str):
        raise TypeError("Input 'feature' must be a string if specified.")

    if min_observations is not None:
        if not isinstance(min_observations, int):
            raise TypeError("If specified, input 'min_observations' must " +
                            "be a positive integer.")

        if min_observations < 1:
            raise ValueError("If specified, input 'min_observations' must " +
                             "be a positive integer.")

    ## Determine the feature name if left unspecified.
    column_names = dataset.column_names() if isinstance(dataset, _gl.SFrame) \
        else dataset.value_col_names

    if feature is None:
        if len(column_names) == 1:
            feature = column_names[0]
        else:
            raise _ToolkitError("If the 'input' dataset has multiple " +
                                "columns, a 'feature' column name must be " +
                                "specified.")


    ## Extract the specified feature as an SArray.
    try:
        series = dataset[feature]
    except:
        raise _ToolkitError("The specified feature could not be found " +
                            "in the input 'dataset'.")


    ## Validate the type of the feature.
    if not series.dtype() in [int, float]:
        raise ValueError("The values in the specified feature must be " +
                         "integers or floats.")


    ## Compute the moving average, Z-score, and a final anomaly score. For all
    #  anomaly detection models, the final score should be in the range
    #  [0, \infty), with higher values indicating more outlier-ness.
    moving_average, moving_zscore, sufficient_data = \
        _moving_z_score(series, window_size, min_observations)

    anomaly_score = abs(moving_zscore)

    if not sufficient_data:
        logger.warning("The number of observations is smaller than " +
                       "the minimum number needed to compute a " +
                       "moving Z-score, so all anomaly scores are 'None'. " +
                       "Consider adding more data with the model's `update` " +
                       "method, or reducing the `window_size` or " +
                       "`min_observations` parameters.")

    ## Format the results.
    scores = _gl.SFrame({feature: series,
                         'moving_average': moving_average,
                         'anomaly_score': anomaly_score})
    scores['model_update_time'] = _dt.datetime.now()

    scores = scores[['anomaly_score', # reorder the columns
                     feature,
                     'moving_average',
                     'model_update_time']]

    if isinstance(dataset, _gl.SFrame):
        if feature != 'row_id':
            scores = scores.add_row_number('row_id')
        else:
            logger.warning("Feature name is 'row_id', so the " +
                           "index in the model's 'scores' SFrame " +
                           "is called '_row_id'.")
            scores = scores.add_row_number('_row_id')

    if isinstance(dataset, _gl.TimeSeries):
        scores[dataset.index_col_name] = dataset[dataset.index_col_name]
        scores = _gl.TimeSeries(scores, index=dataset.index_col_name)

    dataset_type = 'TimeSeries' if isinstance(dataset, _gl.TimeSeries) else 'SFrame'

    ## Set up the model.
    state = {
        'dataset_type': dataset_type,
        'verbose': verbose,
        'window_size': window_size,
        'min_observations': min_observations,
        'num_examples': len(dataset),
        'feature': feature,
        'training_time': _time.time() - start_time,
        'scores': scores}

    model = MovingZScoreModel(state)
    return model
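A plain-Python sketch of the moving Z-score defined in the Notes: standardize x_t by the mean and standard deviation of the w observations strictly before t. This mirrors the formulas, not the toolkit's implementation, and skips the missing-value handling described above:

import math

def moving_zscore(xs, w):
    out = [None] * len(xs)                  # first w scores are undefined
    for t in range(w, len(xs)):
        window = xs[t - w:t]
        mean = sum(window) / float(w)
        var = sum((x - mean) ** 2 for x in window) / float(w)
        if var > 0:
            out[t] = (xs[t] - mean) / math.sqrt(var)
        else:
            # no variation in the window: nan if x_t equals the mean, else inf
            out[t] = float('nan') if xs[t] == mean else float('inf')
    return out

# abs() of these matches the anomaly_score column in the docstring table:
# [None, None, None, 28.08..., -1.00..., -0.79...]
print(moving_zscore([12.2, 11.7, 12.5, 21.4, 10.8, 11.2], 3))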