Exemplo n.º 1
0
def calib_detect(df, calib_params):
    """
    calib_detect seeks to find calibration events based on 2 conditions: persistence of a defined length
    (which often occurs when sensors are out of the water) and timestamps that fall on certain days of the
    week (M-F) and within certain hours of the day.

    Points that are already True in the input 'anomaly' column are kept as candidates, so they are also
    reported if their timestamp falls inside the weekday/hour window.

    Arguments:
        df: data frame with a DatetimeIndex and columns:
            'observed' of observed data
            'anomaly' booleans where True=1 corresponds to anomalies
            'persist_grp' (optional) with indices of persistent groups (output of the persistence function).
            If the column is missing, the groups are derived from repeated values in 'observed'.
        calib_params: parameters defined in the parameters file
            persist_high: longest length of the persistent group
            persist_low: shortest length of the persistent group
            hour_low: earliest hour for calibrations to have occurred
            hour_high: latest hour for calibrations to have occurred
    Returns:
        calib: data frame with a boolean column 'anomaly' indicating whether the conditions were met for a
        possible calibration event.
        calib_dates: datetimes for which the conditions were met for a possible calibration event.
    """
    if 'persist_grp' in df:
        temp = df[['observed', 'anomaly', 'persist_grp']].copy(deep=True)
    else:
        temp = df[['observed', 'anomaly']].copy(deep=True)
        temp['persist_grp'] = (temp['observed'].diff(1) == 0)
        temp['persist_grp'] = anomaly_utilities.anomaly_events(
            temp['persist_grp'], 0, 1)

    n_groups = 0 if temp.empty else int(temp['persist_grp'].max())
    for i in range(1, n_groups + 1):
        # diff() only marks the second and later points of a repeated run, so the
        # point just before the group is pulled into it to cover the whole run.
        leading = temp.index[temp['persist_grp'].shift(-1) == i]
        if len(leading) > 0:
            # .loc writes into temp itself; chained indexing may write to a temporary.
            temp.loc[leading[0], 'persist_grp'] = i
        in_group = temp['persist_grp'] == i
        group_len = int(in_group.sum())
        if calib_params['persist_low'] <= group_len <= calib_params['persist_high']:
            temp.loc[in_group, 'anomaly'] = True

    # Monday=0 ... Friday=4, combined with the allowed hours of the day.
    dayofweek = temp.index.dayofweek
    hour = temp.index.hour
    in_window = ((dayofweek <= 4)
                 & (hour >= calib_params['hour_low'])
                 & (hour <= calib_params['hour_high']))

    calib = pd.DataFrame(index=temp.index)
    calib['anomaly'] = temp['anomaly'].astype(bool).to_numpy() & in_window
    calib_dates = calib[calib['anomaly']].index

    return calib, calib_dates
Exemplo n.º 2
0
def group_size(df):
    """
    group_size determines the size of the largest consecutive group of anomalous points.

    Groups are numbered by anomaly_utilities.anomaly_events (called with 0 and 1 as its second and third
    arguments); points whose group number is not positive are treated as belonging to no group.

    Arguments:
        df: data frame with column 'anomaly'.
    Returns:
        size: length of the largest consecutive group of anomalous points, or 0 when there is no group.
    """
    temp = df[['anomaly']].copy(deep=True)
    temp['value_grp'] = anomaly_utilities.anomaly_events(temp['anomaly'], 0, 1)
    groups = temp['value_grp']
    # One pass over the column instead of re-scanning it once per group number.
    counts = groups[groups > 0].value_counts()
    if counts.empty:
        return 0

    return int(counts.max())
Exemplo n.º 3
0
def calib_overlap(sensor_names, input_array, calib_params):
    """
    calib_overlap looks for calibration events by finding the points at which candidate events of several
    sensors coincide. For each sensor it calls calib_persist_detect and then numbers the resulting events with
    anomaly_utilities.anomaly_events (wf=1, sf=1); a point counts as an overlap only if every sensor has a
    non-zero event number there.
    Arguments:
        sensor_names: list of sensors to be considered for overlap. The index of the first sensor's data frame
        is used as the index of the combined output.
        input_array: collection of data frames keyed by sensor name, each with columns:
            'observed' of observed data
            'anomaly' booleans where True=1 corresponds to anomalies
            'persist_grp' with indices of persistent groups (output of the persist function)
        calib_params: parameters defined in the parameters file (passed through to calib_persist_detect)
            persist_high: longest length of the persistent group
            persist_low: shortest length of the persistent group
            hour_low: earliest hour for calibrations to have occurred
            hour_high: latest hour for calibrations to have occurred
    Returns:
        all_calib: dictionary of data frames (one for each sensor) of booleans indicating whether the
        conditions were met for a possible calibration event.
        all_calib_dates: dictionary of datetimes (one for each sensor) for which the conditions were met for a
        possible calibration event.
        df_all_calib: data frame with, for each sensor, a column of observations, a '<sensor>_calib' boolean
        column and a '<sensor>_event' column, plus a column 'all_calib' that indicates if the conditions were
        met for all sensors.
        calib_dates_overlap: datetimes for which the conditions were met for a possible calibration event for
        all sensors.
    """
    all_calib = {}
    all_calib_dates = {}
    first_sensor = sensor_names[0]
    df_all_calib = pd.DataFrame(index=input_array[first_sensor].index)
    df_all_calib['all_calib'] = True

    for snsr in sensor_names:
        sensor_df = input_array[snsr]
        calib_col = snsr + '_calib'
        event_col = snsr + '_event'

        detected = calib_persist_detect(sensor_df, calib_params)
        all_calib[snsr] = detected[0]
        all_calib_dates[snsr] = detected[1]

        df_all_calib[snsr] = sensor_df['observed']
        df_all_calib[calib_col] = all_calib[snsr]['anomaly']
        df_all_calib[event_col] = anomaly_utilities.anomaly_events(
            df_all_calib[calib_col], wf=1, sf=1)

        # Keep a point only while every sensor seen so far has an event there.
        still_overlapping = df_all_calib['all_calib'] & (df_all_calib[event_col] != 0)
        df_all_calib['all_calib'] = np.where(still_overlapping, True, False)

    overlap_mask = df_all_calib['all_calib']
    calib_dates_overlap = df_all_calib[overlap_mask].index

    return all_calib, all_calib_dates, df_all_calib, calib_dates_overlap
Exemplo n.º 4
0
def persistence(df, length, output_grp=False):
    """
    persistence adds an anomalous label in the data frame if data repeat for specified length.

    The input data frame is modified in place (its 'anomaly' column is overwritten and 'persist_grp' is added
    when requested) and is also returned.

    Arguments:
        df: data frame with a column 'raw' of raw data and a boolean column 'anomaly' (typically output of range_check)
        length: duration of persistent/repeated values to be flagged
        output_grp: boolean to indicate whether the persistent group numbers should be output as a column
        'persist_grp' in the original dataframe.
    Returns:
        df: dataframe with column 'anomaly' modified and, if output_grp is True, added column 'persist_grp'
        that indexes points as part of persistent groups
        persist_count: number of points that belong to any persistent group (regardless of the group length)
    """
    temp = df[['raw', 'anomaly']].copy(deep=True)
    temp['persist_grp'] = (temp.raw.diff(1) == 0)
    temp['persist_grp'] = anomaly_utilities.anomaly_events(temp['persist_grp'], 0, 1)

    groups = temp['persist_grp']
    # Size of every numbered group in one pass instead of one scan per group.
    sizes = groups[groups > 0].value_counts()
    long_groups = sizes.index[sizes >= length]
    # .loc writes into temp itself; chained indexing (temp['anomaly'][mask] = ...)
    # may write to a temporary and is a no-op under pandas Copy-on-Write.
    temp.loc[groups.isin(long_groups), 'anomaly'] = True

    persist_count = int((groups != 0).sum())
    df['anomaly'] = temp['anomaly']
    if output_grp:
        df['persist_grp'] = temp['persist_grp']

    return df, persist_count
Exemplo n.º 5
0
def LSTM_detect_multivar(sensor_array,
                         sensors,
                         params,
                         LSTM_params,
                         model_type,
                         name='',
                         rules=False,
                         plots=False,
                         summary=True,
                         compare=False,
                         model_output=True,
                         model_save=True):
    """
    LSTM_detect_multivar runs the multivariate LSTM anomaly detection workflow: optional rules based
    detection, model creation, dynamic threshold determination, anomaly detection, widening/numbering of
    events, and optional comparison against labels and plotting.

    Arguments:
        sensor_array: collection of data frames keyed by sensor name. Each frame is read for columns 'raw',
        'observed' and 'anomaly'; 'labeled_anomaly' is read when compare is True and 'labeled_event' when
        plots is True. When rules is True the entries are replaced by the output of the rules functions.
        sensors: list of sensor names to include in the model.
        params: per-sensor parameters keyed by sensor name. max_range, min_range and persist are read as
        attributes; window_sz, alpha, threshold_min and widen are read by key.
        NOTE(review): the mixed attribute/key access is kept from the original - confirm the parameter
        objects support both.
        LSTM_params: model parameters; 'time_steps' is read here, the rest is passed to the model builder.
        model_type: ModelType.VANILLA or ModelType.BIDIRECTIONAL.
        name: passed to the model builder.
        rules: if True, apply range_check, persistence, group_size and interpolate to each sensor first.
        plots: if True, show training loss, threshold and result plots.
        summary: passed to the model builder.
        compare: if True, compute and print metrics against the 'labeled_anomaly' column.
        model_output: passed to the model builder.
        model_save: passed to the model builder.
    Returns:
        ModelWorkflow object with attributes sensor_array, df_observed, df_raw, df_anomaly, model, threshold,
        detections, all_data and, when compare is True, metrics and e_metrics.
    Raises:
        ValueError: if model_type is neither ModelType.VANILLA nor ModelType.BIDIRECTIONAL.
    """
    print('\nProcessing LSTM multivariate ' + str(model_type) + ' detections.')
    # RULES BASED DETECTION #
    if rules:
        size = dict()
        for snsr in sensors:
            sensor_array[snsr], _ = rules_detect.range_check(
                sensor_array[snsr], params[snsr].max_range,
                params[snsr].min_range)
            sensor_array[snsr], _ = rules_detect.persistence(
                sensor_array[snsr], params[snsr].persist)
            size[snsr] = rules_detect.group_size(sensor_array[snsr])
            sensor_array[snsr] = rules_detect.interpolate(sensor_array[snsr])
            print(snsr + ' maximum detected group length = ' + str(size[snsr]))
        print('Rules based detection complete.\n')
    # Create new data frames with raw and observed (after applying rules) and preliminary anomaly detections for selected sensors
    df_raw = pd.DataFrame(index=sensor_array[sensors[0]].index)
    df_observed = pd.DataFrame(index=sensor_array[sensors[0]].index)
    df_anomaly = pd.DataFrame(index=sensor_array[sensors[0]].index)
    for snsr in sensors:
        df_raw[snsr + '_raw'] = sensor_array[snsr]['raw']
        df_observed[snsr + '_obs'] = sensor_array[snsr]['observed']
        df_anomaly[snsr + '_anom'] = sensor_array[snsr]['anomaly']
    print('Raw data shape: ' + str(df_raw.shape))
    print('Observed data shape: ' + str(df_observed.shape))
    print('Initial anomalies data shape: ' + str(df_anomaly.shape))

    # MODEL CREATION #
    if model_type == ModelType.VANILLA:
        model = modeling_utilities.LSTM_multivar(df_observed, df_anomaly,
                                                 df_raw, LSTM_params, summary,
                                                 name, model_output,
                                                 model_save)
    elif model_type == ModelType.BIDIRECTIONAL:
        model = modeling_utilities.LSTM_multivar_bidir(df_observed, df_anomaly,
                                                       df_raw, LSTM_params,
                                                       summary, name,
                                                       model_output,
                                                       model_save)
    else:
        # Without this, `model` would be unbound and fail later with a NameError.
        raise ValueError('Unsupported model_type: ' + str(model_type))

    print('multivariate ' + str(model_type) + ' LSTM model complete.\n')
    # Plot Metrics and Evaluate the Model
    if plots:
        plt.figure()
        plt.plot(model.history.history['loss'], label='Training Loss')
        plt.plot(model.history.history['val_loss'], label='Validation Loss')
        plt.legend()
        plt.show()

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    ts = LSTM_params['time_steps']
    # The vanilla model drops the first `ts` rows; the bidirectional model drops
    # `ts` rows at both ends.
    if model_type == ModelType.VANILLA:
        trim = slice(ts, None)
    else:
        trim = slice(ts, -ts)
    residuals = pd.DataFrame(model.test_residuals)
    residuals.columns = sensors
    predictions = pd.DataFrame(model.predictions)
    predictions.columns = sensors
    observed = df_observed[trim]
    residuals.index = observed.index
    predictions.index = observed.index

    threshold = dict()
    detections = dict()
    for snsr in sensors:
        threshold[snsr] = anomaly_utilities.set_dynamic_threshold(
            residuals[snsr], params[snsr]['window_sz'], params[snsr]['alpha'],
            params[snsr]['threshold_min'])
        threshold[snsr].index = residuals.index
        detections[snsr] = anomaly_utilities.detect_anomalies(
            observed[snsr + '_obs'],
            predictions[snsr],
            residuals[snsr],
            threshold[snsr],
            summary=True)
        if plots:
            plt.figure()
            # `sensors` is a list of names, so the name itself is passed
            # (indexing the list with the name would raise a TypeError).
            anomaly_utilities.plt_threshold(residuals[snsr], threshold[snsr],
                                            snsr)
            plt.show()
    print('Threshold determination complete.')

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    all_data = dict()
    for snsr in sensors:
        # Explicit copy: columns are added below and must not be written to a
        # slice of the input frame.
        all_data[snsr] = sensor_array[snsr].iloc[trim].copy()
        all_data[snsr]['detected_anomaly'] = detections[snsr]['anomaly']
        all_data[snsr]['all_anomalies'] = all_data[snsr].eval(
            'detected_anomaly or anomaly')
        all_data[snsr]['detected_event'] = anomaly_utilities.anomaly_events(
            all_data[snsr]['all_anomalies'], params[snsr]['widen'])

    # COMPARE AND DETERMINE METRICS #
    if compare:
        metrics = dict()
        e_metrics = dict()
        for snsr in sensors:
            all_data[snsr]['labeled_event'] = anomaly_utilities.anomaly_events(
                all_data[snsr]['labeled_anomaly'], params[snsr]['widen'])
            anomaly_utilities.compare_events(all_data[snsr],
                                             params[snsr]['widen'])
            metrics[snsr] = anomaly_utilities.metrics(all_data[snsr])
            e_metrics[snsr] = anomaly_utilities.event_metrics(all_data[snsr])
            # OUTPUT RESULTS #
            print('\nModel type: LSTM multivariate ' + str(model_type))
            print('Sensor: ' + snsr)
            anomaly_utilities.print_metrics(metrics[snsr])
            print('Event based calculations:')
            anomaly_utilities.print_metrics(e_metrics[snsr])
        print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        for snsr in sensors:
            plt.figure()
            # NOTE(review): labels are read from the input frame, so it must
            # already contain a 'labeled_event' column; the one computed above
            # under `compare` lives on all_data[snsr] only.
            anomaly_utilities.plt_results(
                raw=sensor_array[snsr]['raw'],
                predictions=detections[snsr]['prediction'],
                labels=sensor_array[snsr]['labeled_event'],
                detections=all_data[snsr]['detected_event'],
                sensor=snsr)
            plt.show()

    workflow = ModelWorkflow()
    workflow.sensor_array = sensor_array
    workflow.df_observed = df_observed
    workflow.df_raw = df_raw
    workflow.df_anomaly = df_anomaly
    workflow.model = model
    workflow.threshold = threshold
    workflow.detections = detections
    workflow.all_data = all_data
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
Exemplo n.º 6
0
def ARIMA_detect(df,
                 sensor,
                 params,
                 rules=False,
                 plots=False,
                 summary=True,
                 compare=False,
                 suppress_warnings=True):
    """
    ARIMA_detect runs the ARIMA anomaly detection workflow for a single sensor: optional rules based
    detection, model creation, dynamic threshold determination, anomaly detection, widening/numbering of
    events, and optional comparison against labels and plotting.

    The data frame receives the columns 'detected_anomaly', 'all_anomalies' and 'detected_event' (and
    'labeled_event' when compare is True).

    Arguments:
        df: data frame read for columns 'observed' and 'anomaly'; 'labeled_anomaly' is read when compare is
        True, and 'raw' and 'labeled_event' when plots is True.
        sensor: sensor name, used in printed messages and plots.
        params: parameters read by key: 'pdq', 'window_sz', 'alpha', 'threshold_min', 'widen', and, when
        rules is True, 'max_range', 'min_range', 'persist'.
        rules: if True, apply range_check, persistence, group_size and interpolate first.
        plots: if True, show threshold and result plots.
        summary: passed to the ARIMA model builder.
        compare: if True, compute and print metrics against the 'labeled_anomaly' column.
        suppress_warnings: passed to the ARIMA model builder.
    Returns:
        ModelWorkflow object with attributes df, model_fit, threshold, detections and, when compare is True,
        metrics and e_metrics.
    """
    print('\nProcessing ARIMA detections.')
    # RULES BASED DETECTION #
    if rules:
        # persistence returns (df, count), and range_check is unpacked the same
        # way elsewhere in the workflow; keep only the data frame. A bare data
        # frame return is still accepted.
        checked = rules_detect.range_check(df, params['max_range'],
                                           params['min_range'])
        df = checked[0] if isinstance(checked, tuple) else checked
        persisted = rules_detect.persistence(df, params['persist'])
        df = persisted[0] if isinstance(persisted, tuple) else persisted
        size = rules_detect.group_size(df)
        df = rules_detect.interpolate(df)
        print(sensor +
              ' rules based detection complete. Longest detected group = ' +
              str(size))

    # MODEL CREATION #
    [p, d, q] = params['pdq']
    model_fit, residuals, predictions = modeling_utilities.build_arima_model(
        df['observed'], p, d, q, summary, suppress_warnings)
    print(sensor + ' ARIMA model complete.')

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    threshold = anomaly_utilities.set_dynamic_threshold(
        residuals[0], params['window_sz'], params['alpha'],
        params['threshold_min'])
    threshold.index = residuals.index
    if plots:
        plt.figure()
        anomaly_utilities.plt_threshold(residuals, threshold, sensor)
        plt.show()
    print('Threshold determination complete.')
    detections = anomaly_utilities.detect_anomalies(df['observed'],
                                                    predictions,
                                                    residuals,
                                                    threshold,
                                                    summary=True)

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    df['detected_anomaly'] = detections['anomaly']
    df['all_anomalies'] = df.eval('detected_anomaly or anomaly')
    df['detected_event'] = anomaly_utilities.anomaly_events(
        df['all_anomalies'], params['widen'])

    if compare:
        df['labeled_event'] = anomaly_utilities.anomaly_events(
            df['labeled_anomaly'], params['widen'])
        # DETERMINE METRICS #
        anomaly_utilities.compare_events(df, params['widen'])
        metrics = anomaly_utilities.metrics(df)
        e_metrics = anomaly_utilities.event_metrics(df)
        # OUTPUT RESULTS #
        print('Model type: ARIMA')
        print('Sensor: ' + sensor)
        anomaly_utilities.print_metrics(metrics)
        print('Event based calculations:')
        anomaly_utilities.print_metrics(e_metrics)
        print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        plt.figure()
        # NOTE(review): 'labeled_event' is only created above when compare is
        # True; otherwise it must already be present in df.
        anomaly_utilities.plt_results(raw=df['raw'],
                                      predictions=detections['prediction'],
                                      labels=df['labeled_event'],
                                      detections=df['detected_event'],
                                      sensor=sensor)
        plt.show()

    workflow = ModelWorkflow()
    workflow.df = df
    workflow.model_fit = model_fit
    workflow.threshold = threshold
    workflow.detections = detections
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
Exemplo n.º 7
0
def LSTM_detect_univar(df,
                       sensor,
                       params,
                       LSTM_params,
                       model_type,
                       name='',
                       rules=False,
                       plots=False,
                       summary=True,
                       compare=False,
                       model_output=True,
                       model_save=True):
    """
    LSTM_detect_univar runs the univariate LSTM anomaly detection workflow for a single sensor: optional
    rules based detection, model creation, dynamic threshold determination, anomaly detection,
    widening/numbering of events, and optional comparison against labels and plotting.

    Arguments:
        df: data frame read for columns 'observed' and 'anomaly'; 'labeled_anomaly' is read when compare is
        True, and 'raw' and 'labeled_event' when plots is True.
        sensor: sensor name, used in printed messages and plots.
        params: parameters read by key: 'window_sz', 'alpha', 'threshold_min', 'widen', and, when rules is
        True, 'max_range', 'min_range', 'persist'.
        LSTM_params: model parameters; 'time_steps' is read here, the rest is passed to the model builder.
        model_type: ModelType.VANILLA or ModelType.BIDIRECTIONAL.
        name: passed to the model builder.
        rules: if True, apply range_check, persistence, group_size and interpolate first.
        plots: if True, show training loss, threshold and result plots.
        summary: passed to the model builder.
        compare: if True, compute and print metrics against the 'labeled_anomaly' column.
        model_output: passed to the model builder.
        model_save: passed to the model builder.
    Returns:
        ModelWorkflow object with attributes df, model, threshold, detections, df_anomalies and, when compare
        is True, metrics and e_metrics.
    Raises:
        ValueError: if model_type is neither ModelType.VANILLA nor ModelType.BIDIRECTIONAL.
    """
    print('\nProcessing LSTM univariate ' + str(model_type) + ' detections.')
    # RULES BASED DETECTION #
    if rules:
        # persistence returns (df, count), and range_check is unpacked the same
        # way elsewhere in the workflow; keep only the data frame. A bare data
        # frame return is still accepted.
        checked = rules_detect.range_check(df, params['max_range'],
                                           params['min_range'])
        df = checked[0] if isinstance(checked, tuple) else checked
        persisted = rules_detect.persistence(df, params['persist'])
        df = persisted[0] if isinstance(persisted, tuple) else persisted
        size = rules_detect.group_size(df)
        df = rules_detect.interpolate(df)
        print(
            sensor +
            ' rules based detection complete. Maximum detected group length = '
            + str(size))

    # MODEL CREATION #
    if model_type == ModelType.VANILLA:
        model = modeling_utilities.LSTM_univar(df, LSTM_params, summary, name,
                                               model_output, model_save)
    elif model_type == ModelType.BIDIRECTIONAL:
        model = modeling_utilities.LSTM_univar_bidir(df, LSTM_params, summary,
                                                     name, model_output,
                                                     model_save)
    else:
        # Without this, `model` would be unbound and fail later with a NameError.
        raise ValueError('Unsupported model_type: ' + str(model_type))
    print(sensor + ' ' + str(model_type) + ' LSTM model complete.')
    if plots:
        plt.figure()
        plt.plot(model.history.history['loss'], label='Training Loss')
        plt.plot(model.history.history['val_loss'], label='Validation Loss')
        plt.legend()
        plt.show()

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    ts = LSTM_params['time_steps']
    # The vanilla model drops the first `ts` rows; the bidirectional model drops
    # `ts` rows at both ends.
    if model_type == ModelType.VANILLA:
        trim = slice(ts, None)
    else:
        trim = slice(ts, -ts)
    threshold = anomaly_utilities.set_dynamic_threshold(
        model.test_residuals[0], params['window_sz'], params['alpha'],
        params['threshold_min'])
    threshold.index = df[trim].index
    residuals = pd.DataFrame(model.test_residuals)
    residuals.index = threshold.index
    if plots:
        plt.figure()
        anomaly_utilities.plt_threshold(residuals, threshold, sensor)
        plt.show()
    observed = df[['observed']][trim]
    print('Threshold determination complete.')
    detections = anomaly_utilities.detect_anomalies(observed,
                                                    model.predictions,
                                                    model.test_residuals,
                                                    threshold,
                                                    summary=True)

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    # Explicit copy: columns are added below and must not be written to a slice
    # of the input frame.
    df_anomalies = df.iloc[trim].copy()
    df_anomalies['detected_anomaly'] = detections['anomaly']
    df_anomalies['all_anomalies'] = df_anomalies.eval(
        'detected_anomaly or anomaly')
    df_anomalies['detected_event'] = anomaly_utilities.anomaly_events(
        df_anomalies['all_anomalies'], params['widen'])

    if compare:
        df_anomalies['labeled_event'] = anomaly_utilities.anomaly_events(
            df_anomalies['labeled_anomaly'], params['widen'])
        # DETERMINE METRICS #
        anomaly_utilities.compare_events(df_anomalies, params['widen'])
        metrics = anomaly_utilities.metrics(df_anomalies)
        e_metrics = anomaly_utilities.event_metrics(df_anomalies)
        # OUTPUT RESULTS #
        print('Model type: LSTM univariate ' + str(model_type))
        print('Sensor: ' + sensor)
        anomaly_utilities.print_metrics(metrics)
        print('Event based calculations:')
        anomaly_utilities.print_metrics(e_metrics)
        print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        plt.figure()
        # NOTE(review): labels are read from the input frame, so it must already
        # contain a 'labeled_event' column; the one computed above under
        # `compare` lives on df_anomalies only.
        anomaly_utilities.plt_results(
            raw=df['raw'],
            predictions=detections['prediction'],
            labels=df['labeled_event'],
            detections=df_anomalies['detected_event'],
            sensor=sensor)
        plt.show()

    workflow = ModelWorkflow()
    workflow.df = df
    workflow.model = model
    workflow.threshold = threshold
    workflow.detections = detections
    workflow.df_anomalies = df_anomalies
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
Exemplo n.º 8
0
        minimum=site_params[site][snsr]['min_range'])
    sensor_array[snsr], persist_count[snsr] = rules_detect.persistence(
        df=sensor_array[snsr],
        length=site_params[site][snsr]['persist'],
        output_grp=True)
    sensor_array[snsr] = rules_detect.add_labels(df=sensor_array[snsr],
                                                 value=-9999)
    sensor_array[snsr] = rules_detect.interpolate(df=sensor_array[snsr])
    # s = rules_detect.group_size(df=sensor_array[snsr])
    # size.append(s)
    # print(str(snsr) + ' longest detected group = ' + str(size))

# Metrics for rules based detection: for every sensor, number the labeled and
# the detected events, compare them, and compute and print the metrics.
for snsr in sensor_array:
    # Plain reference, not a copy: the columns added below are written onto
    # sensor_array[snsr] itself.
    df_rules_metrics = sensor_array[snsr]
    df_rules_metrics['labeled_event'] = anomaly_utilities.anomaly_events(
        anomaly=df_rules_metrics['labeled_anomaly'], wf=0)
    df_rules_metrics['detected_event'] = anomaly_utilities.anomaly_events(
        anomaly=df_rules_metrics['anomaly'], wf=0)
    anomaly_utilities.compare_events(df=df_rules_metrics, wf=0)
    rules_metrics[snsr] = anomaly_utilities.metrics(df=df_rules_metrics)
    print('\nRules based metrics')
    print('Sensor: ' + snsr)
    anomaly_utilities.print_metrics(df=rules_metrics[snsr])
    # Drops only the local name; the object stays in sensor_array.
    del (df_rules_metrics)

print('Rules based detection complete.\n')

#### Detect Calibration Events
#########################################

# Sensors at positions 1 through 3 of `sensors` are used for calibration detection.
calib_sensors = sensors[1:4]