예제 #1
0
import numpy as np
from scipy.spatial.distance import pdist

from sklearn.cluster import DBSCAN, KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.preprocessing import normalize

from utils.io import read_enbw_dataset

TOTAL_DAYS_YEAR = 365

data = read_enbw_dataset('data/hackathon_EnBW_smart_meter_data_30_hh.csv')

groups = data.groupby('id')

# Take rows 3600..7999 of household 9 and drop incomplete rows.
# ``dropna()`` is used without ``inplace=True`` on purpose: mutating a slice
# of another frame in place can trigger pandas' SettingWithCopyWarning.
group = groups.get_group(9)
timeslice = group.iloc[3600:8000, :].dropna()

# Raw readings; plotted before the timestamp index is set, so the x-axis is
# the original row index.
timeslice.value.plot()

# Index by local timestamp, then downsample to hourly means.
timeslice = timeslice.set_index(timeslice.timestampLocal)
timeslice = timeslice.resample('1h').mean()
timeslice_groups = timeslice.groupby(lambda x: x.dayofyear)

# Smoothed view: rolling mean over 100 hourly samples in a separate figure.
plt.figure()
timeslice['value'].rolling(window=100).mean().plot()
예제 #2
0

def str2float(x):
    """Parse a number written with a decimal comma (e.g. ``'3,14'``).

    Every comma in ``x`` is replaced by a dot before the string is handed to
    ``float``.
    """
    dotted = x.replace(',', '.')
    return float(dotted)


def group_by_hour_of_day(d):
    """Return the ``hour`` attribute of ``d`` (usable as a group-by key)."""
    hour_of_day = d.hour
    return hour_of_day


def transform(x):
    """Standardize ``x``: subtract its mean, then divide by its std.

    ``x`` must provide ``mean()`` and ``std()`` and support ``-`` and ``/``.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread


# Load the smart-meter readings and pick household 2.
data_filepath = 'data/hackathon_EnBW_smart_meter_data_30_hh.csv'
data = read_enbw_dataset(data_filepath)

groups = data.groupby('id')
group = groups.get_group(2)
# Index the readings by their local timestamp (the column itself is kept).
group = group.set_index(group.timestampLocal)

# Bucket the readings by day of year.
grouped_by_day = group.groupby(lambda stamp: stamp.dayofyear)

# Days of the year to inspect.
days = [1, 3, 31, 40, 100, 150, 200, 230, 300, 360]
k = 0
year = 2104
# One figure per selected day, showing that day's 'value' curve.
for day in days:
    day_group = grouped_by_day.get_group(day)
    plt.figure()
    day_values = day_group['value']
    day_values.plot()
    print(day)
예제 #3
0
def _collect_anomaly_ranges(scores, timestamps, threshold=3, min_length=5):
    """Return ``[first, last]`` timestamp pairs for long runs of high scores.

    A run is a maximal stretch of consecutive entries of ``scores`` that are
    ``>= threshold``; ``timestamps[k]`` is the timestamp of ``scores[k]``.
    Only runs with at least ``min_length`` entries are reported, including a
    run that is still open when ``scores`` ends.
    """
    ranges = []
    run = []
    for k, score in enumerate(scores):
        if score >= threshold:
            run.append(timestamps[k])
            continue
        if len(run) >= min_length:
            ranges.append([run[0], run[-1]])
        run = []
    # Flush a run that reaches the end of the data; without this the last
    # anomaly would be silently dropped.
    if len(run) >= min_length:
        ranges.append([run[0], run[-1]])
    return ranges


def detect_anomalies(filepath, output_file, visualize=True):
    """Label prolonged low-activity periods per household and write a CSV.

    The dataset is read with ``read_enbw_dataset`` and processed one ``id``
    group at a time. For each calendar day the hourly means are scored with
    two boolean criteria (weights 1 and 2, see ``m`` below). Runs of at least
    five consecutive entries of the concatenated hourly score vector with
    score 3 (both criteria true) are marked in the output with
    ``isAnomaly = 1`` and ``anomalyType = 'lowActivity'``.

    Args:
        filepath: Path passed to ``read_enbw_dataset``.
        output_file: Destination passed to ``DataFrame.to_csv``.
        visualize: If true, ``visualize_household`` is called for every
            household with the detected ranges formatted as
            ``'%Y-%m-%d %H:%M:%S'`` strings.

    Returns:
        None. The labelled rows of all households are concatenated and
        written to ``output_file``.
    """
    data = read_enbw_dataset(filepath)
    new_data = []
    groups = data.groupby('id')

    # Row weights for the stacked criteria: ``is_unexpected`` counts 1 and
    # ``is_inactivity`` counts 2, so a column sum of 3 means both hold.
    m = np.reshape([1, 2], (2, 1))

    alpha = 0.005  # scales overall_std in the inactivity threshold
    gamma = 4  # relative-deviation threshold for nocturnal activity
    beta = 1 / 2  # share of the inactivity std added to the weekly mean

    # Column mask of the hours treated as night. ``bool`` replaces the
    # ``np.bool`` alias, which NumPy removed in 1.24.
    night = [0, 1, 2, 3, 4, 23]
    night_hours = np.zeros((24, 1), dtype=bool)
    night_hours[night] = True

    for _, group in groups:
        # Non-in-place calls avoid mutating the frame handed out by groupby.
        # Passing the Series (not the column label) keeps 'timestampLocal'
        # as a regular column as well.
        group = group.set_index(group.timestampLocal).dropna()

        first = group.loc[group.index[0], 'timestampLocal']
        last = group.loc[group.index[-1], 'timestampLocal']

        # Average day: per-hour statistics keyed by the hour as a Timedelta.
        grouped_by_hour = group.groupby(
            lambda x: pd.to_timedelta(x.hour, unit='h'))
        average_day = grouped_by_hour.value.agg(['mean', 'min', 'max', 'std'])

        grouped_by_day = group.groupby(
            lambda x: datetime.datetime(x.year, x.month, x.day))

        # Reference "inactive" level: mean statistics of the four hours with
        # the lowest average consumption.
        assumed_inactivity = average_day.sort_values(
            by='mean').iloc[0:4, :].mean()
        assumed_inactivity_std = assumed_inactivity['std']
        overall_std = group.value.std()

        inactivity_anomaly_vectors = []
        is_nocturnal_activity_vectors = []

        seasonal_mean = group.resample('1W').mean()
        dates = []
        for date, day_group in grouped_by_day:
            try:
                day_statistics = day_group.resample('1h').value.agg(
                    ['mean', 'min', 'max', 'std'])
                day_statistics['offset'] = pd.to_timedelta(
                    day_statistics.index.hour, unit='h')

                # Weekly mean closest in time to this day, raised by a
                # fraction of the inactivity std.
                seasonal_mean['sort_val'] = abs(
                    (seasonal_mean.index - date).days)
                overall_mean = seasonal_mean.sort_values('sort_val').iloc[0][
                    'value'] + beta * assumed_inactivity_std

                # Columns of the day get suffix ``_x``, those of the average
                # day ``_y``; the join key ends up in ``key_0``.
                merged_statistics = day_statistics.merge(
                    average_day,
                    how='left',
                    left_on=day_statistics.offset,
                    right_on=average_day.index)

                is_unexpected = average_day['mean'] > overall_mean

                deviation_from_inactivity = ((merged_statistics['mean_x'] -
                                              assumed_inactivity['mean']) /
                                             assumed_inactivity['std']).values

                deviation_from_expectation = (
                    (merged_statistics['mean_x'] - merged_statistics['mean_y'])
                    / merged_statistics['mean_y']).values

                is_inactivity = deviation_from_inactivity < alpha * overall_std

                # 1. anomaly: weighted sum of both criteria per hour.
                # ``np.vstack`` raises ValueError when the two vectors differ
                # in length, which skips the day via the handler below.
                inactivity_anomaly_vector = np.sum(np.vstack(
                    (is_unexpected, is_inactivity)).astype(np.int32) * m,
                                                   axis=0)
                inactivity_anomaly_vectors.append(inactivity_anomaly_vector)

                # 2. anomaly: nocturnal activity
                # NOTE(review): ``night_hours`` has shape (24, 1), so this
                # broadcasts to a 2-D array; the result is collected but not
                # consumed anywhere in this function. ``int`` replaces the
                # ``np.int`` alias, which NumPy removed in 1.24.
                nocturnal_activity_vector = np.logical_and(
                    deviation_from_expectation > gamma,
                    night_hours).astype(int)
                is_nocturnal_activity_vectors.append(nocturnal_activity_vector)

                date_vector = (merged_statistics.key_0 + date).values
                dates.extend(date_vector)
            except ValueError:
                # Best effort: days that cannot be scored are skipped.
                continue

        if inactivity_anomaly_vectors:
            Y = np.hstack(inactivity_anomaly_vectors)
        else:
            # Every day was skipped; np.hstack raises on an empty list.
            Y = np.array([], dtype=np.int32)

        unexpected_anomalies = _collect_anomaly_ranges(
            Y, dates, threshold=3, min_length=5)

        if visualize:
            unexpected_anomalies_str = [[
                pd.to_datetime(str(x)).strftime('%Y-%m-%d %H:%M:%S') for x in r
            ] for r in unexpected_anomalies]
            visualize_household(group, first, last, unexpected_anomalies_str)

        # Positions of the two label columns appended right below.
        num_columns = len(group.columns)

        group['isAnomaly'] = 0
        group['anomalyType'] = 'None'
        for start, end in unexpected_anomalies:
            b = np.logical_and(start <= group.index, group.index <= end)
            idx, = np.where(b)

            group.iloc[idx.tolist(), num_columns] = 1
            group.iloc[idx.tolist(), num_columns + 1] = 'lowActivity'

        new_data.append(group)

    new_df = pd.concat(new_data)
    new_df.to_csv(output_file)
예제 #4
0
import datetime
from datetime import timedelta

import pandas as pd
import numpy as np

from utils.datetime_utils import day_of_year_to_date
from utils.io import read_enbw_dataset
import seaborn as sns
import matplotlib.pyplot as plt

from visualize import visualize_household

data = read_enbw_dataset('data/data-sun-holidays.csv')

groups = data.groupby('id')

# Select one household, index its readings by local timestamp and drop
# incomplete rows. Non-in-place calls are used because ``get_group`` returns
# a selection of ``data``; in-place mutation of such a selection can trigger
# pandas' SettingWithCopyWarning.
household_id = 3
group = groups.get_group(household_id)
group = group.set_index(group.timestampLocal).dropna()

# Average day: statistics of 'value' per hour of day, keyed by the hour as a
# Timedelta. Lower-case 'h' is used because the 'H' unit is deprecated since
# pandas 2.2.
grouped_by_hour = group.groupby(lambda x: pd.to_timedelta(x.hour, unit='h'))
average_day = grouped_by_hour.value.agg(['mean', 'min', 'max', 'std'])

# Per-day statistics of 'value', keyed by day of year.
grouped_by_day = group.groupby(lambda x: x.dayofyear)
means_of_days = grouped_by_day.value.agg(['min', 'max', 'mean', 'std'])

means_of_days.index.name = 'day_of_year'