def load_ts_data(path, timestamp_col=None, date_cols=None, epoch_col=None,
                 set_index=False, n_rows=None):
    """path should be a full path to a csv file with exactly one of columns timestamp or epoch"""
    assert isinstance(path, str)
    assert isinstance(timestamp_col, (str, type(None)))
    assert isinstance(date_cols, (list, type(None)))  # columns to be parsed as dates
    if date_cols is None:
        date_cols = False  # pandas default for parse_dates
    if timestamp_col is not None:
        assert date_cols and timestamp_col in date_cols, 'timestamp_col should be one of date_cols.'
    assert isinstance(epoch_col, (str, type(None)))
    assert isinstance(set_index, bool)
    assert isinstance(n_rows, (int, type(None)))
    assert ((timestamp_col is None) and (epoch_col is not None)) or \
           ((timestamp_col is not None) and (epoch_col is None)), \
        "ts should have either 'timestamp' or 'epoch' columns (but not both)."
    if timestamp_col is not None:
        ts = pd.read_csv(path, parse_dates=date_cols, infer_datetime_format=True)
    else:
        ts = pd.read_csv(path)
    assert isinstance(ts, pd.DataFrame)
    log.debug('ts shape: {}.'.format(ts.shape))
    log.debug('ts head: {}'.format(ts.head()))
    if timestamp_col is None:
        # ts has an epoch column. rename it and add a timestamp column
        ts.rename(columns={epoch_col: 'epoch'}, inplace=True)
        # convert to timestamp
        ts['timestamp'] = pd.to_datetime(ts['epoch'], unit='s')
    if epoch_col is None:
        # ts has a timestamp column. rename it and add an epoch column
        ts.rename(columns={timestamp_col: 'timestamp'}, inplace=True)
        # convert to epoch (integer floor division keeps full int64 precision)
        ts['epoch'] = ts['timestamp'].astype('int64') // 10 ** 9
    if set_index:
        ts = ts.set_index('timestamp')
    if n_rows is not None:
        ts = ts.iloc[0:n_rows]
    return ts
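# Hedged usage sketch (not part of the original source): load a CSV whose timestamps live in a
# column named 'time'. The file path and the column name are hypothetical examples.
ts = load_ts_data(path='data/example_ts.csv',
                  timestamp_col='time',
                  date_cols=['time'],
                  set_index=False,
                  n_rows=1000)
# After loading, ts is expected to contain both a 'timestamp' and an 'epoch' column.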
def run(self):
    log.debug('Running feature engineering ..')
    fe_start_time = time.time()
    # load data
    dir = u'D:\\FAMILY\\Yuval\\Work\\Seebo\\'
    file = u'Yuval_TS_Table.csv'
    path = dir + file
    # TODO: see why parsing dates is so slow
    # date_cols = ['end_time_stamp', 'start_time', 'end_time']
    # data = pd.read_csv(path, parse_dates=date_cols, infer_datetime_format=True)
    data = pd.read_csv(path)
    sensors_per_batch = data.groupby('batch_id')['metric_id'].apply(lambda ts: ts.unique())
    if sensors_per_batch.shape[0] > 1:
        # TODO: validate every batch has the same sensors data. Make more elegant
        first_batch_sensors = sensors_per_batch.iloc[0]
        for i in range(1, sensors_per_batch.shape[0]):
            assert np.array_equal(first_batch_sensors, sensors_per_batch.iloc[i]), \
                'All batches should have the same sensors'
    # TODO: sort by batch_id, metric_id, value
    data = data.sort_values(by=['batch_id', 'metric_id', 'sensor_value'], inplace=False)
    # impute missing values
    # TODO: replace this naive imputation method
    data['sensor_value'] = data['sensor_value'].fillna(0.0, inplace=False)
    # instantiate composite feature extractor
    # TODO: map self._feature_extractor_names to self._feature_extractor_objects
    gfe = GlobalFeatureExtractor(self._time_series_features_enricher)
    tfe = TemporalFeatureExtractor()
    cfe = CompositeFeatureExtractor([gfe, tfe])
    design_matrix = cfe.extract(data)
    feature_engineering_main_output = design_matrix
    fe_end_time = time.time()
    fe_duration = round((fe_end_time - fe_start_time) / 60, 2)
    log.debug('Done running feature engineering [Total time: {} mins.].'.format(fe_duration))
    return feature_engineering_main_output
def extract(self, data):
    assert isinstance(data, pd.DataFrame)
    # assert that data have no missing values
    assert not pd.isnull(data).values.any(), 'data should not contain missing values.'
    log.debug('Running Global feature extractor ..')
    gfe_start_time = time.time()
    # setting time series features to extract or use default
    # fc_parameters = MinimalFCParameters()
    # fc_parameters = EfficientFCParameters()
    # fc_parameters = ComprehensiveFCParameters()
    # feature extraction
    design_matrix = extract_features(data,
                                     default_fc_parameters=self._fc_parameters,
                                     column_id='batch_id',
                                     column_sort='end_time_stamp',
                                     column_kind='metric_id',
                                     column_value='sensor_value',
                                     n_jobs=self._num_of_cores_to_use)
    # impute: use a built-in tsfresh method that replaces NaN with the median and -inf
    # [+inf] with the min [max] in a column-wise fashion (and in place).
    # If a column does not contain finite values at all, it is filled with zeros.
    # Also, all columns are guaranteed to be of type np.float64
    # (this can also be done by passing impute_function=impute to extract_features()).
    impute(design_matrix)
    # TODO: assert that none of the columns was filled with zeros
    # TODO: think about feature selection as well (extract_relevant_features), see:
    # https://github.com/blue-yonder/tsfresh/blob/master/notebooks/robot_failure_example.ipynb
    # note though that this may be problematic for real-time ts anomaly detection
    gfe_end_time = time.time()
    gfe_duration = round((gfe_end_time - gfe_start_time) / 60, 2)
    log.debug('Done running Global feature extractor [Total time: {} mins.].'.format(gfe_duration))
    return design_matrix
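# Hedged standalone sketch of the tsfresh call used above, assuming the same long-format column
# names ('batch_id', 'metric_id', 'end_time_stamp', 'sensor_value'); the toy data is illustrative only.
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute

toy = pd.DataFrame({
    'batch_id': ['B1'] * 4 + ['B2'] * 4,
    'metric_id': ['S1'] * 8,
    'end_time_stamp': list(range(4)) * 2,
    'sensor_value': [0.1, 0.3, 0.2, 0.4, 1.1, 1.3, 1.2, 1.4],
})
toy_matrix = extract_features(toy,
                              default_fc_parameters=MinimalFCParameters(),
                              column_id='batch_id',
                              column_sort='end_time_stamp',
                              column_kind='metric_id',
                              column_value='sensor_value')
impute(toy_matrix)  # replaces NaN/inf column-wise, as described in the comments above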
def extract(self, data):
    assert isinstance(data, pd.DataFrame)
    # assert that data have no missing values
    assert not pd.isnull(data).values.any(), 'data should not contain missing values.'
    log.debug('Running Composite feature extractor ..')
    cfe_start_time = time.time()
    matrices_lst = []
    for fe in self._feature_extractors:
        matrices_lst.append(fe.extract(data))
    design_matrix = pd.concat(matrices_lst, axis=1)
    cfe_end_time = time.time()
    cfe_duration = round((cfe_end_time - cfe_start_time) / 60, 2)
    log.debug('Done running Composite feature extractor [Total time: {} mins.].'.format(cfe_duration))
    return design_matrix
def extract(self, data):
    assert isinstance(data, pd.DataFrame)
    # assert that data have no missing values
    assert not pd.isnull(data).values.any(), 'data should not contain missing values.'
    log.debug('Running Temporal feature extractor ..')
    tfe_start_time = time.time()
    design_matrix = data.groupby(['batch_id', 'metric_id'])['end_time_stamp'].aggregate(self._timespan)
    tfe_end_time = time.time()
    tfe_duration = round((tfe_end_time - tfe_start_time) / 60, 2)
    log.debug('Done running Temporal feature extractor [Total time: {} mins.].'.format(tfe_duration))
    return design_matrix
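# The aggregation callable self._timespan is not shown in this excerpt. A minimal sketch of what
# such a callable could look like, assuming it should return the duration (in minutes) spanned by
# the timestamps of each (batch_id, metric_id) group:
def _timespan_sketch(timestamps):
    # timestamps is a pandas Series of datetimes for one group
    return (timestamps.max() - timestamps.min()).total_seconds() / 60.0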
def _validate_and_sort_data_prior_to_charting(data, anomalous_batch_id, sensor_id):
    assert not pd.isnull(data).any().any(), 'Data have missing values. Please check.'
    expected_columns = ['batch_id', 'sensor_id', 'timestamp', 'value', 'batch_label']
    assert set(data.columns) == set(expected_columns)
    s = 'batch {} has no records for sensor {}'.format(anomalous_batch_id, sensor_id)
    assert sensor_id in data.loc[data['batch_id'] == anomalous_batch_id, 'sensor_id'].unique(), s
    assert set(data['batch_label'].unique()) == {0, 1}
    assert (data.loc[data['batch_id'] == anomalous_batch_id, 'batch_label'] == 1).all(), \
        'batch_id should be an abnormal batch.'
    normal_batches = data.loc[data['batch_label'] == 0].copy()
    number_of_normal_batches = len(normal_batches['batch_id'].unique())
    assert not normal_batches.empty, 'There are no normal batches.'
    s = 'At least one normal batch has no records for sensor: {}.'.format(sensor_id)
    assert normal_batches.groupby('batch_id')['sensor_id'].aggregate(
        lambda ts: sensor_id in ts.unique()).sum() == number_of_normal_batches, s
    # sort data by (batch_id, sensor_id, timestamp)
    data = data.sort_values(by=['batch_id', 'sensor_id', 'timestamp'], inplace=False)
    log.debug('Done validating and sorting data by (batch_id, sensor_id, timestamp) prior to charting.')
    return data
def generate_fake_data(n_batches=150, n_sensors=300, batch_anomalous_probability=0.3,
                       mu_0=0.0, mu_1=1.5, sd_0=1.0, sd_1=1.0):
    log.debug('Generating fake data: {} batches, {} sensors, batch anomalous probability: {}'.format(
        n_batches, n_sensors, batch_anomalous_probability))
    batch_ids = ['B-' + ''.join(random.choice('0123456789ABCDEF') for _ in range(8))
                 for _ in range(n_batches)]
    sensor_ids = ['S-' + ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
                  for _ in range(n_sensors)]
    data = []
    for batch_id in batch_ids:
        # create target label
        batch_label = np.random.binomial(1, batch_anomalous_probability, 1)[0]
        # create data
        for sensor_id in sensor_ids:
            hour = random.randint(10, 12)
            minute = random.choice(np.arange(0, 60, 5))
            min_timestamp = pd.Timestamp(2018, 11, 1, hour, minute)
            sensor_duration_in_minutes = random.choice(np.arange(300, 420, 5))  # 5 to 7 hours
            max_timestamp = min_timestamp + datetime.timedelta(minutes=float(sensor_duration_in_minutes))
            timestamps = pd.date_range(min_timestamp, max_timestamp, freq='5min')
            if batch_label == 0:
                values = np.random.normal(mu_0, sd_0, len(timestamps))
            else:
                values = np.random.normal(mu_1, sd_1, len(timestamps))
            for timestamp, value in zip(timestamps, values):
                data.append([batch_id, sensor_id, timestamp, value, batch_label])
    data = pd.DataFrame(data, columns=['batch_id', 'sensor_id', 'timestamp', 'value', 'batch_label'])
    log.debug('Done generating fake data: {} batches, {} sensors.'.format(n_batches, n_sensors))
    return data
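# Hedged usage sketch (not part of the original source): generate a small fake dataset and inspect
# the label balance; the parameter values are illustrative only.
fake = generate_fake_data(n_batches=10, n_sensors=3, batch_anomalous_probability=0.3)
print(fake.shape)
print(fake.groupby('batch_id')['batch_label'].first().value_counts())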
def create_anomalous_charts(data, anomalous_batch_id, sensor_id, dir=None, show=True, plotly=False):
    log.debug('Creating prospect/retrospect charts (Forward/Backward View) for batch {} and sensor {}.'
              .format(anomalous_batch_id, sensor_id))
    assert isinstance(data, pd.DataFrame)
    assert isinstance(anomalous_batch_id, str)
    assert isinstance(sensor_id, str)
    assert isinstance(dir, (str, type(None)))
    assert isinstance(show, bool)
    data = _validate_and_sort_data_prior_to_charting(data, anomalous_batch_id, sensor_id)
    data_for_chart = _prepare_data_for_chart(data, anomalous_batch_id, sensor_id)
    batch_values = data_for_chart.get('batch_values')
    # forward view
    batch_duration_in_minutes_forward_view = data_for_chart.get('batch_duration_in_minutes_forward_view')
    normal_batches_duration_in_minutes_forward_view = data_for_chart.get('normal_batches_duration_in_minutes_forward_view')
    normal_batches_averages_forward_view = data_for_chart.get('normal_batches_averages_forward_view')
    normal_batches_lower_values_forward_view = data_for_chart.get('normal_batches_lower_values_forward_view')
    normal_batches_upper_values_forward_view = data_for_chart.get('normal_batches_upper_values_forward_view')
    # backward view
    batch_duration_in_minutes_backward_view = data_for_chart.get('batch_duration_in_minutes_backward_view')
    normal_batches_duration_in_minutes_backward_view = data_for_chart.get('normal_batches_duration_in_minutes_backward_view')
    normal_batches_averages_backward_view = data_for_chart.get('normal_batches_averages_backward_view')
    normal_batches_lower_values_backward_view = data_for_chart.get('normal_batches_lower_values_backward_view')
    normal_batches_upper_values_backward_view = data_for_chart.get('normal_batches_upper_values_backward_view')
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(batch_duration_in_minutes_forward_view, batch_values,
               marker='', color='red', label='Batch id: {}'.format(anomalous_batch_id))
    ax[0].plot(normal_batches_duration_in_minutes_forward_view, normal_batches_averages_forward_view,
               marker='', color='green', linewidth=3, label='Normal Batches (avg.)')
    ax[0].fill_between(normal_batches_duration_in_minutes_forward_view,
                       normal_batches_lower_values_forward_view,
                       normal_batches_upper_values_forward_view,
                       color='lightgreen', alpha=0.2)
    ax[0].set_title('Prospect (Forward) View', size=12)
    ax[0].set_xlabel('Minutes (since start)')
    ax[0].legend()
    ax[1].plot(batch_duration_in_minutes_backward_view, batch_values,
               marker='', color='red', label='Batch id: {}'.format(anomalous_batch_id))
    ax[1].plot(normal_batches_duration_in_minutes_backward_view, normal_batches_averages_backward_view,
               marker='', color='green', linewidth=3, label='Normal Batches (avg.)')
    ax[1].fill_between(normal_batches_duration_in_minutes_backward_view,
                       normal_batches_lower_values_backward_view,
                       normal_batches_upper_values_backward_view,
                       color='lightgreen', alpha=0.2)
    ax[1].set_title('Retrospect (Backward) View', size=12)
    ax[1].set_xlabel('Minutes (prior to end)')
    ax[1].legend()
    fig.suptitle('Anomaly Charts for batch id: {} and sensor id: {}'.format(anomalous_batch_id, sensor_id), size=15)
    if show:
        fig.show()
    if dir is not None:
        file_name = 'anomaly_chart_batch_id_' + anomalous_batch_id + '_sensor_id_' + sensor_id + '.pdf'
        full_path = dir + file_name
        fig.set_size_inches(10, 10)
        fig.savefig(full_path, dpi=100)
    if plotly:
        abnormal_batch = go.Scatter(x=batch_duration_in_minutes_forward_view, y=batch_values,
                                    name='Abnormal Batch', mode='lines+markers', line=dict(color='red'))
        normal_batches_average = go.Scatter(x=normal_batches_duration_in_minutes_forward_view,
                                            y=normal_batches_averages_forward_view,
                                            name='Normal Batches', line=dict(color='green', width=4))
        normal_batches_lower = go.Scatter(x=normal_batches_duration_in_minutes_forward_view,
                                          y=normal_batches_lower_values_forward_view,
                                          name='lower', hoverinfo='skip', fill=None, mode='lines',
                                          line=dict(color='lightgreen'), showlegend=False)
        normal_batches_upper = go.Scatter(x=normal_batches_duration_in_minutes_forward_view,
                                          y=normal_batches_upper_values_forward_view,
                                          name='upper', hoverinfo='skip', fill='tonexty',
                                          # fillcolor='lightgreen',
                                          mode='lines', line=dict(color='lightgreen'), showlegend=False)
        data = [normal_batches_lower, normal_batches_upper, abnormal_batch, normal_batches_average]
        layout = dict(title='Prospect')
        fig = dict(data=data, layout=layout)
        plot(fig)
    log.debug('Done creating prospect/retrospect charts (Forward/Backward View) for batch {} and sensor {}.'
              .format(anomalous_batch_id, sensor_id))
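# Hedged usage sketch (not part of the original source) tying the fake-data generator to the charting
# function. It assumes the helper _prepare_data_for_chart (not shown in this excerpt) is available;
# the chosen batch/sensor and the flag values are illustrative only.
fake = generate_fake_data(n_batches=20, n_sensors=5)
anomalous_ids = fake.loc[fake['batch_label'] == 1, 'batch_id'].unique()
if len(anomalous_ids) > 0:
    create_anomalous_charts(fake,
                            anomalous_batch_id=anomalous_ids[0],
                            sensor_id=fake['sensor_id'].iloc[0],
                            dir=None, show=True, plotly=False)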
def plot_ts_and_anomalies(ts, value_col, anomalies, anomaly_scores, ts_only=False,
                          dir=None, show=True, plotly=False):
    assert isinstance(ts, pd.DataFrame)
    assert isinstance(value_col, str)
    assert not pd.isnull(ts[value_col]).any(), 'value_col has missing data'
    assert isinstance(anomalies, list)
    assert isinstance(anomaly_scores, TimeSeries)
    assert isinstance(ts_only, bool)
    assert isinstance(dir, (str, type(None)))
    if dir is not None:
        assert dir[-1] == '/'
        if not os.path.exists(dir):
            os.makedirs(dir)
    assert isinstance(show, bool)
    assert isinstance(plotly, bool)
    assert len(ts['timestamp'].unique()) == ts.shape[0], 'timestamp should not have duplicated values'
    if (len(anomalies) == 0) or ts_only:
        # plot ts only
        if len(anomalies) == 0:
            log.debug('Found no anomalies.')
        plt.plot_date(ts['timestamp'], ts[value_col], color='blue', fmt='-')
        plt.title('ts', size=12)
        if show:
            plt.show()
        # TODO: add plotly plot in this case as well
    else:
        # plot ts and anomalies
        log.debug('Found {} anomalies.'.format(len(anomalies)))
        scores = anomaly_scores.values
        fig, ax = plt.subplots(2, 1)
        # plot ts
        ax[0].plot_date(ts['timestamp'], ts[value_col], color='blue', fmt='-')
        ax[0].set_title('ts', size=12)
        # plot anomalies on top of ts
        for anomaly in anomalies:
            anomaly_time_window = anomaly.get_time_window()
            epoch_left = anomaly_time_window[0]
            epoch_right = anomaly_time_window[1]
            timestamp_left = ts.loc[ts['epoch'] == epoch_left, 'timestamp'].values[0]
            timestamp_right = ts.loc[ts['epoch'] == epoch_right, 'timestamp'].values[0]
            ax[0].axvspan(timestamp_left, timestamp_right, alpha=0.5, color='gray')
        # plot anomaly scores
        ax[1].plot_date(ts['timestamp'], scores, color='red', fmt='-')
        ax[1].set_title('scores', size=12)
        if show:
            fig.show()
        if dir is not None:
            file_name = 'ts_and_anomaly_scores.pdf'
            full_path = dir + file_name
            fig.set_size_inches(10, 10)
            fig.savefig(full_path, dpi=100)
        if plotly:
            if dir is not None:
                file_name = 'ts_and_anomaly_scores.html'
                full_path = dir + file_name
                time_series = go.Scatter(x=ts['timestamp'], y=ts[value_col], name='ts',
                                         mode='lines', line=dict(color='blue'))
                anomaly_scores = go.Scatter(x=ts['timestamp'], y=scores, name='scores',
                                            line=dict(color='red'))
                fig = tools.make_subplots(rows=2, cols=1, specs=[[{}], [{}]],
                                          shared_xaxes=True, shared_yaxes=False)
                fig.append_trace(time_series, 1, 1)
                fig.append_trace(anomaly_scores, 2, 1)
                fig['layout'].update(title='Time series and anomaly scores')
                plot(fig, filename=full_path)
            else:
                log.debug('Need to supply a dir in order to generate plotly chart.')
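# Hedged sketch (not part of the original source): the anomaly objects above expose get_time_window()
# and the scores behave like a TimeSeries with a .values attribute, which matches luminol's interfaces.
# Assuming luminol is indeed the detector in use, and assuming a hypothetical CSV with 'epoch' and
# 'value' columns, the inputs could be produced roughly like this.
from luminol.anomaly_detector import AnomalyDetector

ts = load_ts_data(path='data/example_ts.csv', epoch_col='epoch')  # hypothetical file
ts_dict = dict(zip(ts['epoch'], ts['value']))                     # 'value' column is assumed
detector = AnomalyDetector(ts_dict)
plot_ts_and_anomalies(ts, value_col='value',
                      anomalies=detector.get_anomalies(),
                      anomaly_scores=detector.get_all_scores(),
                      dir=None, show=True, plotly=False)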
import csv
import datetime

import pandas as pd

from src.utils.logger import log

pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.max_rows', None, 'display.max_columns', None)

change_timestamp_format = False
if change_timestamp_format:
    log.debug('Loading single_batch data ..')
    date_parser = lambda x: datetime.datetime.strptime(x, '%d/%m/%y %H:%M')
    data = pd.read_csv('/Users/yuval/Downloads/Sensor_readings.csv',
                       parse_dates=['end_time_stamp'],
                       infer_datetime_format=True,
                       date_parser=date_parser)
    log.debug('Done loading single_batch data.')
    date_format = u'%Y-%m-%d'
    timestamp_format = u'%Y-%m-%d %H:%M:%S'
    log.debug('Persisting to csv new timestamp format ..')
    data.to_csv('/Users/yuval/Downloads/Sensor_readings_YUVAL.csv',
                index=False,
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                date_format=timestamp_format)
    log.debug('Done persisting to csv new timestamp format.')
else:
    log.debug('Loading single_batch data ..')