def reshape_data_tsfresh(seq_dataset, n_classes, n_steps, settings):
    """Transform a sequences dataset into a dataset of tsfresh features."""
    len_data = seq_dataset.shape[0]
    data_divided = []
    for i in range(n_classes):
        data_divided.append(seq_dataset[:, :, i].reshape(-1))
    to_extract = []
    for i in range(n_classes):
        ids = np.arange(len_data).repeat(n_steps)
        tmp = np.vstack((ids, data_divided[i]))
        tmp = tmp.T
        to_extract.append(pd.DataFrame(data=tmp, columns=["id", "value"]))
    tfs = []
    # Parameters of the tsfresh feature extraction
    if settings == "complete":
        settings = ComprehensiveFCParameters()
    elif settings == "efficient":
        settings = EfficientFCParameters()
    elif settings == "minimal":
        settings = MinimalFCParameters()
    for i in range(n_classes):
        tf = tsfresh.extract_features(
            to_extract[i], column_id="id", default_fc_parameters=settings
        )
        tfs.append(tf)
    data_feat = pd.concat(
        [tfs[i].reindex(tfs[0].index) for i in range(n_classes)], axis=1
    )
    print(data_feat.shape)
    data_feat.fillna(0, inplace=True)
    data_feat.replace([np.inf, -np.inf], 0, inplace=True)
    data_tensor = torch.from_numpy(data_feat.values).float()
    return data_tensor
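
# Usage sketch for reshape_data_tsfresh (illustrative shapes only; assumes the
# imports used above -- numpy as np, pandas as pd, torch, tsfresh and the
# FCParameters classes -- are in scope).
demo_dataset = np.random.randn(16, 50, 3)  # (samples, steps, channels/classes)
demo_tensor = reshape_data_tsfresh(demo_dataset, n_classes=3, n_steps=50,
                                   settings="minimal")
print(demo_tensor.shape)  # (16, n_features_per_channel * 3)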
def _extract_tsfresh_features(self, X):
    X_df = self._convert_to_df(X)
    X_df_no_nans = X_df.dropna()
    if self.extraction_type == "minimal":
        extraction_setting = MinimalFCParameters()
    elif self.extraction_type == "efficient":
        extraction_setting = EfficientFCParameters()
    elif self.extraction_type == "all":
        extraction_setting = ComprehensiveFCParameters()
    else:
        raise ValueError(
            f"{self.extraction_type} is not a supported feature extraction option. Please choose one from "
            f"the following options: [minimal, efficient, all]."
        )
    # Extract time series features from the dataframe.
    # Replace any NaNs and infs in the extracted features with median/extreme
    # values for that column.
    tsfresh_features = extract_features(
        X_df_no_nans,
        default_fc_parameters=extraction_setting,
        column_id="id",
        column_sort="time",
        impute_function=impute,
    )
    # If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
    # impute each tsfresh feature for those observations with the median of that feature.
    tsfresh_features_imputed = impute(
        tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1))
    )
    return tsfresh_features_imputed, X_df
def test_gen_global_feature_multi_id(self):
    dates = pd.date_range('1/1/2019', periods=8)
    data = np.random.randn(8, 3)
    df = pd.DataFrame({"datetime": dates,
                       "values": data[:, 0],
                       "A": data[:, 1],
                       "B": data[:, 2],
                       "id": ["00"] * 4 + ["01"] * 4})
    from tsfresh.feature_extraction import ComprehensiveFCParameters
    from tsfresh.feature_extraction import MinimalFCParameters
    from tsfresh.feature_extraction import EfficientFCParameters
    for params in [ComprehensiveFCParameters(),
                   MinimalFCParameters(),
                   EfficientFCParameters()]:
        output_df, _ = generate_global_features(input_df=df,
                                                column_id="id",
                                                column_sort="datetime",
                                                default_fc_parameters=params)
        assert "datetime" in output_df.columns
        assert "values" in output_df.columns
        assert "A" in output_df.columns
        assert "B" in output_df.columns
        assert "id" in output_df.columns
        for col in output_df.columns:
            if col in ["datetime", "values", "A", "B", "id"]:
                continue
            # A global feature must be constant within each id and contain no NaNs
            assert len(set(output_df[output_df["id"] == "00"][col])) == 1
            assert len(set(output_df[output_df["id"] == "01"][col])) == 1
            assert output_df[output_df["id"] == "00"][col].isna().sum() == 0
            assert output_df[output_df["id"] == "01"][col].isna().sum() == 0
def featurize_set(ids, fc_params=None):
    if fc_params is None:
        fc_params = EfficientFCParameters()
    X_df = pd.DataFrame()
    for id in tqdm(ids):
        X_df = pd.concat([X_df, featurize_audio(id, fc_params)])
    return X_df
def main():
    files = pd.read_excel(
        '/home/velaraptor/Downloads/Raw Data 10yrs (2018).xlsx', header=1)
    files = files.fillna(0)
    groups = files.groupby('Name')
    forecast_df = []
    for name, group in tqdm.tqdm(groups):
        if len(group) > 1:
            group.index = group.Year
            df_shift, y = make_forecasting_frame(group["FantPt"], kind=name,
                                                 max_timeshift=10,
                                                 rolling_direction=1)
            forecast_df.append(df_shift)

    features_df = []
    for sample in tqdm.tqdm(forecast_df):
        X = extract_features(sample, column_id="id", column_sort="time",
                             column_value="value", impute_function=impute,
                             show_warnings=False, disable_progressbar=True,
                             default_fc_parameters=EfficientFCParameters())
        X = X.reset_index()
        X.loc[:, 'Name'] = sample['kind']
        features_df.append(X)
    features_time_series = pd.concat(features_df)
    features_time_series.to_csv('features_time_series.csv', index=False)
def get_tsfresh_features(df):
    """Calculate different aggregates/descriptive statistics, using tsfresh,
    of some of the more informative raw timeseries.

    Parameters:
    -----------
    - df: pd.DataFrame
        the raw (timeseries) data containing the categorical features

    Returns:
    --------
    - ts_features: pd.DataFrame
        a DataFrame with each record a process, containing the features
        based on the binary-valued timeseries
    """
    # We only keep the feature extraction functions that are not too
    # computationally expensive & that do not return too many values
    extraction_settings = EfficientFCParameters()
    filtered_funcs = [
        'abs_energy', 'mean_abs_change', 'mean_change', 'skewness',
        'kurtosis', 'absolute_sum_of_changes', 'longest_strike_below_mean',
        'longest_strike_above_mean', 'count_above_mean', 'count_below_mean',
        'last_location_of_maximum', 'first_location_of_maximum',
        'last_location_of_minimum', 'first_location_of_minimum',
        'percentage_of_reoccurring_datapoints_to_all_datapoints',
        'percentage_of_reoccurring_values_to_all_values',
        'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points',
        'ratio_value_number_to_time_series_length', 'cid_ce',
        'symmetry_looking', 'large_standard_deviation', 'quantile',
        'autocorrelation', 'number_peaks', 'binned_entropy',
        'index_mass_quantile', 'linear_trend', 'number_crossing_m',
        'augmented_dickey_fuller', 'number_cwt_peaks', 'agg_autocorrelation',
        'spkt_welch_density', 'friedrich_coefficients',
        'max_langevin_fixed_point', 'c3', 'ar_coefficient',
        'mean_second_derivative_central', 'ratio_beyond_r_sigma',
        'energy_ratio_by_chunks', 'partial_autocorrelation',
        'fft_aggregated', 'time_reversal_asymmetry_statistic', 'range_count'
    ]
    filtered_settings = {}
    for func in filtered_funcs:
        filtered_settings[func] = extraction_settings[func]

    # Extract the features
    ts_features = extract_features(
        df[['process_id', 'timestamp', 'return_turbidity', 'return_flow',
            'supply_flow', 'target_value', 'flow_diff']],
        column_id='process_id', column_sort="timestamp",
        column_kind=None, column_value=None,
        impute_function=impute, default_fc_parameters=filtered_settings,
        show_warnings=False, disable_progressbar=True)
    return ts_features
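
# Hypothetical input for get_tsfresh_features: a long-format frame with one
# block of rows per process_id and the exact columns the function selects.
demo_df = pd.DataFrame({
    'process_id': [1] * 10 + [2] * 10,
    'timestamp': list(range(10)) * 2,
    'return_turbidity': np.random.rand(20),
    'return_flow': np.random.rand(20),
    'supply_flow': np.random.rand(20),
    'target_value': np.random.rand(20),
    'flow_diff': np.random.rand(20),
})
demo_features = get_tsfresh_features(demo_df)  # one row per process_id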
def add_tsfresh_day(new_data, data, tsfresh_features, columns):
    # The dictionary containing the features that we want to extract and the
    # settings for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        all_days = []
        for day in range(len(data[participant])):
            # We only take the columns that we are interested in
            sub_data = data[participant][day].loc[
                data[participant][day]['variable'].isin(columns)]
            # Drop all nan values
            sub_data = sub_data.dropna(axis=0)
            # If a column is missing (or contained only nan values) we add a
            # row with that column and a 0
            for col in columns:
                if col not in sub_data['variable'].values:
                    new_row = sub_data.iloc[0].copy(deep=True)
                    new_row['variable'] = col
                    new_row['value'] = 0
                    sub_data = sub_data.append(new_row, ignore_index=True)
            # Extract features for every variable still left in the dataframe
            extracted = extract_features(sub_data,
                                         default_fc_parameters=settings,
                                         column_id='variable',
                                         column_sort='time_seconds',
                                         column_value='value')
            # We do not want multiple rows in the case of multiple variables,
            # so we stack the frame into a single row and rename the columns
            # so that we know what kind of features they are
            extracted = extracted.stack()
            extracted.index = extracted.index.map('{0[1]}_{0[0]}_day'.format)
            extracted = extracted.to_frame().T
            # Add the extracted features to a list
            all_days.append(extracted)
        # Concat the days to make a new dataframe and reset the index to
        # prevent conflicts
        all_days = pd.concat(all_days, axis=0).reset_index(drop=True)
        # Add the new features to the data
        new_data[participant] = pd.concat([new_data[participant], all_days],
                                          axis=1)
    return new_data
def create_agg_tsfresh(x_train, y_train, x_val, y_val, input_path, size=None):
    y_train = pd.DataFrame(y_train).idxmax(axis=1)
    y_val = pd.DataFrame(y_val).idxmax(axis=1)
    if os.path.exists(input_path + 'agg_train.csv') and \
            os.path.exists(input_path + 'agg_val.csv') and size is None:
        x_train_filtered = pd.read_csv(input_path + 'agg_train.csv', index_col=0)
        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)
        x_train_filtered = x_train_filtered.loc[:, x_train_filtered.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]
    else:
        x_train_df = df_from_3d_np(x_train)
        x_val_df = df_from_3d_np(x_val)
        x_train_df = x_train_df.fillna(0)
        x_val_df = x_val_df.fillna(0)
        x_train_extracted = extract_features(
            x_train_df, column_id='index', column_sort='time',
            default_fc_parameters=EfficientFCParameters())
        if 'mts_archive' in input_path:
            x_train_sel = select_features(x_train_extracted, y_train, n_jobs=0)
            # If not enough features were selected, take a larger set
            if x_train_sel.shape[1] < 300:
                X_best = SelectKBest(f_classif, k='all').fit(x_train_extracted,
                                                             y_train)
                ufs_scores = pd.DataFrame(X_best.scores_,
                                          index=x_train_extracted.columns,
                                          columns=['score']).sort_values(
                                              by=['score'], ascending=False)
                x_train_sel = x_train_extracted[ufs_scores.iloc[:300].index]
            x_train_extracted = x_train_sel
        x_train_extracted = x_train_extracted.dropna(axis='columns')
        x_train_extracted.to_csv(input_path + 'agg_train.csv')
        y_train.to_csv(input_path + 'y_train.csv')
        # Note: this branch expects agg_val.csv to already exist on disk
        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)
        x_train_filtered = x_train_extracted.loc[:, x_train_extracted.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]
    y_val.to_csv(input_path + 'y_test.csv')
    return x_train_filtered, y_train, x_val_filtered, y_val
def features(x: pd.Series) -> pd.DataFrame:
    data = pd.DataFrame(dtype=np.float64)
    data['x'] = x
    data['id'] = 1
    df = extract_features(data, column_id='id',
                          default_fc_parameters=EfficientFCParameters())
    return df
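
# Quick check of the features() helper on a synthetic series: the result is a
# single-row frame (id == 1) with one column per extracted tsfresh feature.
demo_series = pd.Series(np.sin(np.linspace(0, 10, 200)))
print(features(demo_series).shape)  # (1, n_features)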
def relevance(self, X, y):
    from tsfresh.feature_extraction import EfficientFCParameters
    features_extracted = tsfresh.extract_features(
        X, column_id="id",
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=True)
    tsfresh.utilities.dataframe_functions.impute(features_extracted)
    relevance_features = tsfresh.feature_selection.relevance.calculate_relevance_table(
        features_extracted, y)
    return features_extracted, relevance_features
def gen_global_feature(self, settings="comprehensive", full_settings=None):
    '''
    Generate per-time-series features for each time series.
    This method is implemented with tsfresh.

    :param settings: str or dict. If a string is set, then it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set, then
           it should follow the instruction for default_fc_parameters in
           tsfresh. The value defaults to "comprehensive".
    :param full_settings: dict. It should follow the instruction for
           kind_to_fc_parameters in tsfresh. The value defaults to None.

    :return: the tsdataset instance.
    '''
    if full_settings is not None:
        self.df, additional_feature = generate_global_features(
            input_df=self.df,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings)
        self.feature_col += additional_feature
        return self

    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    default_params = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }
    if isinstance(settings, str):
        assert settings in ["comprehensive", "minimal", "efficient"], \
            f"settings str should be one of 'comprehensive', 'minimal', " \
            f"'efficient', but found {settings}."
        default_fc_parameters = default_params[settings]
    else:
        default_fc_parameters = settings

    self.df, additional_feature = generate_global_features(
        input_df=self.df,
        column_id=self.id_col,
        column_sort=self.dt_col,
        default_fc_parameters=default_fc_parameters)
    self.feature_col += additional_feature
    return self
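
# Hypothetical cascade usage of gen_global_feature (tsdata stands in for a
# constructed TSDataset instance); returning self makes chaining possible.
tsdata = tsdata.gen_global_feature(settings="minimal")
# A tsfresh default_fc_parameters dict may be passed instead of a preset name:
tsdata = tsdata.gen_global_feature(settings={"mean": None, "maximum": None})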
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    # The dictionary containing the features that we want to extract and the
    # settings for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        # First we add the necessary columns
        data[participant]['id'] = 0
        data[participant]['index'] = data[participant].index
        # We create the rolled time series, which also creates new ids. Note
        # that setting max_timeshift to None means it takes the maximal
        # possible window lengths.
        rolled_series = roll_time_series(data[participant], column_id='id',
                                         column_sort='index', max_timeshift=k)
        all_features = []
        for column in columns:
            # We extract the features for every element of the time series,
            # which returns a dataframe with the same number of rows as the
            # original dataframe but a different number of columns
            extracted = extract_features(rolled_series,
                                         default_fc_parameters=settings,
                                         column_id='id',
                                         column_sort='index',
                                         column_value=column)
            # We need to reset the indexes as they have been changed and add
            # them to our list of features
            all_features.append(extracted.reset_index(drop=True))
        # Add all the features together
        extracted = pd.concat(all_features, axis=1)
        # We drop the columns that we previously created because we do not
        # want them in the data (df.drop would work here as well)
        del data[participant]['id']
        del data[participant]['index']
        data[participant] = pd.concat([data[participant], extracted], axis=1)
    return data
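
# Small demonstration of the roll_time_series call that
# add_tsfresh_participant relies on: each original row becomes the end point
# of a window holding up to k preceding rows, keyed by a new composite id.
demo = pd.DataFrame({'id': 0, 'index': range(5),
                     'value': [1., 2., 3., 4., 5.]})
rolled = roll_time_series(demo, column_id='id', column_sort='index',
                          max_timeshift=2)
print(rolled['id'].nunique())  # 5 windows, one ending at each original row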
def features_generator(path_to_file):
    signals = pd.read_csv(path_to_file)
    seg = int(path_to_file.split('/')[-1].split('.')[0])
    signals['segment_id'] = seg
    # Keep only the columns that are non-zero for more than half of the
    # 60001 samples
    sel = signals.fillna(0).astype(bool).sum(axis=0) / 60001 > 0.5
    signals = signals.fillna(0).loc[:, sel]
    extracted_features = extract_features(signals,
                                          column_id='segment_id',
                                          default_fc_parameters=EfficientFCParameters(),
                                          n_jobs=0,
                                          disable_progressbar=True,
                                          chunksize=None)
    return extracted_features
def extract(self, signal):
    df = pd.DataFrame(signal.reshape(-1, 1))
    df['time'] = np.arange(len(df), dtype=int)
    df['id'] = 1
    features = extract_features(
        df, column_id="id", column_sort="time",
        default_fc_parameters=EfficientFCParameters())
    results = {}
    values, names = features.iloc[0, :].values, features.columns
    for name, value in zip(names, values):
        results[self.__class__.__name__ + name] = value
    return results
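
# Hedged usage sketch: extract() expects a 1-D numpy signal and returns a flat
# dict of feature values keyed by the extractor's class name plus the tsfresh
# feature name ("extractor" stands in for an instance of the class above).
demo_signal = np.random.randn(256)
feature_dict = extractor.extract(demo_signal)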
def compute_tsfresh_features(x, save_path, nb_splits=8, which_set='training'):
    print('Processing %s set...' % which_set)
    n = x.shape[0]
    split_breaks = [int(n / nb_splits) * i for i in range(nb_splits)] + [n]
    for i in range(nb_splits):
        start = split_breaks[i]
        stop = split_breaks[i + 1]
        print('Number of rows being processed:', stop - start)
        features = extract_features(TSFormatting().transform(x.iloc[start:stop]),
                                    column_id='id', column_sort='time',
                                    default_fc_parameters=EfficientFCParameters())
        features['neuron_id'] = x.iloc[start:stop]['neuron_id']
        # Write the header only for the first chunk, then append
        if i == 0:
            features.to_csv(save_path, mode='w', header=True, index=True)
        else:
            features.to_csv(save_path, mode='a', header=False, index=True)
        del features
def transform_ts(start, end, file):
    # List with all column names to test
    train_columns = pq.read_schema(file).names
    X = pd.DataFrame(data=None)
    for i in train_columns[start:end]:
        # Turn the parquet column into a dataframe of one single signal
        df_signal = pq.read_pandas(file, columns=[i]).to_pandas()
        sig = np.ravel(df_signal.iloc[:, 0].to_numpy())  # signal to numpy
        t = df_signal.index.to_numpy()  # time to numpy
        x_dn = de_noising(high_pass_filter(sig))
        x_deleted = delete_repeat(x_dn)
        x_deleted_cond = (x_deleted < 99998)
        x_deleted = x_deleted[x_deleted_cond]
        print(x_deleted.shape)
        t_deleted = t[x_deleted_cond]
        # Generating new time series features from the signal
        master_train = pd.DataFrame({
            0: x_deleted,
            1: np.repeat(i, x_deleted.shape[0]),
            2: t_deleted
        })
        extraction_settings = EfficientFCParameters()
        X_signal = extract_features(
            master_train, column_id=1, column_sort=2,
            impute_function=impute,
            default_fc_parameters=extraction_settings)
        print("Number of extracted features in {}: {}.".format(
            i, X_signal.shape[1]))
        X = pd.concat([X, X_signal])
    return X
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import l1_min_c
from pathlib import Path

print('Reading data...')
wav_files = glob.glob('sounds/kick/*.wav') + \
    glob.glob('sounds/snare/*.wav') + glob.glob('sounds/tom/*.wav')
all_audio = pd.concat([audio_to_dataframe(path) for path in wav_files])
all_labels = pd.Series(np.repeat(['kick', 'snare', 'tom'], 25),
                       index=wav_files)
all_audio.head()

regenerate_tsfresh = True
if regenerate_tsfresh:
    print('Generating tsfresh data...')
    settings = EfficientFCParameters()
    audio_tsfresh = extract_relevant_features(all_audio, all_labels,
                                              column_id='file_id',
                                              column_sort='time_id',
                                              default_fc_parameters=settings)
else:
    print('Reading tsfresh data...')
    all_labels = pd.read_pickle('pkl/drum_tsfresh_labels.pkl')
    audio_tsfresh = pd.read_pickle('pkl/drum_tsfresh.pkl')

print('Running logistic regression CV...')
print('Started CV %s' % datetime.now())
cs = l1_min_c(audio_tsfresh, all_labels, loss='log') * np.logspace(0, 7, 16)
cv_result = LogisticRegressionCV(Cs=cs, penalty='l1',
from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, \
    MinimalFCParameters, EfficientFCParameters

DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                  "minimal": MinimalFCParameters(),
                  "efficient": EfficientFCParameters()}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstraction of a time series dataset.
        Cascade calls are supported for most of the transform methods.
        '''
        self.df = data
        self.id_col = schema["id_col"]
        self.dt_col = schema["dt_col"]
        self.feature_col = schema["feature_col"].copy()
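
# Besides the three preset objects in DEFAULT_PARAMS, tsfresh also accepts a
# plain dict for default_fc_parameters: keys are feature-calculator names and
# values are None (no parameters) or a list of parameter dicts. A minimal
# custom example:
CUSTOM_PARAMS = {
    "mean": None,
    "standard_deviation": None,
    "quantile": [{"q": 0.1}, {"q": 0.9}],
}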
def create_test_features_profile(json_file):
    filename = os.path.basename(json_file)
    metric = filename.replace('.mirage.redis.24h.json', '')
    metric_data_dir = os.path.dirname(json_file)
    anomaly_json = json_file
    ts_csv = '%s.test.echo.tsfresh.input.csv' % json_file
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'
    if os.path.isfile(t_fname_out):
        return t_fname_out

    start = timer()
    with open(anomaly_json, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    try:
        timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
        del raw_timeseries
        timeseries = literal_eval(timeseries_array_str)
        del timeseries_array_str
    except:
        print('error :: could not literal_eval %s' % anomaly_json)
        print(traceback.format_exc())
        return False

    datapoints = timeseries
    del timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
    for ts, value in converted:
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)
    del converted

    df = pd.read_csv(ts_csv, delimiter=',', header=None,
                     names=['metric', 'timestamp', 'value'])
    df.columns = ['metric', 'timestamp', 'value']

    # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh
    # feature extraction method
    df_features = extract_features(
        df, default_fc_parameters=EfficientFCParameters(),
        column_id='metric', column_sort='timestamp', column_kind=None,
        column_value=None, disable_progressbar=True)
    del df

    # Write the features to disk
    fname_out = fname_in + '.features.csv'
    # Transpose the features and create the transposed features csv
    df_t = df_features.transpose()
    t_fname_out = fname_in + '.features.transposed.csv'
    df_t.to_csv(t_fname_out)
    del df_t

    # Calculate the count and sum of the feature values
    df_sum = pd.read_csv(
        t_fname_out, delimiter=',', header=0,
        names=['feature_name', 'value'])
    df_sum.columns = ['feature_name', 'value']
    df_sum['feature_name'] = df_sum['feature_name'].astype(str)
    df_sum['value'] = df_sum['value'].astype(float)
    features_count = len(df_sum['value'])
    features_sum = df_sum['value'].sum()
    del df_sum

    return t_fname_out
def calculate_features_other_minmax(use_file, i_json_file, metric):
    fp_id = 'testing.feature2484'
    base_name = metric
    metric_timestamp = 'none'
    not_anomalous = False
    minmax_not_anomalous = False
    minmax = 0
    minmax_check = True

    with open(use_file, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    del raw_timeseries
    anomalous_timeseries = literal_eval(timeseries_array_str)
    anomalous_ts_values_count = len(anomalous_timeseries)

    with open(i_json_file, 'r') as f:
        fp_raw_timeseries = f.read()
    # Convert the timeseries to csv
    fp_timeseries_array_str = str(fp_raw_timeseries).replace('(', '[').replace(')', ']')
    del fp_raw_timeseries
    fp_id_metric_ts = literal_eval(fp_timeseries_array_str)
    fp_id_metric_ts_values_count = len(fp_id_metric_ts)

    try:
        range_tolerance = settings.IONOSPHERE_MINMAX_SCALING_RANGE_TOLERANCE
    except:
        range_tolerance = 0.15
    range_tolerance_percentage = range_tolerance * 100

    check_range = False
    range_similar = False
    if fp_id_metric_ts:
        if anomalous_ts_values_count > 0:
            check_range = True

    lower_range_similar = False
    upper_range_similar = False
    min_fp_value = None
    min_anomalous_value = None
    max_fp_value = None
    max_anomalous_value = None
    if check_range:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            min_fp_value = min(minmax_fp_values)
            max_fp_value = max(minmax_fp_values)
        except:
            min_fp_value = False
            max_fp_value = False
        try:
            minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
            min_anomalous_value = min(minmax_anomalous_values)
            max_anomalous_value = max(minmax_anomalous_values)
        except:
            min_anomalous_value = False
            max_anomalous_value = False

        lower_range_not_same = True
        try:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_not_same = False
                lower_range_similar = True
                print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
        except:
            lower_range_not_same = True
        if min_fp_value and min_anomalous_value and lower_range_not_same:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_similar = True
                print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
            else:
                lower_min_fp_value = int(min_fp_value - (min_fp_value * range_tolerance))
                upper_min_fp_value = int(min_fp_value + (min_fp_value * range_tolerance))
                if int(min_anomalous_value) in range(lower_min_fp_value, upper_min_fp_value):
                    lower_range_similar = True
                    print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                        str(min_fp_value), str(min_anomalous_value),
                        str(range_tolerance_percentage)))
        if not lower_range_similar:
            print('lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                str(min_fp_value), str(min_anomalous_value)))

        upper_range_not_same = True
        try:
            if int(max_fp_value) == int(max_anomalous_value):
                upper_range_not_same = False
                upper_range_similar = True
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(max_fp_value), str(max_anomalous_value)))
        except:
            upper_range_not_same = True
        if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
            # @added 20180717 - Task #2446: Optimize Ionosphere
            #                   Feature #2404: Ionosphere - fluid approximation
            # On low values such as 1 and 2, the range_tolerance should be
            # adjusted to account for the very small range.
            lower_max_fp_value = int(max_fp_value - (max_fp_value * range_tolerance))
            upper_max_fp_value = int(max_fp_value + (max_fp_value * range_tolerance))
            if int(max_anomalous_value) in range(lower_max_fp_value, upper_max_fp_value):
                upper_range_similar = True
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                    str(max_fp_value), str(max_anomalous_value),
                    str(range_tolerance_percentage)))
            else:
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                    str(max_fp_value), str(max_anomalous_value)))

    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        print('the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly, Min-Max scaling will be skipped')

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            print('minmax_fp_ts list populated with the minmax scaled time series with %s data points' % str(len(minmax_fp_ts)))
            del minmax_fp_values
        except:
            print('error :: could not minmax scale fp id %s time series for %s' % (
                str(fp_id), str(base_name)))
    if not minmax_fp_ts:
        print('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(-14, 14):
            try:
                minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
                del anomalous_timeseries
                del minmax_anomalous_values
            except:
                print('error :: could not minmax scale current time series anomalous_timeseries for fp id %s for %s' % (
                    str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                print('minmax_anomalous_ts is populated with %s data points' % str(len(minmax_anomalous_ts)))
            else:
                print('error :: minmax_anomalous_ts is not populated')
        else:
            print('minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s' % (
                str(anomalous_ts_values_count), str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    if os.path.isfile(minmax_fp_ts_csv):
        os.remove(minmax_fp_ts_csv)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    if os.path.isfile(minmax_fp_fname_out):
        os.remove(minmax_fp_fname_out)
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    if os.path.isfile(anomalous_ts_csv):
        os.remove(anomalous_ts_csv)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'
    if os.path.isfile(anomalous_fp_fname_out):
        os.remove(anomalous_fp_fname_out)

    # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh
    # feature extraction method
    # tsf_settings = ReasonableFeatureExtractionSettings()
    # tsf_settings.disable_progressbar = True

    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    if minmax_anomalous_ts and minmax_fp_ts:
        if not os.path.isfile(minmax_fp_ts_csv):
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    print('error :: could not write to file %s' % (str(minmax_fp_ts_csv)))
            del converted
        else:
            print('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            print('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            print('file exists to create the minmax_fp_ts data frame from - %s' % minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv, delimiter=',', header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            print('error :: failed to create data frame from %s' % (str(minmax_fp_ts_csv)))

        try:
            # @modified 20210101 - Task #3928: Update Skyline to use new
            # tsfresh feature extraction method
            df_features = extract_features(
                df, default_fc_parameters=EfficientFCParameters(),
                column_id='metric', column_sort='timestamp', column_kind=None,
                column_value=None, disable_progressbar=True)
        except:
            print('error :: failed to create df_features from %s' % (str(minmax_fp_ts_csv)))

        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)
        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(
                minmax_fp_fname_out, delimiter=',', header=0,
                names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            print('minmax_fp_ts - features_count: %s, features_sum: %s' % (
                str(minmax_fp_features_count), str(minmax_fp_features_sum)))
            del df_sum
        except:
            print('error :: failed to create df_sum from %s' % (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            print('debug :: minmax_fp_features_count of the minmax_fp_ts is %s' % str(minmax_fp_features_count))
        else:
            print('error :: minmax_fp_features_count is %s' % str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)
            del converted

        df = pd.read_csv(anomalous_ts_csv, delimiter=',', header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh
        # feature extraction method
        df_features_current = extract_features(
            df, default_fc_parameters=EfficientFCParameters(),
            column_id='metric', column_sort='timestamp', column_kind=None,
            column_value=None, disable_progressbar=True)
        del df

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
            del df_t
            del df_features_current

        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(
            anomalous_fp_fname_out, delimiter=',', header=0,
            names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        print('minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s' % (
            str(minmax_anomalous_features_count),
            str(minmax_anomalous_features_sum)))
        del df_sum_2
        del minmax_anomalous_ts

    percent_different = 100
    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
            sums_array = np.array([minmax_fp_features_sum,
                                   minmax_anomalous_features_sum], dtype=float)
            calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            print('percent_different between minmax scaled features sums - %s' % str(percent_different))
        except:
            print('error :: failed to calculate percent_different from minmax scaled features sums')

    if percent_different:
        almost_equal = None
        try:
            np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
            almost_equal = True
        except:
            almost_equal = False
        if almost_equal:
            minmax_not_anomalous = True
            print('minmax scaled common features sums are almost equal, not anomalous')

        # if diff_in_sums <= 1%:
        if percent_different < 0:
            new_pdiff = percent_different * -1
            percent_different = new_pdiff

        # @modified 20190321
        # if percent_different < (settings.IONOSPHERE_FEATURES_PERCENT_SIMILAR + 1):
        if percent_different < IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR:
            minmax_not_anomalous = True
            # log
            print('not anomalous - minmax scaled features profile match - %s - %s' % (
                base_name, str(minmax_not_anomalous)))
            print('minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous' % (
                str(IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR),
                str(fp_id), str(percent_different)))
        if minmax_not_anomalous:
            not_anomalous = True
            minmax = 1

    # Clean up the time series resources created for graphing in the
    # matched page
    try:
        clean_file = anomalous_ts_csv
        if os.path.isfile(anomalous_ts_csv):
            os.remove(anomalous_ts_csv)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_ts_csv file to clean up')
    try:
        clean_file = anomalous_fp_fname_out
        if os.path.isfile(anomalous_fp_fname_out):
            os.remove(anomalous_fp_fname_out)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_fp_fname_out file to clean up')

    return not_anomalous
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')

# TODO: Make a dict from EfficientFCParameters with faster features
extraction_settings = EfficientFCParameters()
filtered_funcs = [
    'abs_energy', 'mean_abs_change', 'mean_change', 'skewness', 'kurtosis',
    'absolute_sum_of_changes', 'longest_strike_below_mean',
    'longest_strike_above_mean', 'count_above_mean', 'count_below_mean',
    'last_location_of_maximum', 'first_location_of_maximum',
    'last_location_of_minimum', 'first_location_of_minimum',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
    'percentage_of_reoccurring_values_to_all_values',
    'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points',
    'ratio_value_number_to_time_series_length', 'cid_ce', 'symmetry_looking',
    'large_standard_deviation', 'quantile', 'autocorrelation',
    'number_peaks', 'binned_entropy', 'index_mass_quantile', 'linear_trend',
    'number_crossing_m', 'augmented_dickey_fuller', 'number_cwt_peaks',
    'agg_autocorrelation', 'spkt_welch_density', 'friedrich_coefficients',
    'max_langevin_fixed_point', 'c3', 'ar_coefficient',
    'mean_second_derivative_central', 'ratio_beyond_r_sigma',
    'energy_ratio_by_chunks', 'partial_autocorrelation', 'fft_aggregated',
    'time_reversal_asymmetry_statistic', 'range_count'
]
def predict(
    self,
    forecast_length: int,
    future_regressor=[],
    just_point_forecast: bool = False,
):
    """Generate forecast data immediately following the dates of the index
    supplied to .fit().

    Args:
        forecast_length (int): Number of periods of data to forecast ahead
        future_regressor (numpy.Array): additional regressor
        just_point_forecast (bool): If True, return a pandas.DataFrame of
            just point forecasts

    Returns:
        Either a PredictionObject of forecasts and metadata, or if
        just_point_forecast == True, a dataframe of point forecasts
    """
    if not _has_tsfresh:
        raise ImportError("Package tsfresh is required")
    predictStartTime = datetime.datetime.now()
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame
    from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

    max_timeshift = self.max_timeshift
    regression_model = self.regression_model
    feature_selection = self.feature_selection
    sktraindata = self.df_train.copy()

    X = pd.DataFrame()
    y = pd.DataFrame()
    counter = 0
    for column in sktraindata.columns:
        df_shift, current_y = make_forecasting_frame(
            sktraindata[column],
            kind="time_series",
            max_timeshift=max_timeshift,
            rolling_direction=1,
        )
        current_X = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=tsfresh_impute,
            show_warnings=False,
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=1,
        )
        # current_X["feature_last_value"] = current_y.shift(1)
        current_X.rename(columns=lambda x: str(counter) + '_' + x,
                         inplace=True)
        X = pd.concat([X, current_X], axis=1)
        y = pd.concat([y, current_y], axis=1)
        counter += 1

    # Drop constant features
    X = X.loc[:, X.apply(pd.Series.nunique) != 1]
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(0)
    y = y.fillna(method='ffill').fillna(method='bfill')

    if feature_selection == 'Variance':
        from sklearn.feature_selection import VarianceThreshold
        sel = VarianceThreshold(threshold=0.15)
        X = pd.DataFrame(sel.fit_transform(X))
    if feature_selection == 'Percentile':
        from sklearn.feature_selection import SelectPercentile, chi2
        X = pd.DataFrame(
            SelectPercentile(chi2, percentile=20).fit_transform(
                X, y[y.columns[0]]))
    if feature_selection == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.feature_selection import SelectFromModel
        clf = DecisionTreeRegressor().fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = model.transform(X)
    if feature_selection == 'Lasso':
        from sklearn.linear_model import MultiTaskLasso
        from sklearn.feature_selection import SelectFromModel
        clf = MultiTaskLasso(max_iter=2000).fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = model.transform(X)

    # Drop the first line
    X = X.iloc[1:, ]
    y = y.iloc[1:]
    y = y.fillna(method='ffill').fillna(method='bfill')
    index = self.create_forecast_index(forecast_length=forecast_length)

    if regression_model == 'ElasticNet':
        from sklearn.linear_model import MultiTaskElasticNet
        regr = MultiTaskElasticNet(alpha=1.0, random_state=self.random_seed)
    elif regression_model == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        regr = DecisionTreeRegressor(random_state=self.random_seed)
    elif regression_model == 'MLP':
        from sklearn.neural_network import MLPRegressor
        # relu/tanh, lbfgs/adam, layer_sizes (100) (10)
        regr = MLPRegressor(
            hidden_layer_sizes=(10, 25, 10),
            verbose=self.verbose_bool,
            max_iter=200,
            activation='tanh',
            solver='lbfgs',
            random_state=self.random_seed,
        )
    elif regression_model == 'KNN':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.neighbors import KNeighborsRegressor
        # KNeighborsRegressor takes no random_state argument
        regr = MultiOutputRegressor(KNeighborsRegressor())
    elif regression_model == 'Adaboost':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import AdaBoostRegressor
        regr = MultiOutputRegressor(AdaBoostRegressor(n_estimators=200))
    else:
        regression_model = 'RandomForest'
        from sklearn.ensemble import RandomForestRegressor
        regr = RandomForestRegressor(random_state=self.random_seed,
                                     n_estimators=1000,
                                     verbose=self.verbose)

    regr.fit(X, y)

    combined_index = self.df_train.index.append(index)
    forecast = pd.DataFrame()
    sktraindata.columns = [x for x in range(len(sktraindata.columns))]

    for x in range(forecast_length):
        x_dat = pd.DataFrame()
        y_dat = pd.DataFrame()
        counter = 0
        for column in sktraindata.columns:
            df_shift, current_y = make_forecasting_frame(
                sktraindata.tail(max_timeshift)[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                n_jobs=1,
                default_fc_parameters=EfficientFCParameters(),
            )
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x,
                             inplace=True)
            x_dat = pd.concat([x_dat, current_X], axis=1)
            y_dat = pd.concat([y_dat, current_y], axis=1)
            counter += 1

        x_dat = x_dat[X.columns]
        rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))
        forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
        sktraindata = pd.concat([sktraindata, rfPred], axis=0,
                                ignore_index=True)
        sktraindata.index = combined_index[:len(sktraindata.index)]

    forecast.columns = self.column_names
    forecast.index = index

    if just_point_forecast:
        return forecast
    else:
        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train, forecast,
            prediction_interval=self.prediction_interval)
        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )
        return prediction
def get_tsfresh_feat(df, colName=None):
    df = df.reset_index()
    df.columns = ['timestamp', colName]
    df['id'] = 0  # mandatory for tsfresh to group on
    ext_feat = extract_features(df, column_id='id', column_value=colName,
                                column_sort='timestamp',
                                default_fc_parameters=EfficientFCParameters(),
                                disable_progressbar=True)
    ext_feat_val = ext_feat.values[0]
    return ext_feat_val
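
# Example call for get_tsfresh_feat, assuming a single-column frame indexed by
# timestamp (the helper resets the index and renames the columns itself).
demo = pd.Series(np.random.rand(48),
                 index=pd.date_range('2021-01-01', periods=48, freq='H'),
                 name='cpu').to_frame()
feat_values = get_tsfresh_feat(demo, colName='cpu')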
    return df


# Process the first 20 min of the raw scg signal and create a dataset with
# adaptive indexes and a corresponding time row to extract features from.
# Arguments are, in order: start_hour, start_min, end_hour, end_min
new_id_number_train, bin_length_train, targets_startInx_train, train_df = preprocess(
    raw_value, train_start_hour, train_start_minute, train_end_hour,
    train_end_minute)
# Process the last 7 minutes of the raw scg signal to predict future parameters.
new_id_number_test, bin_length_test, targets_startInx_test, test_df = preprocess(
    raw_value, test_start_hour, test_start_minute, test_end_hour,
    test_end_minute)

# Because of problems occurring during parallel processing, n_jobs is set to 0;
# it calculates more slowly but works fine.
train_extracted_features = extract_features(train_df, column_id="id",
                                            column_sort="time",
                                            default_fc_parameters=EfficientFCParameters(),
                                            n_jobs=0)
test_extracted_features = extract_features(test_df, column_id="id",
                                           column_sort="time",
                                           default_fc_parameters=EfficientFCParameters(),
                                           n_jobs=0)
# train_extracted_features.to_csv('train_final_features.csv', index=False)
# test_extracted_features.to_csv('test_final_features.csv', index=False)

train_features = train_extracted_features[features]
train_features_norm = data_segmenation_normalization1(train_features)
S, D, H, R, labels_train = label_modification_df(targets,
                                                 targets_startInx_train,
                                                 new_id_number_train,
                                                 bin_length_train)
train_df, val_df, test_df = data_shaping(train_features_norm, S, D, H, R)
test_array = test_df.values
test_array = np.expand_dims(test_array, axis=0)
def gen_rolling_feature(self,
                        window_size,
                        settings="comprehensive",
                        full_settings=None,
                        n_jobs=1):
    '''
    Generate aggregation features for each sample through rolling windows.
    This method is implemented with tsfresh. Make sure that the specified
    column name does not contain '__'.

    TODO: relationship with scale should be figured out.

    :param window_size: int, generate features according to the rolling result.
    :param settings: str or dict. If a string is set, then it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set, then
           it should follow the instruction for default_fc_parameters in
           tsfresh. The value defaults to "comprehensive".
    :param full_settings: dict. It should follow the instruction for
           kind_to_fc_parameters in tsfresh. The value defaults to None.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    from tsfresh.utilities.dataframe_functions import roll_time_series
    from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
    from tsfresh import extract_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    DEFAULT_PARAMS = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }

    assert not self._has_generate_agg_feature, \
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', " \
            f"'efficient', but found {settings}."
        default_fc_parameters = DEFAULT_PARAMS[settings]
    else:
        default_fc_parameters = settings

    assert window_size < self.df.groupby(self.id_col).size().min() + 1, \
        "gen_rolling_feature should have a window_size smaller than the " \
        "shortest time series length."
    df_rolled = roll_time_series(self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 max_timeshift=window_size - 1,
                                 min_timeshift=window_size - 1,
                                 n_jobs=n_jobs)
    if not full_settings:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            default_fc_parameters=default_fc_parameters,
            n_jobs=n_jobs)
    else:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings,
            n_jobs=n_jobs)
    impute_tsfresh(self.roll_feature_df)

    self.feature_col += list(self.roll_feature_df.columns)
    self.roll_additional_feature = list(self.roll_feature_df.columns)
    self._has_generate_agg_feature = True
    return self
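
# Hypothetical cascade call on a TSDataset instance; per the assert above,
# window_size must be no larger than the shortest time series length.
tsdata = tsdata.gen_rolling_feature(window_size=24, settings="minimal",
                                    n_jobs=1)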
def calculate_features_profile(current_skyline_app, timestamp, metric, context):
    """
    Calculates a tsfresh features profile from a training data set

    :param timestamp: the timestamp of the metric anomaly with training data
    :type timestamp: str
    :param metric: the base_name of the metric
    :type metric: str
    :param context: the context
    :type context: str

    :return: (features_profile_csv_file_path, successful, fail_msg,
        traceback_format_exc, calc_time)
    :rtype: (str, boolean, str, str, str)
    """
    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)

    # @added 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Set a default log_context, just in case it is not set if something is
    # added in the future
    log_context = 'unknown'

    if context == 'training_data':
        log_context = 'training data'
    if context == 'features_profiles':
        log_context = 'features profile data'
    if context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'
    # @added 20190314 - Feature #2484: FULL_DURATION feature profiles
    # Here we add the bifurcation to also create a features profile at
    # FULL_DURATION for all Mirage metrics, with a view to increasing the
    # number of matches trained metrics achieve by also allowing for the
    # creation and comparison of the FULL_DURATION features profiles as well.
    # How, I am not certain, but it needs to tie up with this Feature in:
    # skyline/ionosphere/ionosphere.py
    # skyline/webapp/webapp.py
    if context == 'ionosphere_echo':
        log_context = 'ionosphere :: echo'
    if context == 'ionosphere_echo_check':
        log_context = 'ionosphere :: echo check'

    current_logger.info('%s feature profile creation requested for %s at %s' % (
        log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)

        # @added 20200813 - Feature #3670: IONOSPHERE_CUSTOM_KEEP_TRAINING_TIMESERIES_FOR
        if context == 'training_data':
            metric_data_dir_does_not_exist = False
            if not os.path.exists(metric_data_dir):
                metric_data_dir_does_not_exist = True
            if IONOSPHERE_CUSTOM_KEEP_TRAINING_TIMESERIES_FOR and metric_data_dir_does_not_exist:
                try:
                    historical_data, metric_data_dir = historical_data_dir_exists(
                        current_skyline_app, metric_data_dir)
                    if historical_data:
                        current_logger.info(
                            'create_features_profile :: using historical training data - %s' % metric_data_dir)
                except:
                    trace = traceback.format_exc()
                    current_logger.error(trace)
                    fail_msg = 'error :: create_features_profile :: failed to determine whether this is historical training data'
                    current_logger.error('%s' % fail_msg)
                    if context == 'training_data':
                        # Raise to webapp, to provide a traceback to the user
                        # in the UI
                        raise
                    else:
                        return False, False, False, fail_msg, trace

    if context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_PROFILES_FOLDER,
                                        timeseries_dir, timestamp)
    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_LEARN_FOLDER,
                                        timestamp, timeseries_dir)
    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    # Added ionosphere_echo and ionosphere_echo_check
    if context == 'ionosphere_echo' or context == 'ionosphere_echo_check':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)
    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo_check':
        features_profile_created_file = '%s/%s.%s.echo.fp.created.txt' % (
            metric_data_dir, str(timestamp), base_name)
        features_profile_details_file = '%s/%s.%s.echo.fp.details.txt' % (
            metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # Added metric_check_file, and ts_full_duration needs to be determined and
    # added to the features_profile_details_file, as it was not added here on
    # 20170104 when it was added to the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
            for i, line in enumerate(anomaly_details):
                if 'full_duration' in line:
                    _ts_full_duration = '%s' % str(line).split("'", 2)
                    full_duration_array = literal_eval(_ts_full_duration)
                    ts_full_duration = str(int(full_duration_array[1]))

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)
    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo' or context == 'ionosphere_echo_check':
        ts_full_duration = str(settings.FULL_DURATION)
        full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
        anomaly_json = '%s/%s.mirage.redis.%sh.json' % (
            metric_data_dir, base_name, str(full_duration_in_hours))

    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo_check':
        ts_csv = '%s/%s.echo.tsfresh.input.csv' % (metric_data_dir, base_name)

    # anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
    # ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'
    # This is simply to stay in line with tsfresh naming conventions in their
    # docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features profile details file exists - %s' % (
            log_context, features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: %s :: failed to read from %s' % (
                log_context, features_profile_details_file))
    else:
        current_logger.info(
            '%s - OK no features profile details file exists - %s' % (
                log_context, features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        current_logger.info('%s :: features profile created file exists - %s' % (
            log_context, features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: %s :: failed to read fp_id from %s' % (
                log_context, features_profile_created_file))
    else:
        current_logger.info(
            '%s :: OK no features profile created file exists - %s' % (
                log_context, features_profile_created_file))

    if os.path.isfile(t_fname_out):
        current_logger.info('%s :: transposed features already exist - %s' % (
            log_context, t_fname_out))
        return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()

    raw_timeseries = []
    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            fail_msg = 'error: %s :: failed to read timeseries data from %s' % (
                log_context, anomaly_json)
            current_logger.error(fail_msg)
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        # Added log_context to report the context
        fail_msg = 'error: %s :: failed to create a pandas DataFrame with %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # @added 20161207 - Task #1658: Patterning Skyline Ionosphere
    # Converting the DataFrame types to suit MySQL data types
    # For anyone doing a code review of Skyline, a number of questions arise
    # from the decision to deviate from json, or from storing msgpack as BLOB,
    # etc. tsfresh uses csv, and csv can be obtained from Graphite etc., so
    # Skyline should be able to handle csv. As for how data is stored in
    # MySQL, this was given considerable review and thought, given that
    # Ionosphere, and Skyline in general, should not be limited to the domain
    # of analyzing Graphite machine metrics but should handle other timeseries
    # data sources too.
    # df['feature_name'] = df['feature_name'].astype(string)
    # df['value'] = df['value'].astype(float)

    # Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error: %s :: failed to read the pandas DataFrame created with %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: starting extract_features with %s' % (
        log_context, str(TSFRESH_VERSION)))
    df_features = False
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that have little
        # to no variation in the values, which results in features taking up
        # to almost 600 seconds to calculate on a timeseries of length 10075
        # (168h - 1 datapoint per 60s)
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(
        #     df, column_id='metric', column_sort='timestamp', column_kind=None,
        #     column_value=None)
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
        # tsf_settings = ReasonableFeatureExtractionSettings()
        # >>> from tsfresh.feature_extraction import extract_features, EfficientFCParameters
        # >>> extract_features(df, default_fc_parameters=EfficientFCParameters())
        # Disable tqdm progress bar
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
        # tsf_settings.disable_progressbar = True
        df_features = extract_features(
            # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
            # df, column_id='metric', column_sort='timestamp', column_kind=None,
            # column_value=None, feature_extraction_settings=tsf_settings)
            df, default_fc_parameters=EfficientFCParameters(),
            column_id='metric', column_sort='timestamp', column_kind=None,
            column_value=None, disable_progressbar=True)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features extracted from %s data' % (
            log_context, ts_csv))
    except:
        # traceback.print_exc() returns None; use format_exc() so the trace
        # is actually captured, logged and returned
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error: %s :: extracting features with tsfresh from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info(
            '%s :: feature extraction failed in %.6f seconds' % (
                log_context, (end_feature_extraction - start_feature_extraction)))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: feature extraction took %.6f seconds' % (
        log_context, feature_extraction_time))

    del df

    # write to disk
    # NOTE: the untransposed features csv is not actually written; only the
    # transposed csv below is saved
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    df_t = False
    try:
        df_t = df_features.transpose()
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features transposed' % log_context)
    except:
        # traceback.print_exc() returns None; use format_exc()
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: transposing tsfresh features from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    del df_features

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        # traceback.print_exc() returns None; use format_exc()
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: saving transposed tsfresh features from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    del df_t

    # Calculate the count and sum of the feature values
    df_sum = False
    try:
        df_sum = pd.read_csv(
            t_fname_out, delimiter=',', header=0,
            names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to create DataFrame to sum' % log_context)

    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to count number of features, set to 0' % log_context)
        features_count = 0

    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to sum feature values, set to 0' % log_context)
        features_sum = 0

    end = timer()

    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: features saved to %s' % (log_context, fname_out))
    current_logger.info('%s :: transposed features saved to %s' % (
        log_context, t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('%s :: total feature profile completed in %s seconds' % (
        log_context, str(total_calc_time)))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on 20170104
        # when it was added to the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(
            current_skyline_app, features_profile_details_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: failed to write %s' % (
            log_context, features_profile_details_file)
        current_logger.error('%s' % fail_msg)

    del df_sum

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: removed the created csv - %s' % (
            log_context, ts_csv))

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis
    # ionosphere.learn.work namespace
    # Ionosphere learn needs Redis work sets, but this was moved to
    # ionosphere_backend.py and learn.py, so it is not done here
    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(
        calc_time)
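# A minimal, self-contained sketch (not Skyline's actual entry point) of the
# extraction pattern the function above wraps: read a metric,timestamp,value
# csv, extract features with tsfresh EfficientFCParameters and write the
# transposed feature_name,value csv. The function name and paths here are
# illustrative assumptions.
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters


def sketch_extract_transposed_features(ts_csv, t_fname_out):
    df = pd.read_csv(ts_csv, delimiter=',', header=None,
                     names=['metric', 'timestamp', 'value'])
    df_features = extract_features(
        df, default_fc_parameters=EfficientFCParameters(),
        column_id='metric', column_sort='timestamp',
        disable_progressbar=True)
    # extract_features returns one row per id; transpose to one row per feature
    df_features.transpose().to_csv(t_fname_out)
    return t_fname_out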
"ch57": chanels_data[57, :], "ch58": chanels_data[58, :], "ch59": chanels_data[59, :], "ch60": chanels_data[60, :], "ch61": chanels_data[61, :], "ch62": chanels_data[62, :], "ch63": chanels_data[63, :] } #d = {'ID': pd.Series(ids), 'time': pd.Series(time),'x': pd.Series(full_data_matrix[i])} df = pd.DataFrame(d) extracted_features[i, :] = extract_features( df, column_id="id", column_sort="time", default_fc_parameters=EfficientFCParameters()) ################################################Quick test #Normalize data full_normalized_array = preprocessing.scale(extracted_features) #normalize ################PCA AND VARIANCE EXPLAINED pca = PCA(svd_solver='auto') #PCA with all components pca.fit(full_normalized_array) pca_cumsum = np.cumsum(pca.explained_variance_ratio_) * 100 plt.figure() plt.plot(pca_cumsum) plt.grid() plt.ylabel('% Variance Explained') plt.xlabel('# of Features')
def compute(self, data, features=None):
    # Avoid instantiating EfficientFCParameters as a default argument; build
    # the default per call instead
    if features is None:
        features = EfficientFCParameters()
    self.features = features
    feature_values = generate_tsfresh_features(data, features)
    return feature_values.reshape(1, feature_values.size)
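# ``generate_tsfresh_features`` is not defined in this snippet; the following
# is a minimal sketch of one plausible implementation, assuming ``data`` is a
# 1-D array-like holding a single time series.
import numpy as np
import pandas as pd
from tsfresh import extract_features


def generate_tsfresh_features(data, features):
    values = np.asarray(data, dtype=float)
    df = pd.DataFrame({'id': np.zeros(len(values), dtype=int),
                       'time': np.arange(len(values)),
                       'value': values})
    extracted = extract_features(df, column_id='id', column_sort='time',
                                 default_fc_parameters=features,
                                 disable_progressbar=True)
    # One row (the single id) by n_features columns -> flat feature vector
    return extracted.values.flatten()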
def gen_global_feature(self, settings="comprehensive", full_settings=None, n_jobs=1):
    '''
    Generate per-time-series features for each time series.

    This method is implemented with tsfresh. Make sure that the specified
    column names do not contain '__'.

    TODO: the relationship with scale should be figured out.

    :param settings: str or dict. If a string is set, it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set, it
           should follow the instructions for default_fc_parameters in
           tsfresh. Defaults to "comprehensive".
    :param full_settings: dict. It should follow the instructions for
           kind_to_fc_parameters in tsfresh. Defaults to None.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                      "minimal": MinimalFCParameters(),
                      "efficient": EfficientFCParameters()}

    assert not self._has_generate_agg_feature, \
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if full_settings is not None:
        self.df, additional_feature = \
            generate_global_features(input_df=self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     kind_to_fc_parameters=full_settings,
                                     n_jobs=n_jobs)
        self.feature_col += additional_feature
        return self

    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', 'efficient'" \
            f", but found {settings}."
        default_fc_parameters = DEFAULT_PARAMS[settings]
    else:
        default_fc_parameters = settings

    self.df, additional_feature = \
        generate_global_features(input_df=self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 default_fc_parameters=default_fc_parameters,
                                 n_jobs=n_jobs)

    self.feature_col += additional_feature
    self._has_generate_agg_feature = True
    return self
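# Usage sketch for gen_global_feature: ``tsdata`` is a hypothetical instance
# of the dataset class above. A tsfresh-style default_fc_parameters dict can
# be passed instead of a settings string; the calculators chosen here are
# illustrative.
custom_settings = {
    "mean": None,
    "maximum": None,
    "autocorrelation": [{"lag": 1}, {"lag": 2}],
}
tsdata.gen_global_feature(settings=custom_settings, n_jobs=1)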
nobs = np.size(full_data_matrix, 0)
ntime = np.size(full_data_matrix, 1)
# 788 is the number of features the installed tsfresh version extracts per
# series with EfficientFCParameters (this count varies across versions)
extracted_features = np.zeros((nobs, 788))
time = np.arange(ntime)
for i in range(nobs):
    print(i)
    ids = np.repeat(i, ntime)
    d = {'ID': pd.Series(ids), 'time': pd.Series(time),
         'x': pd.Series(full_data_matrix[i])}
    df = pd.DataFrame(d)
    extracted_features[i, :] = extract_features(
        df, column_id="ID", column_sort="time",
        default_fc_parameters=EfficientFCParameters())

################################################ Quick test
# Normalize data
full_normalized_array = preprocessing.scale(extracted_features)  # normalize

################ PCA AND VARIANCE EXPLAINED
pca = PCA(svd_solver='auto')  # PCA with all components
pca.fit(full_normalized_array)
pca_cumsum = np.cumsum(pca.explained_variance_ratio_) * 100
plt.figure()
plt.plot(pca_cumsum)
plt.grid()
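# An equivalent shortcut to reading the cumulative-variance curve above:
# scikit-learn's PCA accepts a variance fraction as n_components when
# svd_solver='full'. The 0.95 target is an illustrative assumption.
pca_95 = PCA(n_components=0.95, svd_solver='full')
reduced = pca_95.fit_transform(full_normalized_array)
print(reduced.shape)  # (nobs, k) with k components explaining >= 95% variance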