import statistics
from itertools import product

import librosa
import numpy as np
import pandas as pd
import scipy.signal as sg
from scipy import signal, stats
from scipy.signal import hilbert, convolve
from scipy.signal.windows import hann
from tsfresh.feature_extraction import feature_calculators


def compute_standard_features_block(xc, seg_id, X, fs, prefix=''):
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    # Borders for the range_count features; defined here because the original
    # referenced a `borders` list only defined in the class-based extractors below.
    borders = list(range(-4000, 4001, 1000))

    # Generic stats
    X.loc[seg_id, prefix + 'mean'] = xc.mean()
    X.loc[seg_id, prefix + 'std'] = xc.std()
    X.loc[seg_id, prefix + 'max'] = xc.max()
    X.loc[seg_id, prefix + 'min'] = xc.min()
    X.loc[seg_id, prefix + 'hmean'] = stats.hmean(np.abs(xc[np.nonzero(xc)[0]]))
    X.loc[seg_id, prefix + 'gmean'] = stats.gmean(np.abs(xc[np.nonzero(xc)[0]]))
    X.loc[seg_id, prefix + 'mad'] = xc.mad()  # Series.mad was removed in pandas 2.0; use (xc - xc.mean()).abs().mean() there
    X.loc[seg_id, prefix + 'kurt'] = xc.kurtosis()
    X.loc[seg_id, prefix + 'skew'] = xc.skew()
    X.loc[seg_id, prefix + 'med'] = xc.median()

    for p in percentiles:
        X.loc[seg_id, prefix + f'percentile_{p}'] = np.percentile(xc, p)
        X.loc[seg_id, prefix + f'abs_percentile_{p}'] = np.percentile(np.abs(xc), p)

    X.loc[seg_id, prefix + 'num_crossing_0'] = feature_calculators.number_crossing_m(xc, 0)
    for p in [95, 99]:
        X.loc[seg_id, prefix + f'binned_entropy_{p}'] = feature_calculators.binned_entropy(xc, p)

    # Andrew stats
    X.loc[seg_id, prefix + 'mean_diff'] = np.mean(np.diff(xc))
    X.loc[seg_id, prefix + 'mean_abs_diff'] = np.mean(np.abs(np.diff(xc)))
    X.loc[seg_id, prefix + 'mean_change_rate'] = change_rate(xc, method='original')
    X.loc[seg_id, prefix + 'mean_change_rate_v2'] = change_rate(xc, method='modified')
    X.loc[seg_id, prefix + 'abs_max'] = np.abs(xc).max()
    X.loc[seg_id, prefix + 'abs_min'] = np.abs(xc).min()
    X.loc[seg_id, prefix + 'mean_change_abs'] = np.mean(np.diff(xc))  # same value as mean_diff; name kept for compatibility

    # Classical stats by segment
    for agg_type, slice_length, direction in product(['std', 'min', 'max', 'mean'],
                                                     [1000, 10000, 50000],
                                                     ['first', 'last']):
        sl = xc[:slice_length] if direction == 'first' else xc[-slice_length:]
        X.loc[seg_id, prefix + f'{agg_type}_{direction}_{slice_length}'] = sl.agg(agg_type)

    X.loc[seg_id, prefix + 'avg_first_50000'] = xc[:50000].mean()
    X.loc[seg_id, prefix + 'avg_last_50000'] = xc[-50000:].mean()
    X.loc[seg_id, prefix + 'avg_first_10000'] = xc[:10000].mean()
    X.loc[seg_id, prefix + 'avg_last_10000'] = xc[-10000:].mean()

    # k-statistics and moments
    for i in range(1, 5):
        X.loc[seg_id, prefix + f'kstat_{i}'] = stats.kstat(xc, i)
        X.loc[seg_id, prefix + f'moment_{i}'] = stats.moment(xc, i)
    for i in [1, 2]:
        X.loc[seg_id, prefix + f'kstatvar_{i}'] = stats.kstatvar(xc, i)

    X.loc[seg_id, prefix + 'range_minf_m4000'] = feature_calculators.range_count(xc, -np.inf, -4000)
    X.loc[seg_id, prefix + 'range_p4000_pinf'] = feature_calculators.range_count(xc, 4000, np.inf)
    for i, j in zip(borders, borders[1:]):
        X.loc[seg_id, prefix + f'range_{i}_{j}'] = feature_calculators.range_count(xc, i, j)
    X.loc[seg_id, prefix + 'ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(xc)

    X.loc[seg_id, prefix + 'max_to_min'] = xc.max() / np.abs(xc.min())
    X.loc[seg_id, prefix + 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
    X.loc[seg_id, prefix + 'count_big'] = len(xc[np.abs(xc) > 500])
    X.loc[seg_id, prefix + 'sum'] = xc.sum()

    # change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000], ['first', 'last']):
        sl = xc[:slice_length] if direction == 'first' else xc[-slice_length:]
        X.loc[seg_id, prefix + f'mean_change_rate_{direction}_{slice_length}'] = change_rate(sl, method='original')
        X.loc[seg_id, prefix + f'mean_change_rate_{direction}_{slice_length}_v2'] = change_rate(sl, method='modified')

    X.loc[seg_id, prefix + 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, prefix + 'q99'] = np.quantile(xc, 0.99)
    X.loc[seg_id, prefix + 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, prefix + 'q01'] = np.quantile(xc, 0.01)
    X.loc[seg_id, prefix + 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X.loc[seg_id, prefix + 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X.loc[seg_id, prefix + 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X.loc[seg_id, prefix + 'abs_q01'] = np.quantile(np.abs(xc), 0.01)

    X.loc[seg_id, prefix + 'trend'] = add_trend_feature(xc)
    X.loc[seg_id, prefix + 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X.loc[seg_id, prefix + 'abs_mean'] = np.abs(xc).mean()
    X.loc[seg_id, prefix + 'abs_std'] = np.abs(xc).std()

    X.loc[seg_id, prefix + 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X.loc[seg_id, prefix + 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
    for hw in [50, 150, 1500, 15000]:
        X.loc[seg_id, prefix + f'Hann_window_mean_{hw}'] = (convolve(xc, hann(hw), mode='same') / sum(hann(hw))).mean()

    # STA/LTA ratios over a grid of (short, long) window pairs, for both methods
    sta_lta_windows = [(500, 10000), (5000, 100000), (3333, 6666), (10000, 25000),
                       (50, 1000), (100, 5000), (333, 666), (4000, 10000)]
    for method, name in [('original', 'classic'), ('modified', 'modified')]:
        for k, (sta_len, lta_len) in enumerate(sta_lta_windows, start=1):
            ratio = sta_lta_ratio(xc, sta_len, lta_len, method=method)
            X.loc[seg_id, prefix + f'{name}_sta_lta{k}_mean'] = ratio.mean()
            X.loc[seg_id, prefix + f'{name}_sta_lta{k}_q95'] = np.quantile(ratio, 0.95)
            X.loc[seg_id, prefix + f'{name}_sta_lta{k}_q05'] = np.quantile(ratio, 0.05)

    for w in [700, 1500, 3000, 6000, 30000]:
        X.loc[seg_id, prefix + f'Moving_average_{w}_mean'] = xc.rolling(window=w).mean().mean(skipna=True)

    # Exponential moving statistics.  Two fixes versus the original: the
    # exp_Moving_std_*_mean features were computed from .mean() instead of
    # .std(), and skipna belongs on the final Series reduction (EWM.mean/std
    # take no such argument).
    ewma = pd.Series.ewm
    for s in [300, 3000, 6000, 30000, 50000]:
        X.loc[seg_id, prefix + f'exp_Moving_average_{s}_mean'] = ewma(xc, span=s).mean().mean(skipna=True)
        X.loc[seg_id, prefix + f'exp_Moving_average_{s}_std'] = ewma(xc, span=s).mean().std(skipna=True)
        X.loc[seg_id, prefix + f'exp_Moving_std_{s}_mean'] = ewma(xc, span=s).std().mean(skipna=True)
        X.loc[seg_id, prefix + f'exp_Moving_std_{s}_std'] = ewma(xc, span=s).std().std(skipna=True)

    # Bollinger-style bands; as in the original, the 400-window bands are
    # centred on the 700-window moving average.  (The redundant .mean() on
    # these scalar expressions has been dropped.)
    no_of_std = 2
    X.loc[seg_id, prefix + 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X.loc[seg_id, prefix + 'MA_700MA_BB_high_mean'] = (X.loc[seg_id, prefix + 'Moving_average_700_mean']
                                                       + no_of_std * X.loc[seg_id, prefix + 'MA_700MA_std_mean'])
    X.loc[seg_id, prefix + 'MA_700MA_BB_low_mean'] = (X.loc[seg_id, prefix + 'Moving_average_700_mean']
                                                      - no_of_std * X.loc[seg_id, prefix + 'MA_700MA_std_mean'])
    X.loc[seg_id, prefix + 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    X.loc[seg_id, prefix + 'MA_400MA_BB_high_mean'] = (X.loc[seg_id, prefix + 'Moving_average_700_mean']
                                                       + no_of_std * X.loc[seg_id, prefix + 'MA_400MA_std_mean'])
    X.loc[seg_id, prefix + 'MA_400MA_BB_low_mean'] = (X.loc[seg_id, prefix + 'Moving_average_700_mean']
                                                      - no_of_std * X.loc[seg_id, prefix + 'MA_400MA_std_mean'])
    X.loc[seg_id, prefix + 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()

    X.loc[seg_id, prefix + 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, prefix + 'iqr1'] = np.subtract(*np.percentile(xc, [95, 5]))
    X.loc[seg_id, prefix + 'q999'] = np.quantile(xc, 0.999)
    X.loc[seg_id, prefix + 'q001'] = np.quantile(xc, 0.001)
    X.loc[seg_id, prefix + 'ave10'] = stats.trim_mean(xc, 0.1)

    X.loc[seg_id, prefix + 'freq_cross_first_50000'] = freq_from_crossings(xc.values[:50000], fs)
    X.loc[seg_id, prefix + 'freq_cross_last_50000'] = freq_from_crossings(xc.values[-50000:], fs)
    X.loc[seg_id, prefix + 'freq_cross_first_10000'] = freq_from_crossings(xc.values[:10000], fs)
    X.loc[seg_id, prefix + 'freq_cross_last_10000'] = freq_from_crossings(xc.values[-10000:], fs)

    for peak in [10, 20, 50, 100]:
        X.loc[seg_id, prefix + f'num_peaks_{peak}'] = feature_calculators.number_peaks(xc, peak)

    for c in [1, 5, 10, 50, 100]:
        X.loc[seg_id, prefix + f'spkt_welch_density_{c}'] = list(feature_calculators.spkt_welch_density(xc, [{'coeff': c}]))[0][1]
        X.loc[seg_id, prefix + f'time_rev_asym_stat_{c}'] = feature_calculators.time_reversal_asymmetry_statistic(xc, c)

    for autocorr_lag in [5, 10, 50, 100, 500, 1000, 5000, 10000]:
        X.loc[seg_id, prefix + f'autocorrelation_{autocorr_lag}'] = feature_calculators.autocorrelation(xc, autocorr_lag)
        X.loc[seg_id, prefix + f'c3_{autocorr_lag}'] = feature_calculators.c3(xc, autocorr_lag)

    # Statistics on rolling windows of various sizes.  (The original named the
    # roll_std v2 feature without its underscore; normalised to _v2 here.)
    for window in [10, 50, 100, 500, 1000, 10000]:
        x_roll_std = xc.rolling(window).std().dropna().values
        x_roll_mean = xc.rolling(window).mean().dropna().values

        for p in percentiles:
            X.loc[seg_id, prefix + f'percentile_roll_std_{p}_window_{window}'] = np.percentile(x_roll_std, p)
            X.loc[seg_id, prefix + f'percentile_roll_mean_{p}_window_{window}'] = np.percentile(x_roll_mean, p)

        for tag, roll in [('std', x_roll_std), ('mean', x_roll_mean)]:
            X.loc[seg_id, prefix + f'ave_roll_{tag}_{window}'] = roll.mean()
            X.loc[seg_id, prefix + f'std_roll_{tag}_{window}'] = roll.std()
            X.loc[seg_id, prefix + f'max_roll_{tag}_{window}'] = roll.max()
            X.loc[seg_id, prefix + f'min_roll_{tag}_{window}'] = roll.min()
            X.loc[seg_id, prefix + f'q01_roll_{tag}_{window}'] = np.quantile(roll, 0.01)
            X.loc[seg_id, prefix + f'q05_roll_{tag}_{window}'] = np.quantile(roll, 0.05)
            X.loc[seg_id, prefix + f'q95_roll_{tag}_{window}'] = np.quantile(roll, 0.95)
            X.loc[seg_id, prefix + f'q99_roll_{tag}_{window}'] = np.quantile(roll, 0.99)
            X.loc[seg_id, prefix + f'av_change_abs_roll_{tag}_{window}'] = np.mean(np.abs(np.diff(roll)))
            X.loc[seg_id, prefix + f'av_change_rate_roll_{tag}_{window}'] = change_rate(pd.Series(roll), method='original')
            X.loc[seg_id, prefix + f'av_change_rate_roll_{tag}_{window}_v2'] = change_rate(pd.Series(roll), method='modified')
            X.loc[seg_id, prefix + f'abs_max_roll_{tag}_{window}'] = np.abs(roll).max()

    # Un-windowed percentile names alias the 10000-sample window
    for p in percentiles:
        X.loc[seg_id, prefix + f'percentile_roll_std_{p}'] = X.loc[seg_id, prefix + f'percentile_roll_std_{p}_window_10000']
        X.loc[seg_id, prefix + f'percentile_roll_mean_{p}'] = X.loc[seg_id, prefix + f'percentile_roll_mean_{p}_window_10000']
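# ---------------------------------------------------------------------------
# The block above leans on four helpers (change_rate, add_trend_feature,
# sta_lta_ratio, freq_from_crossings) that are not defined in this file.
# The sketches below are plausible reconstructions inferred from the call
# sites and from the common public versions of these kernels; treat them as
# hedged assumptions rather than the authors' exact code.
# ---------------------------------------------------------------------------
from sklearn.linear_model import LinearRegression


def change_rate(x, method='original'):
    # Mean relative step size diff(x)/x[:-1]; the 'modified' variant is
    # assumed to additionally drop exact-zero steps before averaging.
    arr = np.asarray(x, dtype=np.float64)
    rate = np.diff(arr) / arr[:-1]
    rate = rate[~np.isnan(rate) & ~np.isinf(rate)]
    if method == 'modified':
        rate = rate[rate != 0]
    return np.mean(rate)


def add_trend_feature(arr, abs_values=False):
    # Slope of a least-squares line fitted to the (optionally rectified) signal.
    arr = np.abs(arr) if abs_values else np.asarray(arr)
    idx = np.arange(len(arr)).reshape(-1, 1)
    lr = LinearRegression()
    lr.fit(idx, arr)
    return lr.coef_[0]


def sta_lta_ratio(x, length_sta, length_lta, method='original'):
    # Cumulative-sum STA/LTA on the squared signal ('original'); the
    # 'modified' flavour is assumed to average absolute amplitudes instead.
    x = np.asarray(x, dtype=np.float64)
    energy = x ** 2 if method == 'original' else np.abs(x)
    sta = np.cumsum(energy)
    lta = sta.copy()
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta
    sta[:length_lta - 1] = 0  # undefined until the long window is full
    tiny = np.finfo(float).tiny
    lta[lta < tiny] = tiny  # avoid division by zero
    return sta / lta


def freq_from_crossings(sig, fs):
    # Crude fundamental-frequency estimate: sampling rate over the mean
    # spacing between successive upward zero crossings.
    sig = np.asarray(sig, dtype=np.float64)
    crossings = np.nonzero((sig[1:] >= 0) & (sig[:-1] < 0))[0]
    if len(crossings) < 2:
        return np.nan
    return fs / np.mean(np.diff(crossings))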
def feature_extract(X_train, i, X_element, y_train=None, y_element=None, is_TrainDataSet=True):
    # MAX_FREQ_IDX and FREQ_STEP are module-level constants describing the
    # FFT band layout.
    if is_TrainDataSet:
        y_train.loc[i, 'time_to_failure'] = y_element

    X_element = X_element.reshape(-1)
    xcdm = X_element - np.mean(X_element)

    b, a = des_bw_filter_lp(cutoff=18000)
    xcz = sg.lfilter(b, a, xcdm)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ_IDX]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ_IDX, FREQ_STEP))
    magFFT = np.sqrt(realFFT ** 2 + imagFFT ** 2)
    phzFFT = np.arctan(imagFFT / realFFT)
    # arctan already maps +/-inf ratios to +/-pi/2; these guards only matter
    # for the 0/0 -> NaN case handled by nan_to_num.
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    for freq in freq_bands:
        mag_band = magFFT[freq:freq + FREQ_STEP]
        phz_band = phzFFT[freq:freq + FREQ_STEP]
        X_train.loc[i, 'FFT_Mag_01q%d' % freq] = np.quantile(mag_band, 0.01)
        X_train.loc[i, 'FFT_Mag_10q%d' % freq] = np.quantile(mag_band, 0.1)
        X_train.loc[i, 'FFT_Mag_90q%d' % freq] = np.quantile(mag_band, 0.9)
        X_train.loc[i, 'FFT_Mag_99q%d' % freq] = np.quantile(mag_band, 0.99)
        X_train.loc[i, 'FFT_Mag_mean%d' % freq] = np.mean(mag_band)
        X_train.loc[i, 'FFT_Mag_std%d' % freq] = np.std(mag_band)
        X_train.loc[i, 'FFT_Mag_max%d' % freq] = np.max(mag_band)
        X_train.loc[i, 'FFT_Phz_mean%d' % freq] = np.mean(phz_band)
        X_train.loc[i, 'FFT_Phz_std%d' % freq] = np.std(phz_band)

    X_train.loc[i, 'FFT_Rmean'] = realFFT.mean()
    X_train.loc[i, 'FFT_Rstd'] = realFFT.std()
    X_train.loc[i, 'FFT_Rmax'] = realFFT.max()
    X_train.loc[i, 'FFT_Rmin'] = realFFT.min()
    X_train.loc[i, 'FFT_Imean'] = imagFFT.mean()
    X_train.loc[i, 'FFT_Istd'] = imagFFT.std()
    X_train.loc[i, 'FFT_Imax'] = imagFFT.max()
    X_train.loc[i, 'FFT_Imin'] = imagFFT.min()

    X_train.loc[i, 'FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    X_train.loc[i, 'FFT_Rstd_first_6000'] = realFFT[:6000].std()  # double underscore typo fixed
    X_train.loc[i, 'FFT_Rmax_first_6000'] = realFFT[:6000].max()
    X_train.loc[i, 'FFT_Rmin_first_6000'] = realFFT[:6000].min()
    X_train.loc[i, 'FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    X_train.loc[i, 'FFT_Rstd_first_18000'] = realFFT[:18000].std()
    X_train.loc[i, 'FFT_Rmax_first_18000'] = realFFT[:18000].max()
    X_train.loc[i, 'FFT_Rmin_first_18000'] = realFFT[:18000].min()

    peaks = [10, 20, 50, 100]
    for peak in peaks:
        X_train.loc[i, 'num_peaks_{}'.format(peak)] = feature_calculators.number_peaks(X_element, peak)

    autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]
    for autocorr_lag in autocorr_lags:
        X_train.loc[i, 'autocorrelation_{}'.format(autocorr_lag)] = feature_calculators.autocorrelation(X_element, autocorr_lag)
        X_train.loc[i, 'c3_{}'.format(autocorr_lag)] = feature_calculators.c3(X_element, autocorr_lag)

    X_train.loc[i, 'ave'] = X_element.mean()
    X_train.loc[i, 'std'] = X_element.std()
    X_train.loc[i, 'max'] = X_element.max()
    X_train.loc[i, 'min'] = X_element.min()

    # geometric and harmonic means
    X_train.loc[i, 'hmean'] = stats.hmean(np.abs(X_element[np.nonzero(X_element)[0]]))
    X_train.loc[i, 'gmean'] = stats.gmean(np.abs(X_element[np.nonzero(X_element)[0]]))

    # nth k-statistic and nth moment
    for ii in range(1, 5):
        X_train.loc[i, 'kstat_{}'.format(ii)] = stats.kstat(X_element, ii)
        X_train.loc[i, 'moment_{}'.format(ii)] = stats.moment(X_element, ii)
    for ii in [1, 2]:
        # the original had the .format() call inside the string literal
        X_train.loc[i, 'kstatvar_{}'.format(ii)] = stats.kstatvar(X_element, ii)

    X_train.loc[i, 'max_to_min'] = X_element.max() / np.abs(X_element.min())
    X_train.loc[i, 'max_to_min_diff'] = X_element.max() - np.abs(X_element.min())
    X_train.loc[i, 'count_big'] = len(X_element[np.abs(X_element) > 500])
    X_train.loc[i, 'sum'] = X_element.sum()
    X_train.loc[i, 'av_change_abs'] = np.mean(np.diff(X_element))

    tmp = np.diff(X_element) / X_element[:-1]
    tmp = tmp[~np.isnan(tmp)]
    tmp = tmp[~np.isinf(tmp)]
    X_train.loc[i, 'av_change_rate'] = np.mean(tmp)

    X_train.loc[i, 'abs_max'] = np.abs(X_element).max()
    X_train.loc[i, 'abs_min'] = np.abs(X_element).min()

    for name, sl in [('first_50000', X_element[:50000]), ('last_50000', X_element[-50000:]),
                     ('first_10000', X_element[:10000]), ('last_10000', X_element[-10000:])]:
        X_train.loc[i, 'std_' + name] = sl.std()
        X_train.loc[i, 'avg_' + name] = sl.mean()
        X_train.loc[i, 'min_' + name] = sl.min()
        X_train.loc[i, 'max_' + name] = sl.max()

    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    for p in percentiles:
        X_train.loc[i, 'percentile_{}'.format(p)] = np.percentile(X_element, p)
        X_train.loc[i, 'abs_percentile_{}'.format(p)] = np.percentile(np.abs(X_element), p)

    windows = [10, 50, 100, 500, 1000, 10000]
    X_element_df = pd.DataFrame(X_element)
    for w in windows:
        x_roll_std = X_element_df.rolling(w).std().dropna().values.reshape(-1)
        x_roll_mean = X_element_df.rolling(w).mean().dropna().values.reshape(-1)

        X_train.loc[i, 'ave_roll_std_{}'.format(w)] = x_roll_std.mean()
        X_train.loc[i, 'std_roll_std_{}'.format(w)] = x_roll_std.std()
        X_train.loc[i, 'max_roll_std_{}'.format(w)] = x_roll_std.max()
        X_train.loc[i, 'min_roll_std_{}'.format(w)] = x_roll_std.min()
        for p in percentiles:
            X_train.loc[i, 'percentile_roll_std_{}_window_{}'.format(p, w)] = np.percentile(x_roll_std, p)
        X_train.loc[i, 'av_change_abs_roll_std_{}'.format(w)] = np.mean(np.diff(x_roll_std))
        tmp = np.diff(x_roll_std) / x_roll_std[:-1]
        tmp = tmp[~np.isnan(tmp)]
        tmp = tmp[~np.isinf(tmp)]
        X_train.loc[i, 'av_change_rate_roll_std_{}'.format(w)] = np.mean(tmp)
        X_train.loc[i, 'abs_max_roll_std_{}'.format(w)] = np.abs(x_roll_std).max()

        X_train.loc[i, 'ave_roll_mean_{}'.format(w)] = x_roll_mean.mean()
        X_train.loc[i, 'std_roll_mean_{}'.format(w)] = x_roll_mean.std()
        X_train.loc[i, 'max_roll_mean_{}'.format(w)] = x_roll_mean.max()
        X_train.loc[i, 'min_roll_mean_{}'.format(w)] = x_roll_mean.min()
        for p in percentiles:
            X_train.loc[i, 'percentile_roll_mean_{}_window_{}'.format(p, w)] = np.percentile(x_roll_mean, p)
        X_train.loc[i, 'av_change_abs_roll_mean_{}'.format(w)] = np.mean(np.diff(x_roll_mean))
        tmp = np.diff(x_roll_mean) / x_roll_mean[:-1]
        tmp = tmp[~np.isnan(tmp)]
        tmp = tmp[~np.isinf(tmp)]
        X_train.loc[i, 'av_change_rate_roll_mean_{}'.format(w)] = np.mean(tmp)
        X_train.loc[i, 'abs_max_roll_mean_{}'.format(w)] = np.abs(x_roll_mean).max()
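# des_bw_filter_lp is also undefined here.  A minimal sketch, assuming a
# 4th-order Butterworth low-pass; the NYQUIST constant (half of an assumed
# 4 MHz sampling rate) is an assumption, not from the original:
from scipy.signal import butter

NYQUIST = 4_000_000 / 2.0  # assumed sampling rate of 4 MHz; adjust to the data


def des_bw_filter_lp(cutoff=18000):
    # returns (b, a) suitable for sg.lfilter above
    b, a = butter(4, Wn=cutoff / NYQUIST, btype='lowpass')
    return b, a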
def features(self, x, y, seg_id):
    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id

    # create features here
    # lists with parameters to iterate over them
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    hann_windows = [50, 150, 1500, 15000]
    spans = [300, 3000, 30000, 50000]
    windows = [10, 50, 100, 500, 1000, 10000]
    borders = list(range(-4000, 4001, 1000))
    peaks = [10, 20, 50, 100]
    coefs = [1, 5, 10, 50, 100]
    lags = [10, 100, 1000, 10000]  # only used by the commented-out features below
    autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]

    # basic stats
    feature_dict['mean'] = x.mean()
    feature_dict['std'] = x.std()
    feature_dict['max'] = x.max()
    feature_dict['min'] = x.min()

    # basic stats on absolute values
    feature_dict['mean_change_abs'] = np.mean(np.diff(x))
    feature_dict['abs_max'] = np.abs(x).max()
    feature_dict['abs_mean'] = np.abs(x).mean()
    feature_dict['abs_std'] = np.abs(x).std()

    # geometric and harmonic means
    feature_dict['hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
    feature_dict['gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

    # k-statistic and moments
    for i in range(1, 5):
        feature_dict[f'kstat_{i}'] = stats.kstat(x, i)
        feature_dict[f'moment_{i}'] = stats.moment(x, i)
    for i in [1, 2]:
        feature_dict[f'kstatvar_{i}'] = stats.kstatvar(x, i)

    # aggregations on various slices of data
    for agg_type, slice_length, direction in product(['std', 'min', 'max', 'mean'],
                                                     [1000, 10000, 50000],
                                                     ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict[f'{agg_type}_{direction}_{slice_length}'] = sl.agg(agg_type)

    feature_dict['max_to_min'] = x.max() / np.abs(x.min())
    feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_dict['count_big'] = len(x[np.abs(x) > 500])
    feature_dict['sum'] = x.sum()

    feature_dict['mean_change_rate'] = calc_change_rate(x)
    # calc_change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000], ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict[f'mean_change_rate_{direction}_{slice_length}'] = calc_change_rate(sl)

    # percentiles on original and absolute values
    for p in percentiles:
        feature_dict[f'percentile_{p}'] = np.percentile(x, p)
        feature_dict[f'abs_percentile_{p}'] = np.percentile(np.abs(x), p)

    feature_dict['trend'] = add_trend_feature(x)
    feature_dict['abs_trend'] = add_trend_feature(x, abs_values=True)
    feature_dict['mad'] = x.mad()
    feature_dict['kurt'] = x.kurtosis()
    feature_dict['skew'] = x.skew()
    feature_dict['med'] = x.median()

    feature_dict['Hilbert_mean'] = np.abs(hilbert(x)).mean()
    for hw in hann_windows:
        feature_dict[f'Hann_window_mean_{hw}'] = (convolve(x, hann(hw), mode='same') / sum(hann(hw))).mean()

    feature_dict['classic_sta_lta1_mean'] = classic_sta_lta(x, 500, 10000).mean()
    feature_dict['classic_sta_lta2_mean'] = classic_sta_lta(x, 5000, 100000).mean()
    feature_dict['classic_sta_lta3_mean'] = classic_sta_lta(x, 3333, 6666).mean()
    feature_dict['classic_sta_lta4_mean'] = classic_sta_lta(x, 10000, 25000).mean()
    feature_dict['classic_sta_lta5_mean'] = classic_sta_lta(x, 50, 1000).mean()
    feature_dict['classic_sta_lta6_mean'] = classic_sta_lta(x, 100, 5000).mean()
    feature_dict['classic_sta_lta7_mean'] = classic_sta_lta(x, 333, 666).mean()
    feature_dict['classic_sta_lta8_mean'] = classic_sta_lta(x, 4000, 10000).mean()

    # exponential rolling statistics (skipna applies to the final reduction;
    # EWM.mean/std take no such argument)
    ewma = pd.Series.ewm
    for s in spans:
        feature_dict[f'exp_Moving_average_{s}_mean'] = ewma(x, span=s).mean().mean(skipna=True)
        feature_dict[f'exp_Moving_average_{s}_std'] = ewma(x, span=s).mean().std(skipna=True)
        feature_dict[f'exp_Moving_std_{s}_mean'] = ewma(x, span=s).std().mean(skipna=True)
        feature_dict[f'exp_Moving_std_{s}_std'] = ewma(x, span=s).std().std(skipna=True)

    feature_dict['iqr'] = np.subtract(*np.percentile(x, [75, 25]))
    feature_dict['iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
    feature_dict['ave10'] = stats.trim_mean(x, 0.1)

    for slice_length, threshold in product([50000, 100000, 150000], [5, 10, 20, 50, 100]):
        feature_dict[f'count_big_{slice_length}_threshold_{threshold}'] = (np.abs(x[-slice_length:]) > threshold).sum()
        feature_dict[f'count_big_{slice_length}_less_threshold_{threshold}'] = (np.abs(x[-slice_length:]) < threshold).sum()

    # tsfresh features take too long to calculate, so I comment them for now
    # feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
    # feature_dict['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    # feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
    # feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
    # feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
    # feature_dict['mean_change'] = feature_calculators.mean_change(x)
    # feature_dict['var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(x)

    feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    for i, j in zip(borders, borders[1:]):
        feature_dict[f'range_{i}_{j}'] = feature_calculators.range_count(x, i, j)

    # feature_dict['ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    # feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    # feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    # feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    # feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    # for lag in lags:
    #     feature_dict[f'time_rev_asym_stat_{lag}'] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)

    for autocorr_lag in autocorr_lags:
        feature_dict[f'autocorrelation_{autocorr_lag}'] = feature_calculators.autocorrelation(x, autocorr_lag)
        feature_dict[f'c3_{autocorr_lag}'] = feature_calculators.c3(x, autocorr_lag)

    # for coeff, attr in product([1, 2, 3, 4, 5], ['real', 'imag', 'angle']):
    #     feature_dict[f'fft_{coeff}_{attr}'] = list(feature_calculators.fft_coefficient(x, [{'coeff': coeff, 'attr': attr}]))[0][1]
    # feature_dict['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    # feature_dict['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    # feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    # feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

    for p in percentiles:
        feature_dict[f'binned_entropy_{p}'] = feature_calculators.binned_entropy(x, p)

    feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)

    for peak in peaks:
        feature_dict[f'num_peaks_{peak}'] = feature_calculators.number_peaks(x, peak)

    for c in coefs:
        feature_dict[f'spkt_welch_density_{c}'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
        feature_dict[f'time_rev_asym_stat_{c}'] = feature_calculators.time_reversal_asymmetry_statistic(x, c)

    # statistics on rolling windows of various sizes
    for w in windows:
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values

        feature_dict[f'ave_roll_std_{w}'] = x_roll_std.mean()
        feature_dict[f'std_roll_std_{w}'] = x_roll_std.std()
        feature_dict[f'max_roll_std_{w}'] = x_roll_std.max()
        feature_dict[f'min_roll_std_{w}'] = x_roll_std.min()
        for p in percentiles:
            feature_dict[f'percentile_roll_std_{p}_window_{w}'] = np.percentile(x_roll_std, p)
        feature_dict[f'av_change_abs_roll_std_{w}'] = np.mean(np.diff(x_roll_std))
        # NB: this is the mean of the *indices* of nonzero relative changes,
        # not of the changes themselves -- a quirk kept from the original kernels
        feature_dict[f'av_change_rate_roll_std_{w}'] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feature_dict[f'abs_max_roll_std_{w}'] = np.abs(x_roll_std).max()

        feature_dict[f'ave_roll_mean_{w}'] = x_roll_mean.mean()
        feature_dict[f'std_roll_mean_{w}'] = x_roll_mean.std()
        feature_dict[f'max_roll_mean_{w}'] = x_roll_mean.max()
        feature_dict[f'min_roll_mean_{w}'] = x_roll_mean.min()
        for p in percentiles:
            feature_dict[f'percentile_roll_mean_{p}_window_{w}'] = np.percentile(x_roll_mean, p)
        feature_dict[f'av_change_abs_roll_mean_{w}'] = np.mean(np.diff(x_roll_mean))
        feature_dict[f'av_change_rate_roll_mean_{w}'] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        feature_dict[f'abs_max_roll_mean_{w}'] = np.abs(x_roll_mean).max()

    return feature_dict
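# calc_change_rate, add_trend_feature and classic_sta_lta are module-level
# helpers this method expects; add_trend_feature is sketched earlier.
# Sketches for the other two, following the widely used public kernel
# versions (an assumption):

def calc_change_rate(x):
    # mean of the nonzero, finite relative changes of a pd.Series
    change = (np.diff(x) / x[:-1]).values
    change = change[np.nonzero(change)[0]]
    change = change[~np.isnan(change)]
    change = change[change != -np.inf]
    change = change[change != np.inf]
    return np.mean(change)


def classic_sta_lta(x, length_sta, length_lta):
    # same cumulative-sum construction as the sta_lta_ratio sketch above,
    # energy flavour only
    sta = np.cumsum(np.asarray(x, dtype=np.float64) ** 2)
    lta = sta.copy()
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta
    sta[:length_lta - 1] = 0
    tiny = np.finfo(float).tiny
    lta[lta < tiny] = tiny
    return sta / lta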
def features(self, x, y, seg_id):
    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id

    # lists with parameters to iterate over them
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    hann_windows = [50, 150, 1500, 15000]
    spans = [300, 3000, 30000, 50000]
    windows = [10, 50, 100, 500, 1000, 10000]
    borders = list(range(-4000, 4001, 1000))
    peaks = [10, 20, 50, 100]
    coefs = [1, 5, 10, 50, 100]
    autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]

    # basic stats
    feature_dict['mean'] = x.mean()
    feature_dict['std'] = x.std()
    feature_dict['max'] = x.max()
    feature_dict['min'] = x.min()

    # basic stats on absolute values
    feature_dict['mean_change_abs'] = np.mean(np.diff(x))
    feature_dict['abs_max'] = np.abs(x).max()
    feature_dict['abs_mean'] = np.abs(x).mean()
    feature_dict['abs_std'] = np.abs(x).std()

    # geometric and harmonic means
    feature_dict['hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
    feature_dict['gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

    # k-statistic and moments
    for i in range(1, 5):
        feature_dict[f'kstat_{i}'] = stats.kstat(x, i)
        feature_dict[f'moment_{i}'] = stats.moment(x, i)
    for i in [1, 2]:
        feature_dict[f'kstatvar_{i}'] = stats.kstatvar(x, i)

    # aggregations on various slices of data
    for agg_type, slice_length, direction in product(['std', 'min', 'max', 'mean'],
                                                     [1000, 10000, 50000],
                                                     ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict[f'{agg_type}_{direction}_{slice_length}'] = sl.agg(agg_type)

    feature_dict['max_to_min'] = x.max() / np.abs(x.min())
    feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_dict['count_big'] = len(x[np.abs(x) > 500])
    feature_dict['sum'] = x.sum()

    feature_dict['mean_change_rate'] = self.calc_change_rate(x)
    # calc_change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000], ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict[f'mean_change_rate_{direction}_{slice_length}'] = self.calc_change_rate(sl)

    # percentiles on original and absolute values
    for p in percentiles:
        feature_dict[f'percentile_{p}'] = np.percentile(x, p)
        feature_dict[f'abs_percentile_{p}'] = np.percentile(np.abs(x), p)

    feature_dict['trend'] = self.add_trend_feature(x)
    feature_dict['abs_trend'] = self.add_trend_feature(x, abs_values=True)
    feature_dict['mad'] = x.mad()
    feature_dict['kurt'] = x.kurtosis()
    feature_dict['skew'] = x.skew()
    feature_dict['med'] = x.median()

    feature_dict['Hilbert_mean'] = np.abs(signal.hilbert(x)).mean()
    for hw in hann_windows:  # signal.windows.hann in newer SciPy
        feature_dict[f'Hann_window_mean_{hw}'] = (signal.convolve(x, signal.hann(hw), mode='same') / sum(signal.hann(hw))).mean()

    feature_dict['classic_sta_lta1_mean'] = self.classic_sta_lta(x, 500, 10000).mean()
    feature_dict['classic_sta_lta2_mean'] = self.classic_sta_lta(x, 5000, 100000).mean()
    feature_dict['classic_sta_lta3_mean'] = self.classic_sta_lta(x, 3333, 6666).mean()
    feature_dict['classic_sta_lta4_mean'] = self.classic_sta_lta(x, 10000, 25000).mean()
    feature_dict['classic_sta_lta5_mean'] = self.classic_sta_lta(x, 50, 1000).mean()
    feature_dict['classic_sta_lta6_mean'] = self.classic_sta_lta(x, 100, 5000).mean()
    feature_dict['classic_sta_lta7_mean'] = self.classic_sta_lta(x, 333, 666).mean()
    feature_dict['classic_sta_lta8_mean'] = self.classic_sta_lta(x, 4000, 10000).mean()

    # exponential rolling statistics
    ewma = pd.Series.ewm
    for s in spans:
        feature_dict[f'exp_Moving_average_{s}_mean'] = ewma(x, span=s).mean().mean(skipna=True)
        feature_dict[f'exp_Moving_average_{s}_std'] = ewma(x, span=s).mean().std(skipna=True)
        feature_dict[f'exp_Moving_std_{s}_mean'] = ewma(x, span=s).std().mean(skipna=True)
        feature_dict[f'exp_Moving_std_{s}_std'] = ewma(x, span=s).std().std(skipna=True)

    feature_dict['iqr'] = np.subtract(*np.percentile(x, [75, 25]))
    feature_dict['iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
    feature_dict['ave10'] = stats.trim_mean(x, 0.1)

    for slice_length, threshold in product([50000, 100000, 150000], [5, 10, 20, 50, 100]):
        feature_dict[f'count_big_{slice_length}_threshold_{threshold}'] = (np.abs(x[-slice_length:]) > threshold).sum()
        feature_dict[f'count_big_{slice_length}_less_threshold_{threshold}'] = (np.abs(x[-slice_length:]) < threshold).sum()

    feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    for i, j in zip(borders, borders[1:]):
        feature_dict[f'range_{i}_{j}'] = feature_calculators.range_count(x, i, j)

    for autocorr_lag in autocorr_lags:
        feature_dict[f'autocorrelation_{autocorr_lag}'] = feature_calculators.autocorrelation(x, autocorr_lag)
        feature_dict[f'c3_{autocorr_lag}'] = feature_calculators.c3(x, autocorr_lag)

    for p in percentiles:
        feature_dict[f'binned_entropy_{p}'] = feature_calculators.binned_entropy(x, p)

    feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)

    for peak in peaks:
        feature_dict[f'num_peaks_{peak}'] = feature_calculators.number_peaks(x, peak)

    for c in coefs:
        feature_dict[f'spkt_welch_density_{c}'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
        feature_dict[f'time_rev_asym_stat_{c}'] = feature_calculators.time_reversal_asymmetry_statistic(x, c)

    for w in windows:
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values

        feature_dict[f'ave_roll_std_{w}'] = x_roll_std.mean()
        feature_dict[f'std_roll_std_{w}'] = x_roll_std.std()
        feature_dict[f'max_roll_std_{w}'] = x_roll_std.max()
        feature_dict[f'min_roll_std_{w}'] = x_roll_std.min()
        for p in percentiles:
            feature_dict[f'percentile_roll_std_{p}_window_{w}'] = np.percentile(x_roll_std, p)
        feature_dict[f'av_change_abs_roll_std_{w}'] = np.mean(np.diff(x_roll_std))
        # NB: mean of the indices of nonzero relative changes (quirk kept from the original)
        feature_dict[f'av_change_rate_roll_std_{w}'] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feature_dict[f'abs_max_roll_std_{w}'] = np.abs(x_roll_std).max()

        feature_dict[f'ave_roll_mean_{w}'] = x_roll_mean.mean()
        feature_dict[f'std_roll_mean_{w}'] = x_roll_mean.std()
        feature_dict[f'max_roll_mean_{w}'] = x_roll_mean.max()
        feature_dict[f'min_roll_mean_{w}'] = x_roll_mean.min()
        for p in percentiles:
            feature_dict[f'percentile_roll_mean_{p}_window_{w}'] = np.percentile(x_roll_mean, p)
        feature_dict[f'av_change_abs_roll_mean_{w}'] = np.mean(np.diff(x_roll_mean))
        feature_dict[f'av_change_rate_roll_mean_{w}'] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        feature_dict[f'abs_max_roll_mean_{w}'] = np.abs(x_roll_mean).max()

    # Mel-frequency cepstral coefficients (MFCCs)
    x = x.values.astype('float32')
    mfcc = librosa.feature.mfcc(y=x)
    for i in range(len(mfcc)):
        feature_dict[f'mfcc_{i}_avg'] = np.mean(np.abs(mfcc[i]))

    # spectral features
    feature_dict['spectral_centroid'] = np.mean(np.abs(librosa.feature.spectral_centroid(y=x)[0]))
    feature_dict['zero_crossing_rate'] = np.mean(np.abs(librosa.feature.zero_crossing_rate(y=x)[0]))
    feature_dict['spectral_flatness'] = np.mean(np.abs(librosa.feature.spectral_flatness(y=x)[0]))
    feature_dict['spectral_contrast'] = np.mean(np.abs(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(x)))[0]))
    feature_dict['spectral_bandwidth'] = np.mean(np.abs(librosa.feature.spectral_bandwidth(y=x)[0]))

    return feature_dict
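# For context: each features() method above is meant to be driven over
# fixed-length segments of the raw signal.  A minimal driver sketch; the
# 150 000-sample segment length matches the largest slices used in this
# file, but the function and argument names here are illustrative
# assumptions, not the original pipeline:

def build_feature_frame(signal_series, ttf, extractor, rows=150000):
    # signal_series, ttf: equal-length pd.Series; extractor exposes
    # .features(x, y, seg_id) as defined above
    records = []
    for seg_id in range(len(signal_series) // rows):
        # reset_index so positional indexing (rolling, np.nonzero) behaves
        x = signal_series.iloc[seg_id * rows:(seg_id + 1) * rows].reset_index(drop=True)
        y = ttf.iloc[(seg_id + 1) * rows - 1]  # time-to-failure at segment end
        records.append(extractor.features(x, y, seg_id))
    return pd.DataFrame(records)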
def generate_features(x):
    # collection of features
    feature_collection = {}
    # collection of intervals
    feature_intervals = {
        'k_static': list(range(1, 5)),
        'variable_k_static': [1, 2],
    }

    for interval in [50, 10, 100, 20]:
        feature_collection[f'discrimination_power_{interval}'] = feature_calculators.c3(x, interval)

    for interval in [500, 10000, 1000, 10, 50, 100]:
        standard_dev = pd.DataFrame(x).rolling(interval).std().dropna().values
        for sub_interval in [50, 60, 70, 75, 1, 40, 80, 90, 95, 99, 5, 10, 20, 25, 30]:
            feature_collection[f'{interval}_{sub_interval}_standard_percentile'] = np.percentile(standard_dev, sub_interval)

    for interval in feature_intervals['k_static']:
        feature_collection[f'{interval}_k_static'] = stats.kstat(x, interval)

    # scale='normal' reproduces the scaling of the removed
    # stats.median_absolute_deviation this originally called
    feature_collection['median_abs_dev'] = stats.median_abs_deviation(x, scale='normal')

    for interval in feature_intervals['variable_k_static']:
        feature_collection[f'{interval}_variable_k_static'] = stats.kstatvar(x, interval)

    feature_collection['kurtosis'] = stats.kurtosis(x)
    for interval in feature_intervals['k_static']:
        feature_collection[f'{interval}_moments'] = stats.moment(x, interval)
    feature_collection['median'] = statistics.median(x)
    feature_collection['skewness'] = stats.skew(x)

    for interval in [1000, 5000, 10000, 5, 10, 50, 100, 500]:
        feature_collection[f'{interval}_correlation'] = feature_calculators.autocorrelation(x, interval)
    for interval in [50, 10, 100, 20]:
        feature_collection[f'{interval}_peak_number'] = feature_calculators.number_peaks(x, interval)

    # geometric and harmonic means
    x_val = x[x.to_numpy().nonzero()[0]]
    feature_collection['geometric_mean'] = stats.gmean(np.abs(x_val))
    feature_collection['harmonic_mean'] = stats.hmean(np.abs(x_val))

    # basic stats (the original called an undefined bare mean()/max()/min())
    feature_collection['mean'] = x.mean()
    feature_collection['std'] = x.std()
    feature_collection['max'] = x.max()
    feature_collection['min'] = x.min()

    # basic stats on absolute values
    feature_collection['mean_change_abs'] = np.diff(x).mean()
    feature_collection['abs_max'] = np.abs(x).max()
    feature_collection['abs_mean'] = np.mean(np.abs(x))
    feature_collection['abs_std'] = np.abs(x).std()

    percentile_divisions = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    for p in percentile_divisions:
        feature_collection[f'{p}th_abs_percentile'] = np.percentile(np.abs(x), p)
        feature_collection[f'{p}th_percentile'] = np.percentile(x, p)

    feature_collection['maximum_absoluteMinimum_ratio'] = x.max() / np.abs(x.min())
    feature_collection['diff_maximum_and_minimum'] = x.max() - np.abs(x.min())
    feature_collection['x_sum'] = x.sum()
    feature_collection['count_x_greater_than_500_BIG'] = len(x[np.abs(x) > 500])
    feature_collection['max_to_min'] = x.max() / np.abs(x.min())
    feature_collection['max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_collection['count_big'] = len(x[np.abs(x) > 500])
    feature_collection['sum'] = x.sum()
    feature_collection['valid_mean_change_rate'] = change_rate_calculation(x)

    # change_rate_calculation on slices of data (the original listed 1000
    # twice; 10000 restores the 1000/10000/50000 grid used elsewhere)
    for slice_len, movement_direction in product([50000, 10000, 1000], ['last', 'first']):
        x_sliced = x[-slice_len:] if movement_direction == 'last' else x[:slice_len]
        feature_collection[f'from_{movement_direction}_slice_{slice_len}_valid_mean_change_rate'] = change_rate_calculation(x_sliced)

    for slice_length, direction in product([50000, 10000, 1000], ['last', 'first']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_collection[f'mean_change_rate_{direction}_{slice_length}'] = change_rate_calculation(sl)

    feature_collection['linear_trend'] = trend_adding_feature(x)
    feature_collection['absolute_linear_trend'] = trend_adding_feature(x, absolute=True)

    for slice_len, threshold_limit in product([50000, 100000, 150000], [5, 10, 20, 50, 100]):
        x_sliced = np.abs(x[-slice_len:])
        feature_collection[f'count_{slice_len}_greater_than_threshold_{threshold_limit}'] = (x_sliced > threshold_limit).sum()
        feature_collection[f'count_{slice_len}_less_than_threshold_{threshold_limit}'] = (x_sliced < threshold_limit).sum()

    # aggregations on various slices of data
    for type_of_aggregation, movement_direction, slice_len in product(['std', 'mean', 'max', 'min'],
                                                                      ['last', 'first'],
                                                                      [50000, 10000, 1000]):
        x_sliced = x[-slice_len:] if movement_direction == 'last' else x[:slice_len]
        feature_collection[f'from_{movement_direction}_slice_{slice_len}_typeOfAggregation{type_of_aggregation}'] = pd.DataFrame(x_sliced).agg(type_of_aggregation)[0]

    return feature_collection
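# change_rate_calculation and trend_adding_feature are undefined here; they
# appear to be renamed counterparts of the calc_change_rate /
# add_trend_feature helpers sketched above.  Sketches under that assumption:

def change_rate_calculation(x):
    # mean of the nonzero, finite relative changes
    rate = np.diff(x) / np.asarray(x, dtype=np.float64)[:-1]
    rate = rate[np.nonzero(rate)[0]]
    rate = rate[~np.isnan(rate) & ~np.isinf(rate)]
    return np.mean(rate)


def trend_adding_feature(x, absolute=False):
    # least-squares slope of the (optionally rectified) signal
    arr = np.abs(x) if absolute else np.asarray(x)
    idx = np.arange(len(arr)).reshape(-1, 1)
    return LinearRegression().fit(idx, arr).coef_[0]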
def features(x: pd.Series) -> pd.DataFrame:
    feature_dict = pd.DataFrame(dtype=np.float64)
    seg_id = 1

    # lists with parameters to iterate over them
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    hann_windows = [50, 150, 1500, 15000]
    spans = [300, 3000, 30000, 50000]
    windows = [10, 50, 100, 500, 1000, 10000]

    # basic stats
    feature_dict.loc[seg_id, 'mean'] = x.mean()
    feature_dict.loc[seg_id, 'std'] = x.std()
    feature_dict.loc[seg_id, 'max'] = x.max()
    feature_dict.loc[seg_id, 'min'] = x.min()

    # basic stats on absolute values
    feature_dict.loc[seg_id, 'mean_change_abs'] = np.mean(np.diff(x))
    feature_dict.loc[seg_id, 'abs_max'] = np.abs(x).max()
    feature_dict.loc[seg_id, 'abs_mean'] = np.abs(x).mean()
    feature_dict.loc[seg_id, 'abs_std'] = np.abs(x).std()

    # geometric and harmonic means
    feature_dict.loc[seg_id, 'hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
    feature_dict.loc[seg_id, 'gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

    # k-statistic and moments
    for i in range(1, 5):
        feature_dict.loc[seg_id, f'kstat_{i}'] = stats.kstat(x, i)
        feature_dict.loc[seg_id, f'moment_{i}'] = stats.moment(x, i)
    for i in [1, 2]:
        feature_dict.loc[seg_id, f'kstatvar_{i}'] = stats.kstatvar(x, i)

    # aggregations on various slices of data
    for agg_type, slice_length, direction in product(['std', 'min', 'max', 'mean'],
                                                     [1000, 10000, 50000],
                                                     ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict.loc[seg_id, f'{agg_type}_{direction}_{slice_length}'] = sl.agg(agg_type)

    feature_dict.loc[seg_id, 'max_to_min'] = x.max() / np.abs(x.min())
    feature_dict.loc[seg_id, 'max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_dict.loc[seg_id, 'count_big'] = len(x[np.abs(x) > 500])
    feature_dict.loc[seg_id, 'sum'] = x.sum()

    feature_dict.loc[seg_id, 'mean_change_rate'] = calc_change_rate(x)
    # calc_change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000], ['first', 'last']):
        sl = x[:slice_length] if direction == 'first' else x[-slice_length:]
        feature_dict.loc[seg_id, f'mean_change_rate_{direction}_{slice_length}'] = calc_change_rate(sl)

    # percentiles on original and absolute values
    for p in percentiles:
        feature_dict.loc[seg_id, f'percentile_{p}'] = np.percentile(x, p)
        feature_dict.loc[seg_id, f'abs_percentile_{p}'] = np.percentile(np.abs(x), p)

    feature_dict.loc[seg_id, 'trend'] = add_trend_feature(x)
    feature_dict.loc[seg_id, 'abs_trend'] = add_trend_feature(x, abs_values=True)
    feature_dict.loc[seg_id, 'mad'] = x.mad()
    feature_dict.loc[seg_id, 'kurt'] = x.kurtosis()
    feature_dict.loc[seg_id, 'skew'] = x.skew()
    feature_dict.loc[seg_id, 'med'] = x.median()

    feature_dict.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(x)).mean()
    for hw in hann_windows:
        feature_dict.loc[seg_id, f'Hann_window_mean_{hw}'] = (convolve(x, hann(hw), mode='same') / sum(hann(hw))).mean()

    feature_dict.loc[seg_id, 'classic_sta_lta1_mean'] = classic_sta_lta(x, 500, 10000).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta2_mean'] = classic_sta_lta(x, 5000, 100000).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta3_mean'] = classic_sta_lta(x, 3333, 6666).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta4_mean'] = classic_sta_lta(x, 10000, 25000).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta5_mean'] = classic_sta_lta(x, 50, 1000).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta6_mean'] = classic_sta_lta(x, 100, 5000).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta7_mean'] = classic_sta_lta(x, 333, 666).mean()
    feature_dict.loc[seg_id, 'classic_sta_lta8_mean'] = classic_sta_lta(x, 4000, 10000).mean()

    # exponential rolling statistics
    ewma = pd.Series.ewm
    for s in spans:
        feature_dict.loc[seg_id, f'exp_Moving_average_{s}_mean'] = ewma(x, span=s).mean().mean(skipna=True)
        feature_dict.loc[seg_id, f'exp_Moving_average_{s}_std'] = ewma(x, span=s).mean().std(skipna=True)
        feature_dict.loc[seg_id, f'exp_Moving_std_{s}_mean'] = ewma(x, span=s).std().mean(skipna=True)
        feature_dict.loc[seg_id, f'exp_Moving_std_{s}_std'] = ewma(x, span=s).std().std(skipna=True)

    feature_dict.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(x, [75, 25]))
    feature_dict.loc[seg_id, 'iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
    feature_dict.loc[seg_id, 'ave10'] = stats.trim_mean(x, 0.1)

    for slice_length, threshold in product([50000, 100000, 150000], [5, 10, 20, 50, 100]):
        feature_dict.loc[seg_id, f'count_big_{slice_length}_threshold_{threshold}'] = (np.abs(x[-slice_length:]) > threshold).sum()
        feature_dict.loc[seg_id, f'count_big_{slice_length}_less_threshold_{threshold}'] = (np.abs(x[-slice_length:]) < threshold).sum()

    # statistics on rolling windows of various sizes
    for w in windows:
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values

        feature_dict.loc[seg_id, f'ave_roll_std_{w}'] = x_roll_std.mean()
        feature_dict.loc[seg_id, f'std_roll_std_{w}'] = x_roll_std.std()
        feature_dict.loc[seg_id, f'max_roll_std_{w}'] = x_roll_std.max()
        feature_dict.loc[seg_id, f'min_roll_std_{w}'] = x_roll_std.min()
        for p in percentiles:
            feature_dict.loc[seg_id, f'percentile_roll_std_{p}_window_{w}'] = np.percentile(x_roll_std, p)
        feature_dict.loc[seg_id, f'av_change_abs_roll_std_{w}'] = np.mean(np.diff(x_roll_std))
        # NB: mean of the indices of nonzero relative changes (quirk kept from the original)
        feature_dict.loc[seg_id, f'av_change_rate_roll_std_{w}'] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feature_dict.loc[seg_id, f'abs_max_roll_std_{w}'] = np.abs(x_roll_std).max()

        feature_dict.loc[seg_id, f'ave_roll_mean_{w}'] = x_roll_mean.mean()
        feature_dict.loc[seg_id, f'std_roll_mean_{w}'] = x_roll_mean.std()
        feature_dict.loc[seg_id, f'max_roll_mean_{w}'] = x_roll_mean.max()
        feature_dict.loc[seg_id, f'min_roll_mean_{w}'] = x_roll_mean.min()
        for p in percentiles:
            feature_dict.loc[seg_id, f'percentile_roll_mean_{p}_window_{w}'] = np.percentile(x_roll_mean, p)
        feature_dict.loc[seg_id, f'av_change_abs_roll_mean_{w}'] = np.mean(np.diff(x_roll_mean))
        feature_dict.loc[seg_id, f'av_change_rate_roll_mean_{w}'] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        feature_dict.loc[seg_id, f'abs_max_roll_mean_{w}'] = np.abs(x_roll_mean).max()

    return feature_dict
def create_features2(seg):
    # MAX_FREQ, FREQ_STEP and CUTOFF are module-level constants describing
    # the FFT band layout; des_filter is sketched after this function.
    data_row = {}

    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    for freq in freq_bands:
        mag_band = magFFT[freq:freq + FREQ_STEP]
        phz_band = phzFFT[freq:freq + FREQ_STEP]
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(mag_band, 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(mag_band, 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(mag_band, 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(mag_band, 0.99)
        data_row['FFT_Mag_mean%d' % freq] = np.mean(mag_band)
        data_row['FFT_Mag_std%d' % freq] = np.std(mag_band)
        data_row['FFT_Mag_max%d' % freq] = np.max(mag_band)
        data_row['FFT_Mag_min%d' % freq] = np.min(mag_band)
        data_row['FFT_Phz_mean%d' % freq] = np.mean(phz_band)
        data_row['FFT_Phz_std%d' % freq] = np.std(phz_band)
        data_row['FFT_Phz_max%d' % freq] = np.max(phz_band)
        data_row['FFT_Phz_min%d' % freq] = np.min(phz_band)

    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()
    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd_first_6000'] = realFFT[:6000].std()  # double underscore typo fixed
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc
    # gc.collect()

    # Build a bank of derived signals: the raw segment, band-filtered copies,
    # rolling means/stds, and exponential means/stds.
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))

    for window in [50, 200, 1000]:
        roll_mean = seg.rolling(window).mean().dropna()
        roll_std = seg.rolling(window).std().dropna()
        sigs.append(pd.Series(roll_mean))
        sigs.append(pd.Series(roll_std))

    for span in [30, 300, 3000]:
        # the original passed span positionally, which ewm() reads as com=
        exp_mean = seg.ewm(span=span).mean().dropna()
        exp_std = seg.ewm(span=span).std().dropna()
        sigs.append(pd.Series(exp_mean))
        sigs.append(pd.Series(exp_std))

    for i, sig in enumerate(sigs):
        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()
        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        # NB: mean of the *indices* of nonzero relative changes, here and in
        # the sliced variants below -- a quirk kept from the original kernels
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()

        for name, sl in [('first_50000', sig[:50000]), ('last_50000', sig[-50000:]),
                         ('first_10000', sig[:10000]), ('last_10000', sig[-10000:])]:
            data_row['std_%s_%d' % (name, i)] = sl.std()
            data_row['avg_%s_%d' % (name, i)] = sl.mean()
            data_row['min_%s_%d' % (name, i)] = sl.min()
            data_row['max_%s_%d' % (name, i)] = sl.max()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()

        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])

        for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
            data_row['percentile_p{}_{}'.format(p, i)] = np.percentile(sig, p)
            # 'abd' in the original key was a typo for 'abs'
            data_row['abs_percentile_p{}_{}'.format(p, i)] = np.percentile(np.abs(sig), p)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()
        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        # data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        data_row['Hann_window50_%d' % i] = (convolve(sig, hann(50), mode='same') / sum(hann(50))).mean()
        data_row['Hann_window500_%d' % i] = (convolve(sig, hann(500), mode='same') / sum(hann(500))).mean()

        data_row['classic_sta_lta0_mean_%d' % i] = classic_sta_lta(sig, 50, 1000).mean()
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        no_of_std = 2
        for w in [10, 100, 500]:
            signal_mean = sig.rolling(window=w).mean()
            signal_std = sig.rolling(window=w).std()
            data_row['high_bound_mean_win{}_{}'.format(w, i)] = (signal_mean + no_of_std * signal_std).mean()
            data_row['low_bound_mean_win{}_{}'.format(w, i)] = (signal_mean - no_of_std * signal_std).mean()

        data_row['range_inf_4000_%d' % i] = feature_calculators.range_count(sig, -np.inf, -4000)
        data_row['range_4000_inf_%d' % i] = feature_calculators.range_count(sig, 4000, np.inf)
        for l, h in [[-4000, -2000], [-2000, 0], [0, 2000], [2000, 4000]]:
            data_row['range_{}_{}_{}'.format(np.abs(l), np.abs(h), i)] = feature_calculators.range_count(sig, l, h)

        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['iqr1_%d' % i] = np.subtract(*np.percentile(sig, [95, 5]))
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        # data_row['var_larger_than_std_dev_%d' % i] = feature_calculators.variance_larger_than_standard_deviation(sig)
        data_row['ratio_unique_values_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)  # duplicate of ratio_value_number
        data_row['abs_energy_%d' % i] = feature_calculators.abs_energy(sig)
        data_row['abs_sum_of_changes_%d' % i] = feature_calculators.absolute_sum_of_changes(sig)
        data_row['count_above_mean_%d' % i] = feature_calculators.count_above_mean(sig)
        data_row['count_below_mean_%d' % i] = feature_calculators.count_below_mean(sig)
        data_row['mean_abs_change_%d' % i] = feature_calculators.mean_abs_change(sig)
        data_row['mean_change_%d' % i] = feature_calculators.mean_change(sig)
        data_row['first_loc_min_%d' % i] = feature_calculators.first_location_of_minimum(sig)
        data_row['first_loc_max_%d' % i] = feature_calculators.first_location_of_maximum(sig)
        data_row['last_loc_min_%d' % i] = feature_calculators.last_location_of_minimum(sig)
        data_row['last_loc_max_%d' % i] = feature_calculators.last_location_of_maximum(sig)
        data_row['long_strk_above_mean_%d' % i] = feature_calculators.longest_strike_above_mean(sig)
        data_row['long_strk_below_mean_%d' % i] = feature_calculators.longest_strike_below_mean(sig)
        # data_row['cid_ce_0_%d' % i] = feature_calculators.cid_ce(sig, 0)
        # data_row['cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)

        for j in [10, 50]:
            data_row['peak_num_p{}_{}'.format(j, i)] = feature_calculators.number_peaks(sig, j)
        for j in [1, 10, 50, 100]:
            data_row['spkt_welch_density_coeff{}_{}'.format(j, i)] = \
                list(feature_calculators.spkt_welch_density(sig, [{'coeff': j}]))[0][1]
        for j in [5, 10, 100]:
            data_row['c3_c{}_{}'.format(j, i)] = feature_calculators.c3(sig, j)
        for j in [5, 10, 50, 100, 1000]:
            data_row['autocorrelation_auto{}_{}'.format(j, i)] = feature_calculators.autocorrelation(sig, j)
        for j in [10, 100, 1000]:
            data_row['time_rev_asym_stat_t{}_{}'.format(j, i)] = feature_calculators.time_reversal_asymmetry_statistic(sig, j)
        for j in range(1, 5):
            data_row['kstat_k{}_{}'.format(j, i)] = stats.kstat(sig, j)
            data_row['moment_m{}_{}'.format(j, i)] = stats.moment(sig, j)
        for j in range(1, 3):
            data_row['kstatvar_k{}_{}'.format(j, i)] = stats.kstatvar(sig, j)
        for j in [5, 10, 50, 100]:
            data_row['binned_entropy_b{}_{}'.format(j, i)] = feature_calculators.binned_entropy(sig, j)

    return data_row
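# des_filter above is undefined; from its call sites it selects a band with
# a Butterworth design: low-pass when only `high` is given, high-pass when
# only `low` is given, band-pass when both are.  A sketch under that
# assumption, reusing the assumed NYQUIST constant from the
# des_bw_filter_lp sketch:

def des_filter(x, low=None, high=None, order=4):
    if low is None:
        b, a = butter(order, Wn=high / NYQUIST, btype='lowpass')
    elif high is None:
        b, a = butter(order, Wn=low / NYQUIST, btype='highpass')
    else:
        b, a = butter(order, Wn=(low / NYQUIST, high / NYQUIST), btype='bandpass')
    return sg.lfilter(b, a, np.asarray(x, dtype=np.float64))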
def kstatvar2(x):
    return stats.kstatvar(x, 2)
def kstatvar1(x):
    return stats.kstatvar(x, 1)
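# These tiny named wrappers exist, presumably, so that pandas .agg and
# multiprocessing get picklable, named callables where a lambda would not
# serialize.  Hypothetical usage:
#
#   seg = pd.Series(np.random.randn(150000))
#   print(seg.agg([kstatvar1, kstatvar2]))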