def get_global_feature(self):
    """
    Extract global time-domain features (max, standard deviation, mean, etc.).

    :return: numpy array of global features
    """
    square_data, square_energy, square_azrate = self.pre_process(method='hanning', ifcrop=True)
    func = lambda x: [
        # feature_calc.autocorrelation(norm(x), 5),
        np.std(x),
        feature_calc.approximate_entropy(norm(x), 5, 1),
        feature_calc.cid_ce(x, normalize=True),
        feature_calc.count_above_mean(x),
        feature_calc.first_location_of_minimum(x),
        feature_calc.first_location_of_maximum(x),
        feature_calc.last_location_of_maximum(x),
        feature_calc.last_location_of_minimum(x),
        feature_calc.longest_strike_above_mean(x),
        feature_calc.number_crossing_m(x, 0.8 * np.max(x)),
        feature_calc.skewness(x),
        feature_calc.time_reversal_asymmetry_statistic(x, 5)
    ]  # global features to extract
    upper_rate = self.get_upper_rate(square_energy)
    feature = np.hstack([
        [np.mean(norm(square_energy))],
        [upper_rate],
        func(square_azrate),
        func(square_energy)
    ])
    return feature
def complexity(mag):
    """Calculate an estimate of a time series' complexity.

    A higher value represents more complexity (more peaks, valleys, etc.).

    See: Batista, Gustavo EAPA, et al. (2014). CID: an efficient
    complexity-invariant distance for time series. Data Mining and
    Knowledge Discovery 28.3 (2014): 634-669.

    :rtype: float
    """
    c = ts.cid_ce(mag, True)
    return c
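# For reference, a minimal numpy sketch of the quantity cid_ce computes, based
# on the CID paper cited above (this helper is an illustration, not part of the
# original code):
import numpy as np

def cid_ce_reference(x, normalize=True):
    # complexity estimate: sqrt(sum((x[i+1] - x[i])^2)),
    # optionally computed on the z-normalized series
    x = np.asarray(x, dtype=float)
    if normalize:
        s = np.std(x)
        if s == 0:
            return 0.0  # a constant series has no complexity
        x = (x - np.mean(x)) / s
    return np.sqrt(np.sum(np.diff(x) ** 2))

# a noisy series scores higher than a smooth ramp of the same length
smooth = np.linspace(0.0, 1.0, 100)
noisy = smooth + np.random.default_rng(0).normal(0.0, 0.2, 100)
assert cid_ce_reference(noisy) > cid_ce_reference(smooth)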
def get_mfcc_feature(self, hadcropped=False):
    '''
    Calculate Mel-frequency cepstral coefficients (MFCC) in the frequency
    domain and extract features from them.

    :return: numpy array
    '''
    # butterfly FFT requires a power-of-two frame rate; the original assert was
    # inverted relative to its own message
    assert self.frame_per_second in [32, 64, 128, 256], \
        "Cannot operate butterfly computation: frame_per_second should be in [32, 64, 128, 256]"
    hanning_kernel = self.get_window(method='hanning')
    windowed = self._add_window(hanning_kernel, self.meta_audio_data)  # [num_frame, kernel_size]
    hanning_energy = self.get_energy(self.meta_audio_data, hanning_kernel)
    if not hadcropped:
        boundary = self.get_boundary(hanning_energy)
        cropped = windowed[boundary[0]: boundary[1] + 1, :]
        frequency = np.vstack([fft.fft(frame.squeeze()) for frame in np.vsplit(cropped, len(cropped))])
    else:
        frequency = np.vstack([fft.fft(windowed)])
    frequency = np.abs(frequency)
    frequency_energy = frequency ** 2
    low_freq = self.sr / self.num_per_frame
    high_freq = self.sr
    H = self._mfcc_filter(self.mfcc_cof, low_freq, high_freq)
    S = np.dot(frequency_energy, H.transpose())  # (F, M)
    cos_ary = self._discrete_cosine_transform()
    mfcc_raw_features = np.sqrt(2 / self.mfcc_cof) * np.dot(S, cos_ary)  # (F, N)
    upper = [self.get_upper_rate(fea) for fea in mfcc_raw_features.transpose()]
    assert len(upper) == mfcc_raw_features.shape[1]
    func = lambda x: [
        # feature_calc.autocorrelation(norm(x), 5),
        np.std(x),
        feature_calc.approximate_entropy(norm(x), 5, 1),
        feature_calc.cid_ce(x, normalize=True),
        feature_calc.count_above_mean(x),
        feature_calc.first_location_of_minimum(x),
        feature_calc.first_location_of_maximum(x),
        feature_calc.last_location_of_maximum(x),
        feature_calc.last_location_of_minimum(x),
        feature_calc.longest_strike_above_mean(x),
        feature_calc.number_crossing_m(x, 0.8 * np.max(x)),
        feature_calc.skewness(x),
        feature_calc.time_reversal_asymmetry_statistic(x, 5)
    ]
    mfcc_features = np.hstack(
        [func(col) for col in mfcc_raw_features.transpose()]
    )
    return mfcc_features
def features(self, x, y, seg_id):
    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id
    x = pd.Series(denoise_signal(x, wavelet='db1', level=1))
    # x = x - np.mean(x)
    zc = np.fft.fft(x)
    zc = zc[:37500]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = [f for f in range(0, 37500, 7500)]
    magFFT = np.sqrt(realFFT ** 2 + imagFFT ** 2)
    phzFFT = np.arctan(imagFFT / realFFT)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    for freq in freq_bands:
        if freq == 0:
            continue
        feature_dict['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq:freq + 7500], 0.01)
        feature_dict['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq:freq + 7500], 0.1)
        feature_dict['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq:freq + 7500], 0.9)
        feature_dict['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq:freq + 7500], 0.99)
        feature_dict['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq:freq + 7500])
        feature_dict['FFT_Mag_std%d' % freq] = np.std(magFFT[freq:freq + 7500])
        feature_dict['FFT_Mag_max%d' % freq] = np.max(magFFT[freq:freq + 7500])

    for p in [10]:
        # use the loop variable rather than the hard-coded 10 of the original
        feature_dict[f'num_peaks_{p}'] = feature_calculators.number_peaks(x, p)

    feature_dict['cid_ce'] = feature_calculators.cid_ce(x, normalize=True)

    for w in [5]:
        feature_dict[f'autocorrelation_{w}'] = feature_calculators.autocorrelation(x, w)

    return feature_dict
def preproc(d):
    df = pd.DataFrame(d)
    x_autocorr = df.apply(lambda x: x.autocorr(lag=5), axis=1)
    x_mean = df.apply(lambda x: np.mean(x), axis=1)
    x_max = df.apply(lambda x: np.max(x), axis=1)
    x_c3 = df.apply(lambda x: tsf_calc.c3(x, 5), axis=1)
    x_cid = standardize(df.apply(lambda x: tsf_calc.cid_ce(x, False), axis=1))
    # flag all observations that are strongly asymmetric
    x_sym = df.apply(
        lambda x: 0 if tsf_calc.symmetry_looking(x, [{'r': 0.0106}])[0][1] else 1,
        axis=1)
    return pd.concat([x_autocorr, x_mean, x_max, x_c3, x_cid, x_sym], axis=1)
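# A hedged usage sketch for preproc, assuming tsf_calc aliases tsfresh's
# feature_calculators module and standardize is a simple z-score helper
# (neither definition appears in the original snippet):
import numpy as np
import pandas as pd
from tsfresh.feature_extraction import feature_calculators as tsf_calc

def standardize(s):
    # assumed helper: z-score a pandas Series
    return (s - s.mean()) / s.std()

rng = np.random.default_rng(42)
feats = preproc(rng.normal(size=(8, 200)))  # 8 series, 200 samples each
print(feats.shape)  # (8, 6): six features per input series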
def CIDCELag3(fragment):
    # note: cid_ce's second positional argument is `normalize`, not a lag;
    # any nonzero value here simply turns z-normalization on
    return fc.cid_ce(fragment, 3)
def calculate_complexity_estimation(traffic):
    return feature_calculators.cid_ce(traffic, normalize=True)
def function(x):
    return cid_ce(x, normalize=self.normalize)
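# For context, one way a closure like this might sit inside a small wrapper
# class; the class and its API are illustrative assumptions, not from the source:
from tsfresh.feature_extraction.feature_calculators import cid_ce

class CidCeTransformer:
    def __init__(self, normalize=True):
        self.normalize = normalize

    def transform(self, rows):
        # `function` closes over self.normalize, as in the snippet above
        def function(x):
            return cid_ce(x, normalize=self.normalize)
        return [function(x) for x in rows]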
def features(self, x, prefix):
    feature_dict = dict()

    # create features here
    # numpy
    feature_dict[prefix + '_' + 'mean'] = np.mean(x)
    feature_dict[prefix + '_' + 'max'] = np.max(x)
    feature_dict[prefix + '_' + 'min'] = np.min(x)
    feature_dict[prefix + '_' + 'std'] = np.std(x)
    feature_dict[prefix + '_' + 'var'] = np.var(x)
    feature_dict[prefix + '_' + 'ptp'] = np.ptp(x)
    for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
        feature_dict[prefix + '_' + 'percentile_%d' % p] = np.percentile(x, p)

    # scipy
    feature_dict[prefix + '_' + 'skew'] = sp.stats.skew(x)
    feature_dict[prefix + '_' + 'kurtosis'] = sp.stats.kurtosis(x)
    for i in range(1, 5):
        feature_dict[prefix + '_' + 'kstat_%d' % i] = sp.stats.kstat(x, i)
        feature_dict[prefix + '_' + 'moment_%d' % i] = sp.stats.moment(x, i)

    # tsfresh
    feature_dict[prefix + '_' + 'abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict[prefix + '_' + 'abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    feature_dict[prefix + '_' + 'count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict[prefix + '_' + 'count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict[prefix + '_' + 'mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict[prefix + '_' + 'mean_change'] = feature_calculators.mean_change(x)
    feature_dict[prefix + '_' + 'var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(x)
    feature_dict[prefix + '_' + 'range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict[prefix + '_' + 'range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    feature_dict[prefix + '_' + 'range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    feature_dict[prefix + '_' + 'range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    feature_dict[prefix + '_' + 'range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    feature_dict[prefix + '_' + 'range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    feature_dict[prefix + '_' + 'range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    feature_dict[prefix + '_' + 'range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    feature_dict[prefix + '_' + 'range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    feature_dict[prefix + '_' + 'range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    feature_dict[prefix + '_' + 'ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict[prefix + '_' + 'first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict[prefix + '_' + 'first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict[prefix + '_' + 'last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict[prefix + '_' + 'last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    for lag in [10, 100, 1000]:
        feature_dict[prefix + '_' + 'time_rev_asym_stat_%d' % lag] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for lag in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 1000]:
        feature_dict[prefix + '_' + 'autocorrelation_%d' % lag] = feature_calculators.autocorrelation(x, lag)
    for lag in [1, 2, 3, 4, 5, 10, 100]:
        feature_dict[prefix + '_' + 'c3_%d' % lag] = feature_calculators.c3(x, lag)
    for c in range(1, 34):
        for attr in ['real', 'imag', 'angle']:
            suffix = 'ang' if attr == 'angle' else attr
            feature_dict[prefix + '_' + 'fft_{0}_{1}'.format(c, suffix)] = list(
                feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': attr}]))[0][1]
    feature_dict[prefix + '_' + 'long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    feature_dict[prefix + '_' + 'long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    feature_dict[prefix + '_' + 'cid_ce_0'] = feature_calculators.cid_ce(x, 0)  # unnormalized complexity
    feature_dict[prefix + '_' + 'cid_ce_1'] = feature_calculators.cid_ce(x, 1)  # z-normalized complexity
    for b in [5, 10, 20, 50, 80, 100]:
        feature_dict[prefix + '_' + 'binned_entropy_%d' % b] = feature_calculators.binned_entropy(x, b)
    feature_dict[prefix + '_' + 'num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    for n in [1, 3, 5, 10, 50, 100, 500]:
        feature_dict[prefix + '_' + 'num_peaks_%d' % n] = feature_calculators.number_peaks(x, n)
    for c in [1, 2, 5, 8, 10, 50, 100]:
        feature_dict[prefix + '_' + 'spkt_welch_density_%d' % c] = list(
            feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
    for lag in [1, 2, 3, 4, 10, 100]:
        feature_dict[prefix + '_' + 'time_rev_asym_stat_%d' % lag] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for r in range(20):
        feature_dict[prefix + '_' + 'symmetry_looking_' + str(r)] = feature_calculators.symmetry_looking(
            x, [{'r': r * 0.05}])[0][1]
    for r in range(1, 20):
        feature_dict[prefix + '_' + 'large_standard_deviation_' + str(r)] = feature_calculators.large_standard_deviation(x, r * 0.05)
    for r in range(1, 10):
        feature_dict[prefix + '_' + 'quantile_' + str(r)] = feature_calculators.quantile(x, r * 0.1)
    for r in ['mean', 'median', 'var']:
        feature_dict[prefix + '_' + 'agg_autocorr_' + r] = feature_calculators.agg_autocorrelation(
            x, [{'f_agg': r, 'maxlag': 40}])[0][-1]
    # for r in range(1, 6):
    #     feature_dict[prefix + '_' + 'number_cwt_peaks_' + str(r)] = feature_calculators.number_cwt_peaks(x, r)
    for r in range(1, 10):
        # note: tsfresh expects 'q' as a fraction in (0, 1); the integer values here follow the original
        feature_dict[prefix + '_' + 'index_mass_quantile_' + str(r)] = feature_calculators.index_mass_quantile(
            x, [{'q': r}])[0][1]
    # for ql in [0., .2, .4, .6, .8]:
    #     for qh in [.2, .4, .6, .8, 1.]:
    #         if ql < qh:
    #             for b in [False, True]:
    #                 for f in ["mean", "var"]:
    #                     feature_dict[prefix + '_' + 'change_quantiles_' + str(ql) + '_' + str(qh) + '_' + str(b) + '_' + str(f)] = feature_calculators.change_quantiles(x, ql, qh, b, f)
    # for r in [.1, .3, .5, .7, .9]:
    #     feature_dict[prefix + '_' + 'approximate_entropy_' + str(r)] = feature_calculators.approximate_entropy(x, 2, r)
    feature_dict[prefix + '_' + 'max_langevin_fixed_point'] = feature_calculators.max_langevin_fixed_point(x, 3, 30)
    for r in ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr']:
        feature_dict[prefix + '_' + 'linear_trend_' + str(r)] = feature_calculators.linear_trend(
            x, [{'attr': r}])[0][1]
    for r in ['pvalue', 'teststat', 'usedlag']:
        feature_dict[prefix + '_' + 'augmented_dickey_fuller_' + r] = feature_calculators.augmented_dickey_fuller(
            x, [{'attr': r}])[0][1]
    for r in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10]:
        feature_dict[prefix + '_' + 'ratio_beyond_r_sigma_' + str(r)] = feature_calculators.ratio_beyond_r_sigma(x, r)
    # for attr in ["pvalue", "rvalue", "intercept", "slope", "stderr"]:
    #     feature_dict[prefix + '_' + 'linear_trend_timewise_' + attr] = feature_calculators.linear_trend_timewise(x, [{'attr': attr}])[0][1]
    # for attr in ["rvalue", "intercept", "slope", "stderr"]:
    #     for i in [5, 10, 50]:
    #         for f in ["max", "min", "mean", "var"]:
    #             feature_dict[prefix + '_' + 'agg_linear_trend_' + attr + '_' + str(i) + '_' + f] = feature_calculators.agg_linear_trend(x, [{'attr': attr, 'chunk_len': i, 'f_agg': f}])[0][-1]
    # for width in [2, 5, 10, 20]:
    #     for coeff in range(15):
    #         for w in [2, 5, 10, 20]:
    #             feature_dict[prefix + '_' + 'cwt_coefficients_' + str(width) + '_' + str(coeff) + '_' + str(w)] = list(feature_calculators.cwt_coefficients(x, [{'widths': width, 'coeff': coeff, 'w': w}]))[0][1]
    # for r in range(10):
    #     feature_dict[prefix + '_' + 'partial_autocorr_' + str(r)] = feature_calculators.partial_autocorrelation(x, [{'lag': r}])[0][1]
    # "ar_coefficient": [{"coeff": coeff, "k": k} for coeff in range(5) for k in [10]],
    # "fft_coefficient": [{"coeff": k, "attr": a} for a, k in product(["real", "imag", "abs", "angle"], range(100))],
    # "fft_aggregated": [{"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"]],
    # "value_count": [{"value": value} for value in [0, 1, -1]],
    # "range_count": [{"min": -1, "max": 1}, {"min": 1e12, "max": 0}, {"min": 0, "max": 1e12}],
    # "friedrich_coefficients": (lambda m: [{"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1)])(3),
    # "energy_ratio_by_chunks": [{"num_segments": 10, "segment_focus": i} for i in range(10)],
    return feature_dict
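# A hedged driver sketch for the extractor above; the FeatureExtractor class
# name and the segment length are assumptions for illustration:
import numpy as np
import pandas as pd

signal = np.random.default_rng(0).normal(size=150_000)
seg_len = 15_000
extractor = FeatureExtractor()  # assumed class exposing features(self, x, prefix)
rows = [extractor.features(signal[i:i + seg_len], prefix='raw')
        for i in range(0, len(signal), seg_len)]
X = pd.DataFrame(rows)  # one row of prefixed features per segment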
def generate_time_series_feats(x_dataset, dataset_name="raw", test=False):
    make_dir_if_not_exists(os.path.join(FEATURES_PATH, 'tsfeats'))
    time_length = x_dataset.shape[1]
    features_function_dict = {
        "mean": mean,
        "median": median,
        "length": length,
        "minimum": minimum,
        "maximum": maximum,
        "variance": variance,
        "skewness": skewness,
        "kurtosis": kurtosis,
        "sum_values": sum_values,
        "abs_energy": abs_energy,
        "mean_change": mean_change,
        "mean_abs_change": mean_abs_change,
        "count_below_mean": count_below_mean,
        "count_above_mean": count_above_mean,
        "has_duplicate_min": has_duplicate_min,
        "has_duplicate_max": has_duplicate_max,
        "standard_deviation": standard_deviation,
        "absolute_sum_of_changes": absolute_sum_of_changes,
        "last_location_of_minimum": last_location_of_minimum,
        "last_location_of_maximum": last_location_of_maximum,
        "first_location_of_maximum": first_location_of_maximum,
        "longest_strike_below_mean": longest_strike_below_mean,
        "longest_strike_above_mean": longest_strike_above_mean,
        "sum_of_reoccurring_values": sum_of_reoccurring_values,
        "first_location_of_minimum": first_location_of_minimum,
        "sum_of_reoccurring_data_points": sum_of_reoccurring_data_points,
        "variance_larger_than_standard_deviation": variance_larger_than_standard_deviation,
        "ratio_value_number_to_time_series_length": ratio_value_number_to_time_series_length,
        "percentage_of_reoccurring_values_to_all_values": percentage_of_reoccurring_values_to_all_values,
        "binned_entropy_max300": lambda x: binned_entropy(x, 300),
        "binned_entropy_max400": lambda x: binned_entropy(x, 400),
        "cid_ce_true": lambda x: cid_ce(x, True),
        "cid_ce_false": lambda x: cid_ce(x, False),
        "percentage_of_reoccurring_datapoints_to_all_datapoints":
            percentage_of_reoccurring_datapoints_to_all_datapoints
    }
    # modernized from the Python 2 original (print statements, iteritems)
    for feature_name, function_call in features_function_dict.items():
        print("{:.<70s}".format("- Processing feature: %s" % feature_name), end='')
        feature_name = 'tsfeats/%s_%s' % (dataset_name, feature_name)
        if not features_exists(feature_name, test):
            feats = x_dataset.apply(function_call, axis=1, raw=True).values
            save_features(feats, feature_name, test)
            print("Done")
        else:
            print("Already generated")

    ar_param_k100 = [{"coeff": i, "k": 100} for i in range(100 + 1)]
    ar_param_k500 = [{"coeff": i, "k": 500} for i in range(500 + 1)]
    agg50_mean_linear_trend = [{"attr": val, "chunk_len": 50, "f_agg": "mean"}
                               for val in ("pvalue", "rvalue", "intercept", "slope", "stderr")]
    aug_dickey_fuler_params = [{"attr": "teststat"}, {"attr": "pvalue"}, {"attr": "usedlag"}]
    energy_ratio_num10_focus5 = [{"num_segments": 10, "segment_focus": 5}]
    fft_aggr_spectrum = [{"aggtype": "centroid"}, {"aggtype": "variance"},
                         {"aggtype": "skew"}, {"aggtype": "kurtosis"}]
    fft_coefficient_real = [{"coeff": i, "attr": "real"} for i in range((time_length + 1) // 2)]
    fft_coefficient_imag = [{"coeff": i, "attr": "imag"} for i in range((time_length + 1) // 2)]
    fft_coefficient_abs = [{"coeff": i, "attr": "abs"} for i in range((time_length + 1) // 2)]
    fft_coefficient_angle = [{"coeff": i, "attr": "angle"} for i in range((time_length + 1) // 2)]
    linear_trend_params = [{"attr": val} for val in ("pvalue", "rvalue", "intercept", "slope", "stderr")]

    other_feats_dict = {
        "ar_coeff100": lambda x: dict(ar_coefficient(x, ar_param_k100)),
        "ar_coeff500": lambda x: dict(ar_coefficient(x, ar_param_k500)),
        "agg50_mean_lin_trend": lambda x: dict(agg_linear_trend(x, agg50_mean_linear_trend)),
        "aug_dickey_fuler": lambda x: dict(augmented_dickey_fuller(x, aug_dickey_fuler_params)),
        "energy_ratio_num10_focus5": lambda x: dict(energy_ratio_by_chunks(x, energy_ratio_num10_focus5)),
        "fft_aggr_spectrum": lambda x: dict(fft_aggregated(x, fft_aggr_spectrum)),
        "fft_coeff_real": lambda x: dict(fft_coefficient(x, fft_coefficient_real)),
        "fft_coeff_imag": lambda x: dict(fft_coefficient(x, fft_coefficient_imag)),
        "fft_coeff_abs": lambda x: dict(fft_coefficient(x, fft_coefficient_abs)),
        "fft_coeff_angle": lambda x: dict(fft_coefficient(x, fft_coefficient_angle)),
        "linear_trend": lambda x: dict(linear_trend(x, linear_trend_params)),
    }
    for feature_name, function_call in other_feats_dict.items():
        print("{:.<70s}".format("- Processing features: %s" % feature_name), end='')
        feature_name = 'tsfeats/%s_%s' % (dataset_name, feature_name)
        if not features_exists(feature_name, test):
            feats_dict = x_dataset.apply(function_call, axis=1, raw=True).values.tolist()
            feats = pd.DataFrame.from_dict(feats_dict)
            save_features(feats.values, feature_name, test)
            print("Done")
        else:
            print("Already generated")

    # Auto-correlations as features
    print("- Processing Auto-correlation features...")
    corr_dataset = x_dataset.apply(autocorrelation_all, axis=1, raw=True)
    save_features(corr_dataset.values, '%s_auto_correlation_all' % dataset_name, test)

    print("- Processing ARIMA(5,5,1) Features...")
    arima_features = parallelize_row(x_dataset.values, generate_arima_feats, n_jobs=2)
    assert arima_features.shape[0] == x_dataset.shape[0]  # assert the axis
    save_features(arima_features, '%s_arima_5_5_1' % dataset_name, test)
def CIDCELag11(fragment):
    return fc.cid_ce(fragment, 11)  # second argument is `normalize` (see CIDCELag3)
def transform_pack3(df):
    """Augment X with tsfresh features."""
    x = df.values
    output = {}
    output['kstat_1'] = stats.kstat(x, 1)
    output['kstat_2'] = stats.kstat(x, 2)
    output['kstat_3'] = stats.kstat(x, 3)
    output['kstat_4'] = stats.kstat(x, 4)
    output['abs_energy'] = feature_calculators.abs_energy(x)
    output['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    output['count_above_mean'] = feature_calculators.count_above_mean(x)
    output['count_below_mean'] = feature_calculators.count_below_mean(x)
    output['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    output['range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    output['range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    output['range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    output['range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    output['range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    output['range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    output['range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    output['range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    output['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    output['ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    output['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    output['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    output['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    output['last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    output['time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(x, 10)
    output['time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(x, 100)
    output['time_rev_asym_stat_1000'] = feature_calculators.time_reversal_asymmetry_statistic(x, 1000)
    output['autocorrelation_10'] = feature_calculators.autocorrelation(x, 10)
    output['autocorrelation_100'] = feature_calculators.autocorrelation(x, 100)
    output['autocorrelation_1000'] = feature_calculators.autocorrelation(x, 1000)
    output['autocorrelation_5000'] = feature_calculators.autocorrelation(x, 5000)
    output['c3_5'] = feature_calculators.c3(x, 5)
    output['c3_10'] = feature_calculators.c3(x, 10)
    output['c3_100'] = feature_calculators.c3(x, 100)
    output['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    output['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    output['cid_ce_0'] = feature_calculators.cid_ce(x, 0)  # unnormalized complexity
    output['cid_ce_1'] = feature_calculators.cid_ce(x, 1)  # z-normalized complexity
    output['binned_entropy_10'] = feature_calculators.binned_entropy(x, 10)
    output['binned_entropy_50'] = feature_calculators.binned_entropy(x, 50)
    output['binned_entropy_80'] = feature_calculators.binned_entropy(x, 80)
    output['binned_entropy_100'] = feature_calculators.binned_entropy(x, 100)

    tmp = np.abs(x)
    output['num_crossing_0'] = feature_calculators.number_crossing_m(tmp, 0)
    output['num_crossing_10'] = feature_calculators.number_crossing_m(tmp, 10)
    output['num_crossing_100'] = feature_calculators.number_crossing_m(tmp, 100)
    output['num_peaks_10'] = feature_calculators.number_peaks(tmp, 10)
    output['num_peaks_50'] = feature_calculators.number_peaks(tmp, 50)
    output['num_peaks_100'] = feature_calculators.number_peaks(tmp, 100)
    output['num_peaks_500'] = feature_calculators.number_peaks(tmp, 500)
    output['spkt_welch_density_1'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 1}]))[0][1]
    output['spkt_welch_density_10'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 10}]))[0][1]
    output['spkt_welch_density_50'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 50}]))[0][1]
    output['spkt_welch_density_100'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 100}]))[0][1]
    output['time_rev_asym_stat_1'] = feature_calculators.time_reversal_asymmetry_statistic(x, 1)
    output['time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(x, 10)
    output['time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(x, 100)
    return output
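# A hedged usage sketch: transform_pack3 expects a pandas object per segment
# and returns a flat dict of scalars, so rows stack naturally into a frame:
import numpy as np
import pandas as pd

seg = pd.Series(np.random.default_rng(1).integers(-5000, 5000, size=150_000).astype(float))
row = pd.DataFrame([transform_pack3(seg)])  # one feature row for this segment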
def features(self, x, y, seg_id, denoise=False):
    if denoise:
        x_hp = high_pass_filter(x, low_cutoff=10000, sample_rate=4000000)
        x = denoise_signal(x_hp, wavelet='haar', level=1)

    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id

    # create features here
    # lists with parameters to iterate over them
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    hann_windows = [50, 150, 1500, 15000]
    spans = [300, 3000, 30000, 50000]
    windows = [10, 50, 100, 500, 1000, 10000]
    borders = list(range(-4000, 4001, 1000))
    peaks = [10, 20, 50, 100]
    coefs = [1, 5, 10, 50, 100]
    lags = [10, 100, 1000, 10000]
    autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]

    # basic stats
    feature_dict['mean'] = x.mean()
    feature_dict['std'] = x.std()
    feature_dict['max'] = x.max()
    feature_dict['min'] = x.min()

    # basic stats on absolute values
    feature_dict['mean_change_abs'] = np.mean(np.diff(x))
    feature_dict['abs_max'] = np.abs(x).max()
    feature_dict['abs_mean'] = np.abs(x).mean()
    feature_dict['abs_std'] = np.abs(x).std()

    # geometric and harmonic means
    feature_dict['hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
    feature_dict['gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

    # k-statistics and moments
    for i in range(1, 5):
        feature_dict['kstat_{}'.format(i)] = stats.kstat(x, i)
        feature_dict['moment_{}'.format(i)] = stats.moment(x, i)
    for i in [1, 2]:
        feature_dict['kstatvar_{}'.format(i)] = stats.kstatvar(x, i)

    # aggregations on various slices of data
    for agg_type, slice_length, direction in product(['std', 'min', 'max', 'mean'],
                                                     [1000, 10000, 50000],
                                                     ['first', 'last']):
        if direction == 'first':
            feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = x[:slice_length].agg(agg_type)
        elif direction == 'last':
            feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = x[-slice_length:].agg(agg_type)

    feature_dict['max_to_min'] = x.max() / np.abs(x.min())
    feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_dict['count_big'] = len(x[np.abs(x) > 500])
    feature_dict['sum'] = x.sum()
    feature_dict['mean_change_rate'] = calc_change_rate(x)

    # calc_change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000], ['first', 'last']):
        if direction == 'first':
            feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = calc_change_rate(x[:slice_length])
        elif direction == 'last':
            feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = calc_change_rate(x[-slice_length:])

    # percentiles on original and absolute values
    for p in percentiles:
        feature_dict['percentile_{}'.format(p)] = np.percentile(x, p)
        feature_dict['abs_percentile_{}'.format(p)] = np.percentile(np.abs(x), p)

    feature_dict['trend'] = add_trend_feature(x)
    feature_dict['abs_trend'] = add_trend_feature(x, abs_values=True)
    feature_dict['mad'] = x.mad()
    feature_dict['kurt'] = x.kurtosis()
    feature_dict['skew'] = x.skew()
    feature_dict['med'] = x.median()

    feature_dict['Hilbert_mean'] = np.abs(hilbert(x)).mean()
    for hw in hann_windows:
        feature_dict['Hann_window_mean_{}'.format(hw)] = (convolve(x, hann(hw), mode='same') / sum(hann(hw))).mean()

    feature_dict['classic_sta_lta1_mean'] = classic_sta_lta(x, 500, 10000).mean()
    feature_dict['classic_sta_lta2_mean'] = classic_sta_lta(x, 5000, 100000).mean()
    feature_dict['classic_sta_lta3_mean'] = classic_sta_lta(x, 3333, 6666).mean()
    feature_dict['classic_sta_lta4_mean'] = classic_sta_lta(x, 10000, 25000).mean()
    feature_dict['classic_sta_lta5_mean'] = classic_sta_lta(x, 50, 1000).mean()
    feature_dict['classic_sta_lta6_mean'] = classic_sta_lta(x, 100, 5000).mean()
    feature_dict['classic_sta_lta7_mean'] = classic_sta_lta(x, 333, 666).mean()
    feature_dict['classic_sta_lta8_mean'] = classic_sta_lta(x, 4000, 10000).mean()

    # exponential rolling statistics
    ewma = pd.Series.ewm
    for s in spans:
        feature_dict['exp_Moving_average_{}_mean'.format(s)] = (ewma(x, span=s).mean(skipna=True)).mean(skipna=True)
        feature_dict['exp_Moving_average_{}_std'.format(s)] = (ewma(x, span=s).mean(skipna=True)).std(skipna=True)
        feature_dict['exp_Moving_std_{}_mean'.format(s)] = (ewma(x, span=s).std(skipna=True)).mean(skipna=True)
        feature_dict['exp_Moving_std_{}_std'.format(s)] = (ewma(x, span=s).std(skipna=True)).std(skipna=True)

    feature_dict['iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
    feature_dict['ave10'] = stats.trim_mean(x, 0.1)

    for slice_length, threshold in product([50000, 100000, 150000], [5, 10, 20, 50, 100]):
        feature_dict['count_big_{}_threshold_{}'.format(slice_length, threshold)] = (np.abs(x[-slice_length:]) > threshold).sum()
        feature_dict['count_big_{}_less_threshold_{}'.format(slice_length, threshold)] = (np.abs(x[-slice_length:]) < threshold).sum()

    # tsfresh features (relatively slow to calculate)
    feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict['mean_change'] = feature_calculators.mean_change(x)
    feature_dict['var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(x)
    feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    for i, j in zip(borders, borders[1:]):
        feature_dict['range_{}_{}'.format(i, j)] = feature_calculators.range_count(x, i, j)
    feature_dict['ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    for lag in lags:
        feature_dict['time_rev_asym_stat_{}'.format(lag)] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for autocorr_lag in autocorr_lags:
        feature_dict['autocorrelation_{}'.format(autocorr_lag)] = feature_calculators.autocorrelation(x, autocorr_lag)
        feature_dict['c3_{}'.format(autocorr_lag)] = feature_calculators.c3(x, autocorr_lag)
    for coeff, attr in product([1, 2, 3, 4, 5], ['real', 'imag', 'angle']):
        feature_dict['fft_{}_{}'.format(coeff, attr)] = list(
            feature_calculators.fft_coefficient(x, [{'coeff': coeff, 'attr': attr}]))[0][1]
    feature_dict['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    feature_dict['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)
    for p in percentiles:
        feature_dict['binned_entropy_{}'.format(p)] = feature_calculators.binned_entropy(x, p)
    feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    for peak in peaks:
        # use the loop variable, not the `peaks` list, in the key (bug in the original)
        feature_dict['num_peaks_{}'.format(peak)] = feature_calculators.number_peaks(x, peak)
    for c in coefs:
        feature_dict['spkt_welch_density_{}'.format(c)] = list(
            feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
        feature_dict['time_rev_asym_stat_{}'.format(c)] = feature_calculators.time_reversal_asymmetry_statistic(x, c)

    # statistics on rolling windows of various sizes
    for w in windows:
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values

        feature_dict['ave_roll_std_{}'.format(w)] = x_roll_std.mean()
        feature_dict['std_roll_std_{}'.format(w)] = x_roll_std.std()
        feature_dict['max_roll_std_{}'.format(w)] = x_roll_std.max()
        feature_dict['min_roll_std_{}'.format(w)] = x_roll_std.min()
        for p in percentiles:
            feature_dict['percentile_roll_std_{}_window_{}'.format(p, w)] = np.percentile(x_roll_std, p)
        feature_dict['av_change_abs_roll_std_{}'.format(w)] = np.mean(np.diff(x_roll_std))
        feature_dict['av_change_rate_roll_std_{}'.format(w)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feature_dict['abs_max_roll_std_{}'.format(w)] = np.abs(x_roll_std).max()

        feature_dict['ave_roll_mean_{}'.format(w)] = x_roll_mean.mean()
        feature_dict['std_roll_mean_{}'.format(w)] = x_roll_mean.std()
        feature_dict['max_roll_mean_{}'.format(w)] = x_roll_mean.max()
        feature_dict['min_roll_mean_{}'.format(w)] = x_roll_mean.min()
        for p in percentiles:
            feature_dict['percentile_roll_mean_{}_window_{}'.format(p, w)] = np.percentile(x_roll_mean, p)
        feature_dict['av_change_abs_roll_mean_{}'.format(w)] = np.mean(np.diff(x_roll_mean))
        feature_dict['av_change_rate_roll_mean_{}'.format(w)] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        feature_dict['abs_max_roll_mean_{}'.format(w)] = np.abs(x_roll_mean).max()

    return feature_dict
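# calc_change_rate is used above but not defined in this snippet; a common
# definition in similar kernels (an assumption, not the author's verified code)
# is the mean of the nonzero, finite step-to-step relative changes:
import numpy as np

def calc_change_rate(x):
    x = np.asarray(x, dtype=float)
    change = np.diff(x) / x[:-1]          # relative change between samples
    change = change[change != 0]          # drop flat steps
    change = change[np.isfinite(change)]  # drop inf/nan from zero denominators
    return np.mean(change)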
def create_features(seg_id, seg, X, st, end):
    """
    Create features for a signal segment, including FFT features,
    statistical features and time series features.

    :param seg_id: the ID of a sample
    :param seg: a signal segment
    :param X: train set features before creating these features
    :param st: the start index of the signal segment
    :param end: the end index of the signal segment
    :return: train set features after creating these features
    """
    try:
        # the test set won't create these features because its seg_id is a string
        X.loc[seg_id, 'seg_id'] = np.int32(seg_id)
        X.loc[seg_id, 'seg_start'] = np.int32(st)
        X.loc[seg_id, 'seg_end'] = np.int32(end)
    except ValueError:
        pass

    xc = pd.Series(seg['acoustic_data'].values)
    xcdm = xc - np.mean(xc)

    b, a = des_bw_filter_lp(cutoff=18000)
    xcz = sg.lfilter(b, a, xcdm)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = [f for f in range(0, MAX_FREQ, FREQ_BAND)]
    magFFT = np.sqrt(realFFT ** 2 + imagFFT ** 2)
    phzFFT = np.arctan(imagFFT / realFFT)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    for freq in freq_bands:
        X.loc[seg_id, 'FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.01)
        X.loc[seg_id, 'FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.1)
        X.loc[seg_id, 'FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.9)
        X.loc[seg_id, 'FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.99)
        X.loc[seg_id, 'FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_BAND])

    X.loc[seg_id, 'FFT_Rmean'] = realFFT.mean()
    X.loc[seg_id, 'FFT_Rstd'] = realFFT.std()
    X.loc[seg_id, 'FFT_Rmax'] = realFFT.max()
    X.loc[seg_id, 'FFT_Rmin'] = realFFT.min()
    X.loc[seg_id, 'FFT_Imean'] = imagFFT.mean()
    X.loc[seg_id, 'FFT_Istd'] = imagFFT.std()
    X.loc[seg_id, 'FFT_Imax'] = imagFFT.max()
    X.loc[seg_id, 'FFT_Imin'] = imagFFT.min()

    X.loc[seg_id, 'FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    X.loc[seg_id, 'FFT_Rstd__first_6000'] = realFFT[:6000].std()
    X.loc[seg_id, 'FFT_Rmax_first_6000'] = realFFT[:6000].max()
    X.loc[seg_id, 'FFT_Rmin_first_6000'] = realFFT[:6000].min()
    X.loc[seg_id, 'FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    X.loc[seg_id, 'FFT_Rstd_first_18000'] = realFFT[:18000].std()
    X.loc[seg_id, 'FFT_Rmax_first_18000'] = realFFT[:18000].max()
    X.loc[seg_id, 'FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc

    # split the de-meaned signal into frequency bands with Butterworth filters
    b, a = des_bw_filter_lp(cutoff=2500)
    xc0 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=2500, high=5000)
    xc1 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=5000, high=7500)
    xc2 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=7500, high=10000)
    xc3 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=10000, high=12500)
    xc4 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=12500, high=15000)
    xc5 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=15000, high=17500)
    xc6 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=17500, high=20000)
    xc7 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_hp(cutoff=20000)
    xc8 = sg.lfilter(b, a, xcdm)

    sigs = [xc, pd.Series(xc0), pd.Series(xc1), pd.Series(xc2), pd.Series(xc3),
            pd.Series(xc4), pd.Series(xc5), pd.Series(xc6), pd.Series(xc7), pd.Series(xc8)]

    for i, sig in enumerate(sigs):
        X.loc[seg_id, 'mean_%d' % i] = sig.mean()
        X.loc[seg_id, 'std_%d' % i] = sig.std()
        X.loc[seg_id, 'max_%d' % i] = sig.max()
        X.loc[seg_id, 'min_%d' % i] = sig.min()

        X.loc[seg_id, 'mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        X.loc[seg_id, 'mean_change_rate_%d' % i] = calc_mean_change_rate(sig)
        X.loc[seg_id, 'abs_max_%d' % i] = np.abs(sig).max()

        X.loc[seg_id, 'std_first_50000_%d' % i] = sig[:50000].std()
        X.loc[seg_id, 'std_last_50000_%d' % i] = sig[-50000:].std()
        X.loc[seg_id, 'std_first_10000_%d' % i] = sig[:10000].std()
        X.loc[seg_id, 'std_last_10000_%d' % i] = sig[-10000:].std()

        X.loc[seg_id, 'avg_first_50000_%d' % i] = sig[:50000].mean()
        X.loc[seg_id, 'avg_last_50000_%d' % i] = sig[-50000:].mean()
        X.loc[seg_id, 'avg_first_10000_%d' % i] = sig[:10000].mean()
        X.loc[seg_id, 'avg_last_10000_%d' % i] = sig[-10000:].mean()

        X.loc[seg_id, 'min_first_50000_%d' % i] = sig[:50000].min()
        X.loc[seg_id, 'min_last_50000_%d' % i] = sig[-50000:].min()
        X.loc[seg_id, 'min_first_10000_%d' % i] = sig[:10000].min()
        X.loc[seg_id, 'min_last_10000_%d' % i] = sig[-10000:].min()

        X.loc[seg_id, 'max_first_50000_%d' % i] = sig[:50000].max()
        X.loc[seg_id, 'max_last_50000_%d' % i] = sig[-50000:].max()
        X.loc[seg_id, 'max_first_10000_%d' % i] = sig[:10000].max()
        X.loc[seg_id, 'max_last_10000_%d' % i] = sig[-10000:].max()

        X.loc[seg_id, 'max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        X.loc[seg_id, 'max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        X.loc[seg_id, 'count_big_%d' % i] = len(sig[np.abs(sig) > 500])

        X.loc[seg_id, 'mean_change_rate_first_50000_%d' % i] = calc_mean_change_rate(sig[:50000])
        X.loc[seg_id, 'mean_change_rate_last_50000_%d' % i] = calc_mean_change_rate(sig[-50000:])
        X.loc[seg_id, 'mean_change_rate_first_10000_%d' % i] = calc_mean_change_rate(sig[:10000])
        X.loc[seg_id, 'mean_change_rate_last_10000_%d' % i] = calc_mean_change_rate(sig[-10000:])

        X.loc[seg_id, 'q95_%d' % i] = np.quantile(sig, 0.95)
        X.loc[seg_id, 'q99_%d' % i] = np.quantile(sig, 0.99)
        X.loc[seg_id, 'q05_%d' % i] = np.quantile(sig, 0.05)
        X.loc[seg_id, 'q01_%d' % i] = np.quantile(sig, 0.01)

        X.loc[seg_id, 'abs_q95_%d' % i] = np.quantile(np.abs(sig), 0.95)
        X.loc[seg_id, 'abs_q99_%d' % i] = np.quantile(np.abs(sig), 0.99)
        X.loc[seg_id, 'abs_q05_%d' % i] = np.quantile(np.abs(sig), 0.05)
        X.loc[seg_id, 'abs_q01_%d' % i] = np.quantile(np.abs(sig), 0.01)

        X.loc[seg_id, 'trend_%d' % i] = add_trend_feature(sig)
        X.loc[seg_id, 'abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        X.loc[seg_id, 'abs_mean_%d' % i] = np.abs(sig).mean()
        X.loc[seg_id, 'abs_std_%d' % i] = np.abs(sig).std()

        X.loc[seg_id, 'mad_%d' % i] = sig.mad()
        X.loc[seg_id, 'kurt_%d' % i] = sig.kurtosis()
        X.loc[seg_id, 'skew_%d' % i] = sig.skew()
        X.loc[seg_id, 'med_%d' % i] = sig.median()

        X.loc[seg_id, 'Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # note: computed from xc (not sig), so this key takes the same value on every pass
        X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()

        X.loc[seg_id, 'classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        X.loc[seg_id, 'classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        X.loc[seg_id, 'classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        X.loc[seg_id, 'classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        X.loc[seg_id, 'Moving_average_700_mean_%d' % i] = sig.rolling(window=700).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_1500_mean_%d' % i] = sig.rolling(window=1500).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_3000_mean_%d' % i] = sig.rolling(window=3000).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_6000_mean_%d' % i] = sig.rolling(window=6000).mean().mean(skipna=True)

        ewma = pd.Series.ewm
        X.loc[seg_id, 'exp_Moving_average_300_mean_%d' % i] = ewma(sig, span=300).mean().mean(skipna=True)
        X.loc[seg_id, 'exp_Moving_average_3000_mean_%d' % i] = ewma(sig, span=3000).mean().mean(skipna=True)
        X.loc[seg_id, 'exp_Moving_average_30000_mean_%d' % i] = ewma(sig, span=30000).mean().mean(skipna=True)

        no_of_std = 3
        X.loc[seg_id, 'MA_700MA_std_mean_%d' % i] = sig.rolling(window=700).std().mean()
        X.loc[seg_id, 'MA_700MA_BB_high_mean_%d' % i] = (
            X.loc[seg_id, 'Moving_average_700_mean_%d' % i] + no_of_std * X.loc[seg_id, 'MA_700MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_700MA_BB_low_mean_%d' % i] = (
            X.loc[seg_id, 'Moving_average_700_mean_%d' % i] - no_of_std * X.loc[seg_id, 'MA_700MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_400MA_std_mean_%d' % i] = sig.rolling(window=400).std().mean()
        X.loc[seg_id, 'MA_400MA_BB_high_mean_%d' % i] = (
            X.loc[seg_id, 'Moving_average_700_mean_%d' % i] + no_of_std * X.loc[seg_id, 'MA_400MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_400MA_BB_low_mean_%d' % i] = (
            X.loc[seg_id, 'Moving_average_700_mean_%d' % i] - no_of_std * X.loc[seg_id, 'MA_400MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_1000MA_std_mean_%d' % i] = sig.rolling(window=1000).std().mean()

        X.loc[seg_id, 'iqr_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        X.loc[seg_id, 'q999_%d' % i] = np.quantile(sig, 0.999)
        X.loc[seg_id, 'q001_%d' % i] = np.quantile(sig, 0.001)
        X.loc[seg_id, 'ave10_%d' % i] = stats.trim_mean(sig, 0.1)

        X.loc[seg_id, 'num_peaks_10_%d' % i] = feature_calculators.number_peaks(sig, 10)
        X.loc[seg_id, 'cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)  # time series complexity
        X.loc[seg_id, 'count_1000_0_%d' % i] = feature_calculators.range_count(sig, -1000, 0)
        X.loc[seg_id, 'binned_entropy_5_%d' % i] = feature_calculators.binned_entropy(sig, 5)
        X.loc[seg_id, 'binned_entropy_15_%d' % i] = feature_calculators.binned_entropy(sig, 15)

    # a sliding window is a kind of filter, so this code stays outside the band-pass loop
    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna()
        x_roll_mean = xc.rolling(windows).mean().dropna()

        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = calc_mean_change_rate(x_roll_std)
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()

        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = calc_mean_change_rate(x_roll_mean)
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()

    return X
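# The des_bw_filter_* designers are not included in this snippet; a plausible
# sketch using SciPy Butterworth designs, assuming the 4 MHz sampling rate of
# similar acoustic kernels (the filter order and rate are assumptions):
from scipy.signal import butter

SAMPLE_RATE = 4_000_000
NYQUIST = SAMPLE_RATE / 2.0

def des_bw_filter_lp(cutoff):
    return butter(4, cutoff / NYQUIST, btype='lowpass')   # -> (b, a)

def des_bw_filter_bp(low, high):
    return butter(4, [low / NYQUIST, high / NYQUIST], btype='bandpass')

def des_bw_filter_hp(cutoff):
    return butter(4, cutoff / NYQUIST, btype='highpass')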
def CIDCELag5(fragment):
    return fc.cid_ce(fragment, 5)  # second argument is `normalize` (see CIDCELag3)
def CIDCELag7(fragment):
    return fc.cid_ce(fragment, 7)  # second argument is `normalize` (see CIDCELag3)
# 3. `last_location_of_maximum`: locates the last occurrence of the maximum value in the time series
# 4. `skewness`: the Fisher-Pearson skewness of the time series
# 5. `sample_entropy`: the sample entropy of the time series
#
# All of these will be calculated for each operational setting and each sensor measurement, giving us 5 new columns for each of the 29 original features (145 total features).

# In[ ]:

from tsfresh.feature_extraction.feature_calculators import (
    cid_ce, number_peaks, last_location_of_maximum, skewness, sample_entropy)

# Because every lambda passed to `agg` would otherwise share the name `<lambda>`,
# we create the lambda functions first and give them custom names. `cid_ce` and
# `number_peaks` both have required arguments; the other functions only need a
# time series.

# In[ ]:

cid_ce_func = lambda x: cid_ce(x, normalize=False)
cid_ce_func.__name__ = 'cid_ce'

n_peaks = lambda x: number_peaks(x, n=5)
n_peaks.__name__ = 'number_peaks'

# Apply the five operations
ts_values = train_obs.drop(columns=['time_in_cycles']).groupby('engine_no').agg([
    cid_ce_func, n_peaks, last_location_of_maximum, skewness, sample_entropy
])
ts_values.head()

# Below we rename the columns.
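# In[ ]:

# The renaming cell itself is not shown above; a minimal sketch of one way to
# flatten the resulting MultiIndex columns (the naming scheme is an assumption):
# ('sensor_1', 'cid_ce') -> 'sensor_1_cid_ce'
ts_values.columns = ['_'.join(col) for col in ts_values.columns]
ts_values.head()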
def complexity(x):
    return fc.cid_ce(x, True)
def features(self, x, y, seg_id):
    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id

    # create features here
    # numpy
    feature_dict['mean'] = np.mean(x)
    feature_dict['max'] = np.max(x)
    feature_dict['min'] = np.min(x)
    feature_dict['std'] = np.std(x)
    feature_dict['var'] = np.var(x)
    feature_dict['ptp'] = np.ptp(x)
    for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
        feature_dict['percentile_%d' % p] = np.percentile(x, p)

    # scipy
    feature_dict['skew'] = sp.stats.skew(x)
    feature_dict['kurtosis'] = sp.stats.kurtosis(x)
    for i in range(1, 5):
        feature_dict['kstat_%d' % i] = sp.stats.kstat(x, i)
        feature_dict['moment_%d' % i] = sp.stats.moment(x, i)

    # tsfresh
    feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict['mean_change'] = feature_calculators.mean_change(x)
    feature_dict['var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(x)
    feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict['range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    feature_dict['range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    feature_dict['range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    feature_dict['range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    feature_dict['range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    feature_dict['range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    feature_dict['range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    feature_dict['range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    feature_dict['ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    for lag in [10, 100, 1000]:
        feature_dict['time_rev_asym_stat_%d' % lag] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for lag in [5, 10, 50, 100, 1000]:
        feature_dict['autocorrelation_%d' % lag] = feature_calculators.autocorrelation(x, lag)
    for lag in [5, 10, 100]:
        feature_dict['c3_%d' % lag] = feature_calculators.c3(x, lag)
    for c in [1, 2, 3]:
        feature_dict['fft_%d_real' % c] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'real'}]))[0][1]
        feature_dict['fft_%d_imag' % c] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'imag'}]))[0][1]
        feature_dict['fft_%d_ang' % c] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'angle'}]))[0][1]
    feature_dict['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    feature_dict['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)  # unnormalized complexity estimate
    feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)  # z-normalized complexity estimate
    for b in [5, 10, 20, 50, 80, 100]:
        feature_dict['binned_entropy_%d' % b] = feature_calculators.binned_entropy(x, b)
    feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    for n in [10, 50, 100, 500]:
        feature_dict['num_peaks_%d' % n] = feature_calculators.number_peaks(x, n)
    for c in [1, 10, 50, 100]:
        feature_dict['spkt_welch_density_%d' % c] = list(feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
    for lag in [1, 10, 100]:
        feature_dict['time_rev_asym_stat_%d' % lag] = feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    return feature_dict
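# A small check of what the two cid_ce variants above mean: cid_ce(x, 1)
# z-normalizes first, so it matches cid_ce(z, 0) on a standardized copy
# (illustrative, using tsfresh directly):
import numpy as np
from tsfresh.feature_extraction import feature_calculators

x = np.random.default_rng(7).normal(size=10_000)
z = (x - x.mean()) / x.std()
assert np.isclose(feature_calculators.cid_ce(x, 1), feature_calculators.cid_ce(z, 0))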