def get_features_from_one_signal(X, sample_rate=50):
    """Compute an 18-element feature vector for a single 1-D signal.

    The vector holds basic statistics, a set of tsfresh time-series
    features, and the first six Welch spectral-density coefficients.

    :param X: 1-D numpy array holding one signal
    :param sample_rate: sampling rate, also used as autocorrelation lag
    :return: list of 18 feature values
    """
    assert X.ndim == 1, "Expected single signal in feature extraction"

    signal_mean = np.mean(X)

    # Welch power spectral density evaluated at coefficients 1..6.
    welch_params = [{"coeff": k} for k in range(1, 7)]
    _, welch_vals = zip(*fc.spkt_welch_density(X, welch_params))
    welch_vals = np.asarray(welch_vals)

    features = [
        signal_mean,
        np.std(X),
        fc.abs_energy(X),
        fc.absolute_sum_of_changes(X),
        fc.autocorrelation(X, sample_rate),
        fc.count_above_mean(X),
        fc.count_below_mean(X),
        fc.kurtosis(X),
        fc.longest_strike_above_mean(X),
        fc.number_crossing_m(X, signal_mean),
        # Peak support width scales with the sampling rate.
        fc.number_peaks(X, int(sample_rate / 10)),
        fc.sample_entropy(X),
    ]
    features.extend(welch_vals[:6])
    return features
def get_sta_features(self, data):
    """
    Calculate the value of 9 kinds of selected statistical features.

    :param data: 1-D sequence of signal values
    :return: [abs_energy, binned_entropy, autocorrelation(lag=4), skewness,
              kurtosis, linear-trend slope, mean, minimum, maximum]
    """
    def _cal_trend(values):
        # Slope of an ordinary least-squares fit of the series against time.
        time_list = np.arange(len(values))
        # create linear regression object
        regr = linear_model.LinearRegression()
        regr.fit(time_list.reshape(-1, 1), np.array(values).reshape(-1, 1))
        return regr.coef_[0][0]

    E = ts.abs_energy(data)
    S = ts.binned_entropy(data, max_bins=5)
    ro = ts.autocorrelation(data, lag=4)
    skewness = ts.skewness(data)
    kurtosis = ts.kurtosis(data)
    trend = _cal_trend(data)
    mean = ts.mean(data)
    minimum = ts.minimum(data)  # renamed: was `min`, shadowing the builtin
    maximum = ts.maximum(data)  # renamed: was `max`, shadowing the builtin
    return [E, S, ro, skewness, kurtosis, trend, mean, minimum, maximum]
def feature_extract(dt):
    """Build a {name: value} dict of tsfresh scalar features for series *dt*.

    Also merges in the FFT-based features produced by ``fft_ft``.
    """
    import tsfresh.feature_extraction.feature_calculators as fc

    ft = {
        'abs_energy': fc.abs_energy(dt),
        'sum_values': fc.sum_values(dt),
        'mean': fc.mean(dt),
        'maximum': fc.maximum(dt),
        'minimum': fc.minimum(dt),
        'median': fc.median(dt),
    }
    # Deciles 0.1 .. 0.9.
    for q in range(1, 10):
        ft['quantile_0.%d' % q] = fc.quantile(dt, q / 10.0)

    # TODO: fc.fft_coefficient(dt, {"coeff": 0, "attr": ...}) for
    # attr in real/imag/abs/angle did not work well and was dropped;
    # fft_ft below covers the FFT features instead.
    ft.update(fft_ft(dt))
    return ft
def ACEnergy(self, x):
    """Absolute energy of the AC part of the normalized FFT magnitude spectrum.

    The input is band-pass filtered first (args 0.1, 20, 200 — presumably
    low/high cutoffs and sampling rate; confirm against
    butter_bandpass_filter). Bin 0 (the DC component) is excluded.
    """
    filtered = self.butter_bandpass_filter(x, 0.1, 20, 200)
    spectrum = abs(fft(filtered))
    spectrum = spectrum / len(spectrum)
    # Sum of squares over bins 1..N-1, i.e. everything but DC.
    return fc.abs_energy(spectrum[1:])  # ACEnergy
def time_series_abs_energy(x):
    """Return the absolute energy (sum of squared values) of *x*.

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: the value of this feature
    :return type: float
    """
    result = ts_feature_calculators.abs_energy(x)
    return result
def TS_features(signal):
    """Return six tsfresh features of *signal* as a tuple.

    Order: abs_energy, absolute_sum_of_changes, count_above_mean,
    count_below_mean, first_location_of_maximum, first_location_of_minimum.
    """
    return (
        ts.abs_energy(signal),
        ts.absolute_sum_of_changes(signal),
        ts.count_above_mean(signal),
        ts.count_below_mean(signal),
        ts.first_location_of_maximum(signal),
        ts.first_location_of_minimum(signal),
    )
def abs_energy(mag):
    """Return the absolute energy of the time series *mag*.

    Defined as the sum over the squared values of the series.

    rtype: float
    """
    return ts.abs_energy(mag)
def time_series_abs_energy(x):
    # Raw docstring: the original non-raw string contained the invalid
    # escape sequence `\l` (in \ldots), a SyntaxWarning on modern Python.
    r"""
    Returns the absolute energy of the time series which is the sum over the squared values

    .. math::

        E = \sum_{i=1,\ldots, n} x_i^2

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: the value of this feature
    :return type: float
    """
    return ts_feature_calculators.abs_energy(x)
def extract_feats(ts):
    """Return a 9-element numpy feature vector for the series *ts*.

    Order: std, kurtosis, skewness, count above/below mean, longest strike
    above/below mean, Fourier entropy (1e6 bins), absolute energy.
    """
    return np.array([
        fc.standard_deviation(ts),
        fc.kurtosis(ts),
        fc.skewness(ts),
        fc.count_above_mean(ts),
        fc.count_below_mean(ts),
        fc.longest_strike_above_mean(ts),
        fc.longest_strike_below_mean(ts),
        fc.fourier_entropy(ts, bins=1000000),
        fc.abs_energy(ts),
    ])
def transform(self, value):
    """Map a sequence of samples to a 6-element feature list.

    Returns None for missing input or when feature extraction fails
    (e.g. an empty sequence causes division by zero below).
    """
    if value is None:
        return None
    # TODO: remove try-except and validate value in order to avoid exception
    try:
        return [
            abs_energy(value),
            kurtosis(value),
            mean_abs_change(value),
            skewness(value),
            count_above_mean(value) / len(value),
            count_below_mean(value) / len(value)
        ]
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate; the best-effort None result is preserved.
    except Exception:
        return None
def abs_energy(self, x):
    # Raw docstring: the original non-raw string contained invalid escape
    # sequences (`\l` in \ldots, trailing `\ `), a SyntaxWarning on modern
    # Python.
    r"""
    As in tsfresh `abs_energy <https://github.com/blue-yonder/tsfresh/blob/master/tsfresh/feature_extraction/feature_calculators.py#L390>`_

    Returns the absolute energy of the time series which is the sum over the squared values

    .. math::

        E=\sum_{i=1,\ldots, n}x_i^2

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: the value of this feature
    :rtype: float
    """
    # Delegate the actual computation to tsfresh.
    _energy = feature_calculators.abs_energy(x)
    logging.debug("abs energy by tsfresh calculated")
    return _energy
def extract_features(data):
    """Return a list of tsfresh-based features of *data* with NaNs zeroed.

    *data* is expected to expose a ``.value`` attribute (used for the
    array-based calculators); one day is 24*60 samples — presumably
    minute-resolution data, confirm with callers.
    """
    day = 24 * 60
    raw = numpy.array([
        feature.symmetry_looking(data, [{'r': 0.3}])[0][1],
        feature.variance_larger_than_standard_deviation(data).bool(),
        feature.ratio_beyond_r_sigma(data, 2),
        feature.has_duplicate_max(data),
        feature.has_duplicate_min(data),
        feature.has_duplicate(data),
        feature.agg_autocorrelation(numpy.array(data.value),
                                    [{'f_agg': 'mean', 'maxlag': day}])[0][1],
        feature.partial_autocorrelation(data, [{'lag': day}])[0][1],
        feature.abs_energy(numpy.array(data.value)),
        feature.mean_change(data),
        feature.mean_second_derivative_central(data),
        feature.median(data),
        float(feature.mean(data)),
        float(feature.standard_deviation(data)),
        float(feature.longest_strike_below_mean(data)),
        float(feature.longest_strike_above_mean(data)),
        int(feature.number_peaks(data, 10)),
        feature.linear_trend(numpy.array(data.value), [{'attr': 'rvalue'}])[0][1],
        feature.c3(data, day),
        float(feature.maximum(data)),
        float(feature.minimum(data)),
    ])
    return list(numpy.nan_to_num(raw))
def abs_energy(current_observation: pd.DataFrame, raw_key: str):
    """Absolute energy (sum of squares) of the *raw_key* column of the frame."""
    series = current_observation[raw_key]
    return tsf.abs_energy(series)
def create_features2(seg, ):
    """Build a flat {feature_name: value} dict for one signal segment.

    *seg* must be a pandas Series (it uses ``.rolling`` / ``.ewm`` below).
    The pipeline: (1) FFT magnitude/phase statistics per frequency band,
    (2) derive a list of sub-signals (band-pass filtered versions, rolling
    and exponential mean/std), then (3) a large battery of statistics per
    sub-signal. ``des_filter``, ``CUTOFF``, ``MAX_FREQ``, ``FREQ_STEP``,
    ``add_trend_feature`` and ``classic_sta_lta`` are defined elsewhere in
    the project — TODO confirm their contracts.
    """
    data_row = {}
    # --- 1) spectral features on the low-passed segment -------------------
    xcz = des_filter(seg, high=CUTOFF)
    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]
    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    # Clamp infinities and NaNs in the phase before aggregating.
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)
    # Per-band quantiles / moments of magnitude and phase.
    for freq in freq_bands:
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.99)
        data_row['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_min%d' % freq] = np.min(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_max%d' % freq] = np.max(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_min%d' % freq] = np.min(phzFFT[freq: freq + FREQ_STEP])
    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()
    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd__first_6000'] = realFFT[:6000].std()
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()
    # Free the large intermediates before building the sub-signal list.
    del xcz
    del zc
    # gc.collect()
    # --- 2) derive sub-signals: raw, per-band filtered, rolling/ewm stats --
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))
    for window in [50, 200, 1000]:
        roll_mean = seg.rolling(window).mean().dropna()
        roll_std = seg.rolling(window).std().dropna()
        sigs.append(pd.Series(roll_mean))
        sigs.append(pd.Series(roll_std))
    for span in [30, 300, 3000]:
        exp_mean = seg.ewm(span).mean().dropna()
        exp_std = seg.ewm(span).std().dropna()
        sigs.append(pd.Series(exp_mean))
        sigs.append(pd.Series(exp_std))
    # --- 3) statistics per sub-signal; keys are suffixed with its index ----
    for i, sig in enumerate(sigs):
        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()
        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        # Mean index of nonzero relative change (not the mean rate itself).
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()
        data_row['std_first_50000_%d' % i] = sig[:50000].std()
        data_row['std_last_50000_%d' % i] = sig[-50000:].std()
        data_row['std_first_10000_%d' % i] = sig[:10000].std()
        data_row['std_last_10000_%d' % i] = sig[-10000:].std()
        data_row['avg_first_50000_%d' % i] = sig[:50000].mean()
        data_row['avg_last_50000_%d' % i] = sig[-50000:].mean()
        data_row['avg_first_10000_%d' % i] = sig[:10000].mean()
        data_row['avg_last_10000_%d' % i] = sig[-10000:].mean()
        data_row['min_first_50000_%d' % i] = sig[:50000].min()
        data_row['min_last_50000_%d' % i] = sig[-50000:].min()
        data_row['min_first_10000_%d' % i] = sig[:10000].min()
        data_row['min_last_10000_%d' % i] = sig[-10000:].min()
        data_row['max_first_50000_%d' % i] = sig[:50000].max()
        data_row['max_last_50000_%d' % i] = sig[-50000:].max()
        data_row['max_first_10000_%d' % i] = sig[:10000].max()
        data_row['max_last_10000_%d' % i] = sig[-10000:].max()
        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()
        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])
        # NOTE(review): 'abd_percentile' key spelling kept as-is — downstream
        # consumers may rely on it.
        for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
            data_row['percentile_p{}_{}'.format(p, i)] = np.percentile(sig, p)
            data_row['abd_percentile_p{}_{}'.format(p, i)] = np.percentile(np.abs(sig), p)
        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()
        # NOTE(review): Series.mad() was removed in pandas 2.x — confirm the
        # pinned pandas version.
        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()
        # data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # Hann-window smoothing (normalized convolution), then STA/LTA ratios
        # at several short/long window pairs.
        data_row['Hann_window50_%d' % i] = (convolve(sig, hann(50), mode='same') / sum(hann(50))).mean()
        data_row['Hann_window500_%d' % i] = (convolve(sig, hann(500), mode='same') / sum(hann(500))).mean()
        data_row['classic_sta_lta0_mean_%d' % i] = classic_sta_lta(sig, 50, 1000).mean()
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()
        # Bollinger-style mean of upper/lower bands at 2 standard deviations.
        no_of_std = 2
        for w in [10, 100, 500]:
            signal_mean = sig.rolling(window=w).mean()
            signal_std = sig.rolling(window=w).std()
            data_row['high_bound_mean_win{}_{}'.format(w, i)] = (signal_mean + no_of_std * signal_std).mean()
            data_row['low_bound_mean_win{}_{}'.format(w, i)] = (signal_mean - no_of_std * signal_std).mean()
        # Counts of samples falling in fixed value ranges.
        data_row['range_inf_4000_%d' % i] = feature_calculators.range_count(sig, -np.inf, -4000)
        data_row['range_4000_inf_%d' % i] = feature_calculators.range_count(sig, 4000, np.inf)
        for l, h in [[-4000, -2000], [-2000, 0], [0, 2000], [2000, 4000]]:
            data_row['range_{}_{}_{}'.format(np.abs(l), np.abs(h), i)] = feature_calculators.range_count(sig, l, h)
        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['iqr1_%d' % i] = np.subtract(*np.percentile(sig, [95, 5]))
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        # NOTE(review): ratio_value_number and ratio_unique_values compute the
        # same quantity twice — kept for key compatibility.
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        # data_row['var_larger_than_std_dev_%d' % i] = feature_calculators.variance_larger_than_standard_deviation(sig)
        data_row['ratio_unique_values_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        data_row['abs_energy_%d' % i] = feature_calculators.abs_energy(sig)
        data_row['abs_sum_of_changes_%d' % i] = feature_calculators.absolute_sum_of_changes(sig)
        data_row['count_above_mean_%d' % i] = feature_calculators.count_above_mean(sig)
        data_row['count_below_mean_%d' % i] = feature_calculators.count_below_mean(sig)
        data_row['mean_abs_change_%d' % i] = feature_calculators.mean_abs_change(sig)
        data_row['mean_change_%d' % i] = feature_calculators.mean_change(sig)
        data_row['first_loc_min_%d' % i] = feature_calculators.first_location_of_minimum(sig)
        data_row['first_loc_max_%d' % i] = feature_calculators.first_location_of_maximum(sig)
        data_row['last_loc_min_%d' % i] = feature_calculators.last_location_of_minimum(sig)
        data_row['last_loc_max_%d' % i] = feature_calculators.last_location_of_maximum(sig)
        data_row['long_strk_above_mean_%d' % i] = feature_calculators.longest_strike_above_mean(sig)
        data_row['long_strk_below_mean_%d' % i] = feature_calculators.longest_strike_below_mean(sig)
        # data_row['cid_ce_0_%d' % i] = feature_calculators.cid_ce(sig, 0)
        # data_row['cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)
        # Parameterized tsfresh features over several support/lag values.
        for j in [10, 50, ]:
            data_row['peak_num_p{}_{}'.format(j, i)] = feature_calculators.number_peaks(sig, j)
        for j in [1, 10, 50, 100]:
            data_row['spkt_welch_density_coeff{}_{}'.format(j, i)] = list(feature_calculators.spkt_welch_density(sig, [{'coeff': j}]))[0][1]
        for j in [5, 10, 100]:
            data_row['c3_c{}_{}'.format(j, i)] = feature_calculators.c3(sig, j)
        for j in [5, 10, 50, 100, 1000]:
            data_row['autocorrelation_auto{}_{}'.format(j, i)] = feature_calculators.autocorrelation(sig, j)
        for j in [10, 100, 1000]:
            data_row['time_rev_asym_stat_t{}_{}'.format(j, i)] = feature_calculators.time_reversal_asymmetry_statistic(
                sig, j)
        for j in range(1, 5):
            data_row['kstat_k{}_{}'.format(j, i)] = stats.kstat(sig, j)
            data_row['moment_m{}_{}'.format(j, i)] = stats.moment(sig, j)
        for j in range(1, 3):
            data_row['kstatvar_k{}_{}'.format(j, i)] = stats.kstatvar(sig, j)
        for j in [5, 10, 50, 100]:
            data_row['binned_entropy_b{}_{}'.format(j, i)] = feature_calculators.binned_entropy(sig, j)
    return data_row
def main():
    """Extract gait-force features from every ``*.txt`` file in the current
    directory and write one row per file into the ``SummaryPatients`` sheet
    of ``Summary.xlsx``.

    Per row: A = file number, B = 1 if the file name contains 'Pt'
    (patient) else 0, C..N = paired left/right statistics, O..R and S..V =
    FFT aggregates (centroid, variance, skew, kurtosis) for left/right.
    """
    dirname = os.path.realpath('.')
    excel_path = os.path.join(dirname, 'Summary.xlsx')
    myworkbook = openpyxl.load_workbook(excel_path)
    worksheet = myworkbook['SummaryPatients']

    file_no = 1  # renamed from `file`; 1-based, row in sheet is file_no + 1
    # os.path.join fixes the original invalid escape sequence "\*.txt".
    for filename in glob.glob(os.path.join(dirname, '*.txt')):
        time = []
        totalForceL = []
        totalForceR = []
        # `with` closes the handle (the original leaked it every iteration).
        with open(filename, 'r') as data:
            for line in data:
                tempForce = line.split()
                time.append(float(tempForce[0]))
                # Columns 17/18 hold the total left/right force —
                # presumably the PhysioNet gait format; confirm with data.
                totalForceL.append(float(tempForce[17]))
                totalForceR.append(float(tempForce[18]))

        row = str(file_no + 1)
        worksheet['A' + row] = file_no
        # 'Pt' in the file name marks a patient recording, otherwise control.
        worksheet['B' + row] = 1 if 'Pt' in filename else 0
        worksheet['C' + row] = tf.abs_energy(totalForceL)
        worksheet['D' + row] = tf.abs_energy(totalForceR)
        worksheet['E' + row] = tf.kurtosis(totalForceL)
        worksheet['F' + row] = tf.kurtosis(totalForceR)
        worksheet['G' + row] = tf.skewness(totalForceL)
        worksheet['H' + row] = tf.skewness(totalForceR)
        worksheet['I' + row] = tf.median(totalForceL)
        worksheet['J' + row] = tf.median(totalForceR)
        worksheet['K' + row] = tf.mean(totalForceL)
        worksheet['L' + row] = tf.mean(totalForceR)
        worksheet['M' + row] = tf.variance(totalForceL)
        worksheet['N' + row] = tf.variance(totalForceR)

        agg_types = [{"aggtype": "centroid"}, {"aggtype": "variance"},
                     {"aggtype": "skew"}, {"aggtype": "kurtosis"}]
        # zip over fixed column letters replaces the original counter that
        # shadowed the builtins `int` and `list`.
        for col, item in zip('OPQR', tf.fft_aggregated(totalForceL, agg_types)):
            worksheet[col + row] = item[1]
        for col, item in zip('STUV', tf.fft_aggregated(totalForceR, agg_types)):
            worksheet[col + row] = item[1]
        file_no += 1
    myworkbook.save(excel_path)
def features(self, x, prefix):
    """Return a {prefix_name: value} dict of several hundred features of *x*.

    Features come from four sources: numpy summary statistics, scipy.stats
    moments/k-statistics, tsfresh feature_calculators, and FFT coefficients.
    All keys are namespaced with *prefix* + '_'. *x* must be a 1-D numeric
    sequence accepted by numpy and tsfresh.
    """
    feature_dict = dict()

    # create features here
    # --- numpy summary statistics -----------------------------------------
    feature_dict[prefix + '_' + 'mean'] = np.mean(x)
    feature_dict[prefix + '_' + 'max'] = np.max(x)
    feature_dict[prefix + '_' + 'min'] = np.min(x)
    feature_dict[prefix + '_' + 'std'] = np.std(x)
    feature_dict[prefix + '_' + 'var'] = np.var(x)
    feature_dict[prefix + '_' + 'ptp'] = np.ptp(x)
    feature_dict[prefix + '_' + 'percentile_10'] = np.percentile(x, 10)
    feature_dict[prefix + '_' + 'percentile_20'] = np.percentile(x, 20)
    feature_dict[prefix + '_' + 'percentile_30'] = np.percentile(x, 30)
    feature_dict[prefix + '_' + 'percentile_40'] = np.percentile(x, 40)
    feature_dict[prefix + '_' + 'percentile_50'] = np.percentile(x, 50)
    feature_dict[prefix + '_' + 'percentile_60'] = np.percentile(x, 60)
    feature_dict[prefix + '_' + 'percentile_70'] = np.percentile(x, 70)
    feature_dict[prefix + '_' + 'percentile_80'] = np.percentile(x, 80)
    feature_dict[prefix + '_' + 'percentile_90'] = np.percentile(x, 90)

    # --- scipy moments and k-statistics -----------------------------------
    feature_dict[prefix + '_' + 'skew'] = sp.stats.skew(x)
    feature_dict[prefix + '_' + 'kurtosis'] = sp.stats.kurtosis(x)
    feature_dict[prefix + '_' + 'kstat_1'] = sp.stats.kstat(x, 1)
    feature_dict[prefix + '_' + 'kstat_2'] = sp.stats.kstat(x, 2)
    feature_dict[prefix + '_' + 'kstat_3'] = sp.stats.kstat(x, 3)
    feature_dict[prefix + '_' + 'kstat_4'] = sp.stats.kstat(x, 4)
    feature_dict[prefix + '_' + 'moment_1'] = sp.stats.moment(x, 1)
    feature_dict[prefix + '_' + 'moment_2'] = sp.stats.moment(x, 2)
    feature_dict[prefix + '_' + 'moment_3'] = sp.stats.moment(x, 3)
    feature_dict[prefix + '_' + 'moment_4'] = sp.stats.moment(x, 4)

    # --- tsfresh feature calculators --------------------------------------
    feature_dict[prefix + '_' + 'abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict[prefix + '_' + 'abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    feature_dict[prefix + '_' + 'count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict[prefix + '_' + 'count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict[prefix + '_' + 'mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict[prefix + '_' + 'mean_change'] = feature_calculators.mean_change(x)
    feature_dict[prefix + '_' + 'var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(x)
    # Counts of samples in fixed 1000-wide value bands over [-inf, inf].
    feature_dict[prefix + '_' + 'range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict[prefix + '_' + 'range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    feature_dict[prefix + '_' + 'range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    feature_dict[prefix + '_' + 'range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    feature_dict[prefix + '_' + 'range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    feature_dict[prefix + '_' + 'range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    feature_dict[prefix + '_' + 'range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    feature_dict[prefix + '_' + 'range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    feature_dict[prefix + '_' + 'range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    feature_dict[prefix + '_' + 'range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    feature_dict[prefix + '_' + 'ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict[prefix + '_' + 'first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict[prefix + '_' + 'first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict[prefix + '_' + 'last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict[prefix + '_' + 'last_loc_max'] = feature_calculators.last_location_of_maximum(x)
    # NOTE(review): time_rev_asym_stat_10/_100 are recomputed again further
    # below with identical keys — the later values overwrite these.
    feature_dict[prefix + '_' + 'time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(x, 10)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(x, 100)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_1000'] = feature_calculators.time_reversal_asymmetry_statistic(x, 1000)
    feature_dict[prefix + '_' + 'autocorrelation_1'] = feature_calculators.autocorrelation(x, 1)
    feature_dict[prefix + '_' + 'autocorrelation_2'] = feature_calculators.autocorrelation(x, 2)
    feature_dict[prefix + '_' + 'autocorrelation_3'] = feature_calculators.autocorrelation(x, 3)
    feature_dict[prefix + '_' + 'autocorrelation_4'] = feature_calculators.autocorrelation(x, 4)
    feature_dict[prefix + '_' + 'autocorrelation_5'] = feature_calculators.autocorrelation(x, 5)
    feature_dict[prefix + '_' + 'autocorrelation_6'] = feature_calculators.autocorrelation(x, 6)
    feature_dict[prefix + '_' + 'autocorrelation_7'] = feature_calculators.autocorrelation(x, 7)
    feature_dict[prefix + '_' + 'autocorrelation_8'] = feature_calculators.autocorrelation(x, 8)
    feature_dict[prefix + '_' + 'autocorrelation_9'] = feature_calculators.autocorrelation(x, 9)
    feature_dict[prefix + '_' + 'autocorrelation_10'] = feature_calculators.autocorrelation(x, 10)
    feature_dict[prefix + '_' + 'autocorrelation_50'] = feature_calculators.autocorrelation(x, 50)
    feature_dict[prefix + '_' + 'autocorrelation_100'] = feature_calculators.autocorrelation(x, 100)
    feature_dict[prefix + '_' + 'autocorrelation_1000'] = feature_calculators.autocorrelation(x, 1000)
    feature_dict[prefix + '_' + 'c3_1'] = feature_calculators.c3(x, 1)
    feature_dict[prefix + '_' + 'c3_2'] = feature_calculators.c3(x, 2)
    feature_dict[prefix + '_' + 'c3_3'] = feature_calculators.c3(x, 3)
    feature_dict[prefix + '_' + 'c3_4'] = feature_calculators.c3(x, 4)
    feature_dict[prefix + '_' + 'c3_5'] = feature_calculators.c3(x, 5)
    feature_dict[prefix + '_' + 'c3_10'] = feature_calculators.c3(x, 10)
    feature_dict[prefix + '_' + 'c3_100'] = feature_calculators.c3(x, 100)
    # Real/imag/angle of FFT coefficients 1..33.
    for c in range(1, 34):
        feature_dict[prefix + '_' + 'fft_{0}_real'.format(c)] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'real'}]))[0][1]
        feature_dict[prefix + '_' + 'fft_{0}_imag'.format(c)] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'imag'}]))[0][1]
        feature_dict[prefix + '_' + 'fft_{0}_ang'.format(c)] = list(feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': 'angle'}]))[0][1]
    feature_dict[prefix + '_' + 'long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    feature_dict[prefix + '_' + 'long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    feature_dict[prefix + '_' + 'cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    feature_dict[prefix + '_' + 'cid_ce_1'] = feature_calculators.cid_ce(x, 1)
    feature_dict[prefix + '_' + 'binned_entropy_5'] = feature_calculators.binned_entropy(x, 5)
    feature_dict[prefix + '_' + 'binned_entropy_10'] = feature_calculators.binned_entropy(x, 10)
    feature_dict[prefix + '_' + 'binned_entropy_20'] = feature_calculators.binned_entropy(x, 20)
    feature_dict[prefix + '_' + 'binned_entropy_50'] = feature_calculators.binned_entropy(x, 50)
    feature_dict[prefix + '_' + 'binned_entropy_80'] = feature_calculators.binned_entropy(x, 80)
    feature_dict[prefix + '_' + 'binned_entropy_100'] = feature_calculators.binned_entropy(x, 100)
    feature_dict[prefix + '_' + 'num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    feature_dict[prefix + '_' + 'num_peaks_1'] = feature_calculators.number_peaks(x, 1)
    feature_dict[prefix + '_' + 'num_peaks_3'] = feature_calculators.number_peaks(x, 3)
    feature_dict[prefix + '_' + 'num_peaks_5'] = feature_calculators.number_peaks(x, 5)
    feature_dict[prefix + '_' + 'num_peaks_10'] = feature_calculators.number_peaks(x, 10)
    feature_dict[prefix + '_' + 'num_peaks_50'] = feature_calculators.number_peaks(x, 50)
    feature_dict[prefix + '_' + 'num_peaks_100'] = feature_calculators.number_peaks(x, 100)
    feature_dict[prefix + '_' + 'num_peaks_500'] = feature_calculators.number_peaks(x, 500)
    feature_dict[prefix + '_' + 'spkt_welch_density_1'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 1}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_2'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 2}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_5'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 5}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_8'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 8}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_10'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 10}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_50'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 50}]))[0][1]
    feature_dict[prefix + '_' + 'spkt_welch_density_100'] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 100}]))[0][1]
    feature_dict[prefix + '_' + 'time_rev_asym_stat_1'] = feature_calculators.time_reversal_asymmetry_statistic(x, 1)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_2'] = feature_calculators.time_reversal_asymmetry_statistic(x, 2)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_3'] = feature_calculators.time_reversal_asymmetry_statistic(x, 3)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_4'] = feature_calculators.time_reversal_asymmetry_statistic(x, 4)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(x, 10)
    feature_dict[prefix + '_' + 'time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(x, 100)
    # Parameter sweeps over r-style tsfresh calculators.
    for r in range(20):
        feature_dict[prefix + '_' + 'symmetry_looking_' + str(r)] = feature_calculators.symmetry_looking(x, [{'r': r * 0.05}])[0][1]
    for r in range(1, 20):
        feature_dict[prefix + '_' + 'large_standard_deviation_' + str(r)] = feature_calculators.large_standard_deviation(x, r * 0.05)
    for r in range(1, 10):
        feature_dict[prefix + '_' + 'quantile_' + str(r)] = feature_calculators.quantile(x, r * 0.1)
    for r in ['mean', 'median', 'var']:
        feature_dict[prefix + '_' + 'agg_autocorr_' + r] = feature_calculators.agg_autocorrelation(x, [{'f_agg': r, 'maxlag': 40}])[0][-1]
    #for r in range(1, 6):
    #    feature_dict[prefix+'_'+'number_cwt_peaks_'+str(r)] = feature_calculators.number_cwt_peaks(x, r)
    for r in range(1, 10):
        feature_dict[prefix + '_' + 'index_mass_quantile_' + str(r)] = feature_calculators.index_mass_quantile(x, [{'q': r}])[0][1]
    #for ql in [0., .2, .4, .6, .8]:
    #    for qh in [.2, .4, .6, .8, 1.]:
    #        if ql < qh:
    #            for b in [False, True]:
    #                for f in ["mean", "var"]:
    #                    feature_dict[prefix+'_'+'change_quantiles_'+str(ql)+'_'+str(qh)+'_'+str(b)+'_'+str(f)] = feature_calculators.change_quantiles(x, ql, qh, b, f)
    #for r in [.1, .3, .5, .7, .9]:
    #    feature_dict[prefix+'_'+'approximate_entropy_'+str(r)] = feature_calculators.approximate_entropy(x, 2, r)
    feature_dict[prefix + '_' + 'max_langevin_fixed_point'] = feature_calculators.max_langevin_fixed_point(x, 3, 30)
    for r in ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr']:
        feature_dict[prefix + '_' + 'linear_trend_' + str(r)] = feature_calculators.linear_trend(x, [{'attr': r}])[0][1]
    for r in ['pvalue', 'teststat', 'usedlag']:
        feature_dict[prefix + '_' + 'augmented_dickey_fuller_' + r] = feature_calculators.augmented_dickey_fuller(x, [{'attr': r}])[0][1]
    for r in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10]:
        feature_dict[prefix + '_' + 'ratio_beyond_r_sigma_' + str(r)] = feature_calculators.ratio_beyond_r_sigma(x, r)
    #for attr in ["pvalue", "rvalue", "intercept", "slope", "stderr"]:
    #    feature_dict[prefix+'_'+'linear_trend_timewise_'+attr] = feature_calculators.linear_trend_timewise(x, [{'attr': attr}])[0][1]
    #for attr in ["rvalue", "intercept", "slope", "stderr"]:
    #    for i in [5, 10, 50]:
    #        for f in ["max", "min", "mean", "var"]:
    #            feature_dict[prefix+'_'+'agg_linear_trend_'+attr+'_'+str(i)+'_'+f] = feature_calculators.agg_linear_trend(x, [{'attr': attr, 'chunk_len': i, 'f_agg': f}])[0][-1]
    #for width in [2, 5, 10, 20]:
    #    for coeff in range(15):
    #        for w in [2, 5, 10, 20]:
    #            feature_dict[prefix+'_'+'cwt_coefficients_'+str(width)+'_'+str(coeff)+'_'+str(w)] = list(feature_calculators.cwt_coefficients(x, [{'widths': width, 'coeff': coeff, 'w': w}]))[0][1]
    #for r in range(10):
    #    feature_dict[prefix+'_'+'partial_autocorr_'+str(r)] = feature_calculators.partial_autocorrelation(x, [{'lag': r}])[0][1]
    # "ar_coefficient": [{"coeff": coeff, "k": k} for coeff in range(5) for k in [10]],
    # "fft_coefficient": [{"coeff": k, "attr": a} for a, k in product(["real", "imag", "abs", "angle"], range(100))],
    # "fft_aggregated": [{"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"]],
    # "value_count": [{"value": value} for value in [0, 1, -1]],
    # "range_count": [{"min": -1, "max": 1}, {"min": 1e12, "max": 0}, {"min": 0, "max": 1e12}],
    # "friedrich_coefficients": (lambda m: [{"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1)])(3),
    # "energy_ratio_by_chunks": [{"num_segments": 10, "segment_focus": i} for i in range(10)],
    return feature_dict
# percentile feats
feature_dict['percentile_10'] = np.percentile(x, 10)
feature_dict['percentile_60'] = np.percentile(x, 60)
feature_dict['percentile_90'] = np.percentile(x, 90)

# quantile feats
# FIX: np.percentie does not exist (AttributeError at runtime) -> np.percentile
feature_dict['quantile_1'] = np.percentile(x, 25)
feature_dict['quantile_2'] = np.percentile(x, 50)
feature_dict['quantile_3'] = np.percentile(x, 75)
feature_dict['quantile_4'] = np.percentile(x, 99)

# ts fresh can be used for time series features when we have a list of features
# in a given period of time
from tsfresh.feature_extraction import feature_calculators as fc

# NOTE(review): key typo 'abs_energey' kept unchanged — downstream code may
# already read this exact key.
feature_dict['abs_energey'] = fc.abs_energy(x)
feature_dict['count_above_mean'] = fc.count_above_mean(x)
feature_dict['count_below_mean'] = fc.count_below_mean(x)
feature_dict['mean_abs_change'] = fc.mean_abs_change(x)
feature_dict['mean_change'] = fc.mean_change(x)

# polynomial features
import pandas as pd
import numpy as np
from sklearn import preprocessing

df = pd.DataFrame(np.random.rand(100, 2), columns=['f1', 'f2'])
pf = preprocessing.PolynomialFeatures(degree=2,
                                      interaction_only=False,
                                      include_bias=False)
pf.fit(df)
def abs_energy(arr):
    """Thin wrapper around tsfresh's absolute-energy calculator for `arr`."""
    result = feature_calculators.abs_energy(arr)
    return result
def transform_pack3(df):
    """Augment X from tsfresh features.

    Computes scalar statistical features (k-statistics, energy, counts,
    fixed-band range histograms, autocorrelations, entropies and Welch
    spectral densities) from the raw values of ``df``.

    :param df: pandas Series/DataFrame carrying one segment of the signal.
    :return: dict mapping feature name -> scalar value.
    """
    x = df.values
    output = {}

    # unbiased cumulant estimators, orders 1-4
    for k in range(1, 5):
        output['kstat_{}'.format(k)] = stats.kstat(x, k)

    output['abs_energy'] = feature_calculators.abs_energy(x)
    output['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    output['count_above_mean'] = feature_calculators.count_above_mean(x)
    output['count_below_mean'] = feature_calculators.count_below_mean(x)

    # sample counts inside fixed amplitude bands
    output['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    output['range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    output['range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    output['range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    output['range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    output['range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    output['range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    output['range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    output['range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    output['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)

    output['ratio_unique_values'] = \
        feature_calculators.ratio_value_number_to_time_series_length(x)
    output['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    output['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    output['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    output['last_loc_max'] = feature_calculators.last_location_of_maximum(x)

    for lag in (10, 100, 1000):
        output['time_rev_asym_stat_{}'.format(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for lag in (10, 100, 1000, 5000):
        output['autocorrelation_{}'.format(lag)] = \
            feature_calculators.autocorrelation(x, lag)
    for lag in (5, 10, 100):
        output['c3_{}'.format(lag)] = feature_calculators.c3(x, lag)

    output['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    output['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    output['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    output['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

    for bins in (10, 50, 80, 100):
        output['binned_entropy_{}'.format(bins)] = \
            feature_calculators.binned_entropy(x, bins)

    # crossing / peak counts are intentionally taken on |x|
    tmp = np.abs(x)
    for m in (0, 10, 100):
        output['num_crossing_{}'.format(m)] = \
            feature_calculators.number_crossing_m(tmp, m)
    for n in (10, 50, 100, 500):
        output['num_peaks_{}'.format(n)] = feature_calculators.number_peaks(tmp, n)

    # spkt_welch_density yields (name, value) pairs; keep the value only
    for coeff in (1, 10, 50, 100):
        output['spkt_welch_density_{}'.format(coeff)] = list(
            feature_calculators.spkt_welch_density(x, [{'coeff': coeff}]))[0][1]

    output['time_rev_asym_stat_1'] = \
        feature_calculators.time_reversal_asymmetry_statistic(x, 1)
    # FIX: the original recomputed time_rev_asym_stat_10 and _100 here with
    # identical arguments, overwriting the earlier entries with the same
    # values; those expensive duplicate calls are removed.

    return output
def features(self, x, y, seg_id, denoise=False):
    """Build a large dictionary of statistical/spectral features for one segment.

    :param x: pandas Series holding the raw signal segment.
    :param y: target value, stored under 'target'.
    :param seg_id: segment identifier, stored under 'seg_id'.
    :param denoise: if True, high-pass filter then wavelet-denoise `x` first.
    :return: dict mapping feature name -> scalar value.
    """
    if denoise:
        # remove low-frequency content, then wavelet-denoise
        x_hp = high_pass_filter(x, low_cutoff=10000, sample_rate=4000000)
        x = denoise_signal(x_hp, wavelet='haar', level=1)

    feature_dict = dict()
    feature_dict['target'] = y
    feature_dict['seg_id'] = seg_id

    # lists with parameters to iterate over them
    percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    hann_windows = [50, 150, 1500, 15000]
    spans = [300, 3000, 30000, 50000]
    windows = [10, 50, 100, 500, 1000, 10000]
    borders = list(range(-4000, 4001, 1000))
    peaks = [10, 20, 50, 100]
    coefs = [1, 5, 10, 50, 100]
    lags = [10, 100, 1000, 10000]
    autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]

    # basic stats
    feature_dict['mean'] = x.mean()
    feature_dict['std'] = x.std()
    feature_dict['max'] = x.max()
    feature_dict['min'] = x.min()

    # basic stats on absolute values
    # NOTE(review): despite the name, this is the mean of diffs (no abs taken)
    feature_dict['mean_change_abs'] = np.mean(np.diff(x))
    feature_dict['abs_max'] = np.abs(x).max()
    feature_dict['abs_mean'] = np.abs(x).mean()
    feature_dict['abs_std'] = np.abs(x).std()

    # geometric and harmonic means of |x| over nonzero samples
    feature_dict['hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
    feature_dict['gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

    # k-statistic and moments
    for i in range(1, 5):
        feature_dict['kstat_{}'.format(i)] = stats.kstat(x, i)
        feature_dict['moment_{}'.format(i)] = stats.moment(x, i)
    for i in [1, 2]:
        feature_dict['kstatvar_{}'.format(i)] = stats.kstatvar(x, i)

    # aggregations on leading/trailing slices of the data
    for agg_type, slice_length, direction in product(
            ['std', 'min', 'max', 'mean'], [1000, 10000, 50000],
            ['first', 'last']):
        if direction == 'first':
            feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = \
                x[:slice_length].agg(agg_type)
        elif direction == 'last':
            feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = \
                x[-slice_length:].agg(agg_type)

    feature_dict['max_to_min'] = x.max() / np.abs(x.min())
    feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
    feature_dict['count_big'] = len(x[np.abs(x) > 500])
    feature_dict['sum'] = x.sum()

    feature_dict['mean_change_rate'] = calc_change_rate(x)
    # calc_change_rate on slices of data
    for slice_length, direction in product([1000, 10000, 50000],
                                           ['first', 'last']):
        if direction == 'first':
            feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = \
                calc_change_rate(x[:slice_length])
        elif direction == 'last':
            feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = \
                calc_change_rate(x[-slice_length:])

    # percentiles on original and absolute values
    for p in percentiles:
        feature_dict['percentile_{}'.format(p)] = np.percentile(x, p)
        feature_dict['abs_percentile_{}'.format(p)] = np.percentile(np.abs(x), p)

    feature_dict['trend'] = add_trend_feature(x)
    feature_dict['abs_trend'] = add_trend_feature(x, abs_values=True)
    feature_dict['mad'] = x.mad()
    feature_dict['kurt'] = x.kurtosis()
    feature_dict['skew'] = x.skew()
    feature_dict['med'] = x.median()

    feature_dict['Hilbert_mean'] = np.abs(hilbert(x)).mean()
    # mean of the signal smoothed by normalized Hann windows of various widths
    for hw in hann_windows:
        feature_dict['Hann_window_mean_{}'.format(hw)] = \
            (convolve(x, hann(hw), mode='same') / sum(hann(hw))).mean()

    # STA/LTA ratios for several (short, long) window pairs
    feature_dict['classic_sta_lta1_mean'] = classic_sta_lta(x, 500, 10000).mean()
    feature_dict['classic_sta_lta2_mean'] = classic_sta_lta(x, 5000, 100000).mean()
    feature_dict['classic_sta_lta3_mean'] = classic_sta_lta(x, 3333, 6666).mean()
    feature_dict['classic_sta_lta4_mean'] = classic_sta_lta(x, 10000, 25000).mean()
    feature_dict['classic_sta_lta5_mean'] = classic_sta_lta(x, 50, 1000).mean()
    feature_dict['classic_sta_lta6_mean'] = classic_sta_lta(x, 100, 5000).mean()
    feature_dict['classic_sta_lta7_mean'] = classic_sta_lta(x, 333, 666).mean()
    feature_dict['classic_sta_lta8_mean'] = classic_sta_lta(x, 4000, 10000).mean()

    # exponential rolling statistics
    ewma = pd.Series.ewm
    for s in spans:
        feature_dict['exp_Moving_average_{}_mean'.format(s)] = \
            (ewma(x, span=s).mean(skipna=True)).mean(skipna=True)
        feature_dict['exp_Moving_average_{}_std'.format(s)] = \
            (ewma(x, span=s).mean(skipna=True)).std(skipna=True)
        feature_dict['exp_Moving_std_{}_mean'.format(s)] = \
            (ewma(x, span=s).std(skipna=True)).mean(skipna=True)
        feature_dict['exp_Moving_std_{}_std'.format(s)] = \
            (ewma(x, span=s).std(skipna=True)).std(skipna=True)

    # NOTE(review): despite the name this is P95 - P5, not the interquartile range
    feature_dict['iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
    feature_dict['ave10'] = stats.trim_mean(x, 0.1)

    for slice_length, threshold in product([50000, 100000, 150000],
                                           [5, 10, 20, 50, 100]):
        feature_dict['count_big_{}_threshold_{}'.format(slice_length, threshold)] = \
            (np.abs(x[-slice_length:]) > threshold).sum()
        feature_dict['count_big_{}_less_threshold_{}'.format(slice_length, threshold)] = \
            (np.abs(x[-slice_length:]) < threshold).sum()

    # tsfresh features
    feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict['abs_sum_of_changes'] = \
        feature_calculators.absolute_sum_of_changes(x)
    feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict['mean_change'] = feature_calculators.mean_change(x)
    feature_dict['var_larger_than_std_dev'] = \
        feature_calculators.variance_larger_than_standard_deviation(x)

    # sample counts inside fixed amplitude bands
    feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
    for i, j in zip(borders, borders[1:]):
        feature_dict['range_{}_{}'.format(i, j)] = \
            feature_calculators.range_count(x, i, j)

    feature_dict['ratio_unique_values'] = \
        feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)

    for lag in lags:
        feature_dict['time_rev_asym_stat_{}'.format(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for autocorr_lag in autocorr_lags:
        feature_dict['autocorrelation_{}'.format(autocorr_lag)] = \
            feature_calculators.autocorrelation(x, autocorr_lag)
        feature_dict['c3_{}'.format(autocorr_lag)] = \
            feature_calculators.c3(x, autocorr_lag)

    for coeff, attr in product([1, 2, 3, 4, 5], ['real', 'imag', 'angle']):
        feature_dict['fft_{}_{}'.format(coeff, attr)] = list(
            feature_calculators.fft_coefficient(x, [{
                'coeff': coeff,
                'attr': attr
            }]))[0][1]

    feature_dict['long_strk_above_mean'] = \
        feature_calculators.longest_strike_above_mean(x)
    feature_dict['long_strk_below_mean'] = \
        feature_calculators.longest_strike_below_mean(x)
    feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

    for p in percentiles:
        feature_dict['binned_entropy_{}'.format(p)] = \
            feature_calculators.binned_entropy(x, p)

    feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)

    # BUG FIX: the original formatted the key with the whole `peaks` list
    # ('num_peaks_[10, 20, 50, 100]'), so all four results collapsed onto a
    # single key; format with the scalar `peak` instead.
    for peak in peaks:
        feature_dict['num_peaks_{}'.format(peak)] = \
            feature_calculators.number_peaks(x, peak)

    for c in coefs:
        feature_dict['spkt_welch_density_{}'.format(c)] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': c
            }]))[0][1]
        # the lags loop above already computed stats for 10 and 100 with the
        # same arguments -- skip those expensive duplicates
        key = 'time_rev_asym_stat_{}'.format(c)
        if key not in feature_dict:
            feature_dict[key] = \
                feature_calculators.time_reversal_asymmetry_statistic(x, c)

    # statistics on rolling windows of various sizes
    for w in windows:
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values

        feature_dict['ave_roll_std_{}'.format(w)] = x_roll_std.mean()
        feature_dict['std_roll_std_{}'.format(w)] = x_roll_std.std()
        feature_dict['max_roll_std_{}'.format(w)] = x_roll_std.max()
        feature_dict['min_roll_std_{}'.format(w)] = x_roll_std.min()
        for p in percentiles:
            feature_dict['percentile_roll_std_{}_window_{}'.format(p, w)] = \
                np.percentile(x_roll_std, p)
        feature_dict['av_change_abs_roll_std_{}'.format(w)] = \
            np.mean(np.diff(x_roll_std))
        feature_dict['av_change_rate_roll_std_{}'.format(w)] = np.mean(
            np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feature_dict['abs_max_roll_std_{}'.format(w)] = np.abs(x_roll_std).max()

        feature_dict['ave_roll_mean_{}'.format(w)] = x_roll_mean.mean()
        feature_dict['std_roll_mean_{}'.format(w)] = x_roll_mean.std()
        feature_dict['max_roll_mean_{}'.format(w)] = x_roll_mean.max()
        feature_dict['min_roll_mean_{}'.format(w)] = x_roll_mean.min()
        for p in percentiles:
            feature_dict['percentile_roll_mean_{}_window_{}'.format(p, w)] = \
                np.percentile(x_roll_mean, p)
        feature_dict['av_change_abs_roll_mean_{}'.format(w)] = \
            np.mean(np.diff(x_roll_mean))
        feature_dict['av_change_rate_roll_mean_{}'.format(w)] = np.mean(
            np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        feature_dict['abs_max_roll_mean_{}'.format(w)] = np.abs(x_roll_mean).max()

    return feature_dict
def log_energy(x):
    """Natural logarithm of the tsfresh absolute energy of `x`."""
    energy = fc.abs_energy(x)
    return np.log(energy)
def get_abs_energy(arr):
    """Absolute energy of `arr` as a 1-element array, NaN/inf replaced by finite values."""
    value = abs_energy(arr)
    return np.nan_to_num(np.array([value]))
def features(self, x, y, seg_id):
    """Build a flat feature dictionary for one signal segment.

    Same feature set and key names as before: numpy summary statistics,
    scipy moments/k-statistics, and a collection of tsfresh
    feature-calculator outputs.
    """
    out = dict()
    out['target'] = y
    out['seg_id'] = seg_id

    # numpy summary statistics
    out['mean'] = np.mean(x)
    out['max'] = np.max(x)
    out['min'] = np.min(x)
    out['std'] = np.std(x)
    out['var'] = np.var(x)
    out['ptp'] = np.ptp(x)
    for q in range(10, 100, 10):
        out['percentile_{}'.format(q)] = np.percentile(x, q)

    # scipy statistics
    out['skew'] = sp.stats.skew(x)
    out['kurtosis'] = sp.stats.kurtosis(x)
    for order in range(1, 5):
        out['kstat_{}'.format(order)] = sp.stats.kstat(x, order)
    for order in range(1, 5):
        out['moment_{}'.format(order)] = sp.stats.moment(x, order)

    # tsfresh scalar features
    out['abs_energy'] = feature_calculators.abs_energy(x)
    out['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    out['count_above_mean'] = feature_calculators.count_above_mean(x)
    out['count_below_mean'] = feature_calculators.count_below_mean(x)
    out['mean_abs_change'] = feature_calculators.mean_abs_change(x)
    out['mean_change'] = feature_calculators.mean_change(x)
    out['var_larger_than_std_dev'] = \
        feature_calculators.variance_larger_than_standard_deviation(x)

    # counts of samples inside fixed amplitude bands
    def band_label(v):
        if v < 0:
            return 'm{}'.format(-v)
        return 'p{}'.format(v) if v > 0 else '0'

    out['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    edges = list(range(-4000, 4001, 1000))
    for lo, hi in zip(edges, edges[1:]):
        out['range_{}_{}'.format(band_label(lo), band_label(hi))] = \
            feature_calculators.range_count(x, lo, hi)
    out['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)

    out['ratio_unique_values'] = \
        feature_calculators.ratio_value_number_to_time_series_length(x)
    out['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    out['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    out['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    out['last_loc_max'] = feature_calculators.last_location_of_maximum(x)

    for lag in (10, 100, 1000):
        out['time_rev_asym_stat_{}'.format(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for lag in (5, 10, 50, 100, 1000):
        out['autocorrelation_{}'.format(lag)] = \
            feature_calculators.autocorrelation(x, lag)
    for lag in (5, 10, 100):
        out['c3_{}'.format(lag)] = feature_calculators.c3(x, lag)

    # first three FFT coefficients: real part, imaginary part, phase angle
    attr_suffix = {'real': 'real', 'imag': 'imag', 'angle': 'ang'}
    for coeff in (1, 2, 3):
        for attr, suffix in attr_suffix.items():
            spec = [{'coeff': coeff, 'attr': attr}]
            out['fft_{}_{}'.format(coeff, suffix)] = \
                list(feature_calculators.fft_coefficient(x, spec))[0][1]

    out['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
    out['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
    out['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    out['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

    for bins in (5, 10, 20, 50, 80, 100):
        out['binned_entropy_{}'.format(bins)] = \
            feature_calculators.binned_entropy(x, bins)

    out['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    for support in (10, 50, 100, 500):
        out['num_peaks_{}'.format(support)] = \
            feature_calculators.number_peaks(x, support)
    for coeff in (1, 10, 50, 100):
        out['spkt_welch_density_{}'.format(coeff)] = \
            list(feature_calculators.spkt_welch_density(x, [{'coeff': coeff}]))[0][1]
    # lags 10 and 100 are recomputed here exactly as the original code did
    for lag in (1, 10, 100):
        out['time_rev_asym_stat_{}'.format(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)

    return out
def main():
    """Load GaPt07_01.txt gait data and extract tsfresh features for both force signals."""
    dirname = os.path.realpath('.')
    # FIX: was a hard-coded Windows '\\' concatenation; os.path.join is portable
    filename = os.path.join(dirname, 'GaPt07_01.txt')

    total_data = {'id': [], 'time': [], 'totalForceL': [], 'totalForceR': []}
    # FIX: original opened the file without ever closing it; `with` releases it
    with open(filename, 'r') as data:
        for line in data:
            fields = line.split()
            total_data['id'].append(1)
            total_data['time'].append(float(fields[0]))
            total_data['totalForceL'].append(float(fields[17]))
            total_data['totalForceR'].append(float(fields[18]))

    # NOTE(review): only needed for the (disabled) extract_features call below
    data_pandas = pd.DataFrame.from_dict(total_data)
    #extract_featuresL = extract_features(data_pandas, column_id="id", column_kind=None, column_value=None)

    extracted_features = {}
    # per-signal scalar features for the left/right total-force columns
    for side in ('L', 'R'):
        signal = total_data['totalForce' + side]
        extracted_features['absEnergy' + side] = tf.abs_energy(signal)
        extracted_features['kurtosis' + side] = tf.kurtosis(signal)
        extracted_features['skewness' + side] = tf.skewness(signal)
        extracted_features['median' + side] = tf.median(signal)
        extracted_features['mean' + side] = tf.mean(signal)
        extracted_features['variance' + side] = tf.variance(signal)

    # aggregated FFT statistics; fft_aggregated yields one item per aggtype.
    # FIX: original shadowed the builtins `int` and `list` (and `id` above)
    fft_params = [{"aggtype": "centroid"}, {"aggtype": "variance"},
                  {"aggtype": "skew"}, {"aggtype": "kurtosis"}]
    fft_names = ('fftCentroid', 'fftVariance', 'fftSkew', 'fftKurtosis')
    for side in ('L', 'R'):
        aggregated = tf.fft_aggregated(total_data['totalForce' + side], fft_params)
        for name, item in zip(fft_names, aggregated):
            extracted_features[name + side] = item