示例#1
0
def transform_pack5(df):
    """0425 more features.

    Compute rolling-window statistics of *df* for three window sizes:
    percentiles of the rolling mean/std plus tsfresh mean-change features.

    :param df: pandas object supporting ``.rolling`` (Series/DataFrame)
    :return: dict mapping feature name -> scalar feature value
    """
    output = {}

    levels = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
    for window in (50, 500, 5000):
        roll_std = df.rolling(window).std().dropna().values
        roll_mean = df.rolling(window).mean().dropna().values

        # Percentiles of the rolling mean and std, interleaved per level.
        for level in levels:
            output[f'percentile_roll_{window}_mean_{level}'] = np.percentile(roll_mean, level)
            output[f'percentile_roll_{window}_std_{level}'] = np.percentile(roll_std, level)

        # Mean (absolute) change of the rolling statistics.
        output[f'mean_abs_change_mean_{window}'] = feature_calculators.mean_abs_change(roll_mean)
        output[f'mean_change_roll_mean_{window}'] = feature_calculators.mean_change(roll_mean)
        output[f'mean_abs_change_std_{window}'] = feature_calculators.mean_abs_change(roll_std)
        output[f'mean_change_roll_std_{window}'] = feature_calculators.mean_change(roll_std)
    return output
示例#2
0
def mean_change(mag):
    """Return the mean over the differences between subsequent observations.

    :param mag: 1-d numeric sequence (e.g. magnitudes)
    :rtype: float
    """
    return ts.mean_change(mag)
def TS_feature3(signal):
    """Return a bundle of simple tsfresh statistics for *signal*.

    :param signal: 1-d numeric sequence
    :return: tuple (max, mean, mean_abs_change, mean_change, median, min)
    """
    return (
        ts.maximum(signal),
        ts.mean(signal),
        ts.mean_abs_change(signal),
        ts.mean_change(signal),
        ts.median(signal),
        ts.minimum(signal),
    )
示例#4
0
def time_series_mean_change(x):
    """Mean of the first differences of the series.

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: the value of this feature
    :return type: float
    """
    result = ts_feature_calculators.mean_change(x)
    return result
示例#5
0
def time_series_mean_change(x):
    """
    Returns the mean over the differences between subsequent time series values which is

    .. math::
        \\frac{1}{n} \\sum_{i=1,\\ldots, n-1}  x_{i+1} - x_{i}

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: the value of this feature
    :return type: float
    """
    return ts_feature_calculators.mean_change(x)
示例#6
0
def create_X(x, last_index=None, n_steps=150, step_length=1000, aug=0):
    """Build a (n_steps, n_features) matrix from the tail of signal *x*.

    Takes the trailing ``n_steps * step_length`` samples ending at
    ``last_index`` (defaults to ``len(x)``), optionally augments them,
    standardizes, and concatenates per-row tsfresh statistics with the
    outputs of ``extract_features``.

    :param x: 1-d numeric array of raw signal
    :param last_index: exclusive end index of the segment (None -> len(x))
    :param n_steps: number of rows the segment is reshaped into
    :param step_length: samples per row
    :param aug: if 1, apply one randomly chosen augmentation
    :return: numpy array of shape (n_steps, n_features)
    """
    if last_index is None:
        last_index = len(x)

    assert last_index - n_steps * step_length >= 0

    # Reshaping
    per = x[(last_index - n_steps * step_length):last_index]

    # Data augmentation: one of integer noise, running mean, or Fourier.
    if aug == 1:
        flag = randint(0, 2)
        if flag == 0:
            s = np.random.normal(0, 1, per.shape[0])
            s = np.matrix.round(s, 0)
            per = per + s
        if flag == 1:
            per = running_mean(per)
        if flag == 2:
            per = fourier(per)

    # Approximate standardization with mean 5 and std 3.
    temp = (per.reshape(n_steps, -1) - 5) / 3

    # Per-row tsfresh statistics. Arrays are sized by n_steps (previously
    # hard-coded 150, which broke for any other n_steps value).
    ac2 = np.zeros(n_steps)
    ac3 = np.zeros(n_steps)
    c3_2 = np.zeros(n_steps)
    c3_3 = np.zeros(n_steps)
    mac = np.zeros(n_steps)
    mc = np.zeros(n_steps)
    for i in range(n_steps):
        ac2[i] = ts.autocorrelation(temp[i, :], 2)
        ac3[i] = ts.autocorrelation(temp[i, :], 3)
        c3_2[i] = ts.c3(temp[i, :], 2) / 500
        c3_3[i] = ts.c3(temp[i, :], 3) / 500
        mac[i] = ts.mean_abs_change(temp[i, :])
        mc[i] = ts.mean_change(temp[i, :])

    return np.c_[extract_features(temp),
                 extract_features(temp[:, 827:]),
                 extract_features(temp[:, 970:]),
                 ac2,
                 ac3,
                 c3_2,
                 c3_3,
                 mac,
                 mc,
                 temp[:, -1:]]
示例#7
0
def extract_features(data):
    """Assemble a fixed-length vector of tsfresh statistics for *data*.

    NaNs in the result are replaced by zero via ``numpy.nan_to_num``.

    :param data: object exposing ``.value`` (numeric series)
    :return: list of scalar feature values
    """
    day = 24 * 60
    values = numpy.array(data.value)

    feats = [
        feature.symmetry_looking(data, [{'r': 0.3}])[0][1],
        feature.variance_larger_than_standard_deviation(data).bool(),
        feature.ratio_beyond_r_sigma(data, 2),
        feature.has_duplicate_max(data),
        feature.has_duplicate_min(data),
        feature.has_duplicate(data),
        feature.agg_autocorrelation(values, [{'f_agg': 'mean', 'maxlag': day}])[0][1],
        feature.partial_autocorrelation(data, [{'lag': day}])[0][1],
        feature.abs_energy(values),
        feature.mean_change(data),
        feature.mean_second_derivative_central(data),
        feature.median(data),
        float(feature.mean(data)),
        float(feature.standard_deviation(data)),
        float(feature.longest_strike_below_mean(data)),
        float(feature.longest_strike_above_mean(data)),
        int(feature.number_peaks(data, 10)),
        feature.linear_trend(values, [{'attr': 'rvalue'}])[0][1],
        feature.c3(data, day),
        float(feature.maximum(data)),
        float(feature.minimum(data)),
    ]
    return list(numpy.nan_to_num(numpy.array(feats)))
示例#8
0
    def extract_feats(self, eda_data, file_id=None):
        """Extract per-fragment EDA features from ``eda_data``.

        Original note: "eda_data is a dict, dict{'data': DataFrame,
        'file_id': str, 'category_id': int}" — but the code below calls
        ``eda_data.to_frame()``, so a pandas Series appears to be what is
        actually expected; confirm against callers.

        :param eda_data: raw EDA signal (see note above)
        :param file_id: file name whose stem encodes the label (train/val)
        :return: dict mapping feature name -> numpy array of per-fragment values
        """
        data = eda_data.to_frame()
        # file_id = eda_data['file_id']
        file_name = osp.splitext(file_id)[0]
        if self.phase != 'test':
            try:
                # Label is the second-to-last '_'-separated token of the stem.
                file_label = int(file_name.split('_')[-2])
            except IndexError:
                # NOTE(review): if parsing fails, file_label stays unbound and
                # the reads further below would raise NameError — confirm that
                # callers guarantee well-formed names.
                print('{}文件命名有误!'.format(file_id))

        data = down_sample(data, interval=cfg.EDA.DOWN_SAMPLE)
        look_back = cfg.EDA.LOOK_BACK
        features_dict = {
            'mean': [],
            'maximum': [],
            'minimum': [],
            'max-min': []
        }

        im_index = 0

        data_fragments = []
        color_list = []

        learn_max_min = []

        # Number of sliding windows (stride is 90% of the window length).
        num = len(range(0, len(data) - look_back, int(look_back * 0.9)))

        for i in range(0, len(data) - look_back, int(look_back * 0.9)):
            a = data[i:(i + look_back)]
            data_fragments.append(a)
            # After a transpose, one dimension may collapse to shape 1
            # (e.g. a row becomes a column), so squeeze the extra dimension;
            # otherwise slicing such as x[np.argmax(x)] can go out of bounds.
            # x = a.values.transpose((1, 0))    # same as .T
            x = a.values.T
            x = np.squeeze(x)
            feature_dict = {}

            if cfg.EDA.FEATS_MEAN:
                mean = ts_feature.mean(x)
                # mean_change only works row-wise: internally it applies
                # np.diff without an axis argument, so for a single column
                # np.diff would be empty and the result NaN.
                mean_change = ts_feature.mean_change(x)
                feature_dict['mean'] = float('%.6f' % mean)
                feature_dict['mean_change'] = float('%.6f' % mean_change)
                features_dict['mean'].append(mean)

            if cfg.EDA.FEATS_MAX_MIN:
                # maximum = ts_feature.maximum(x)
                # minimum = ts_feature.minimum(x)
                max_index = np.argmax(x)
                maximum = x[max_index]
                min_index = np.argmin(x)
                minimum = x[min_index]
                feature_dict['maximum'] = float('%.6f' % maximum)
                feature_dict['minimum'] = float('%.6f' % minimum)
                features_dict['maximum'].append(maximum)
                features_dict['minimum'].append(minimum)
                features_dict['max-min'].append(maximum - minimum)
                # Color codes: 1 = rise (max after min), 2 = fall, 0 = flat.
                if maximum - minimum > 1:
                    if max_index > min_index:
                        color_list.append(1)
                    else:
                        color_list.append(2)
                else:
                    color_list.append(0)

            if cfg.EDA.FEATS_LEARN and im_index < num // 2:
                max_index = np.argmax(x)
                maximum = x[max_index]
                min_index = np.argmin(x)
                minimum = x[min_index]
                learns = []
                # Sweep max-min thresholds from 2.0 down to 0.1 in 0.1 steps.
                for max_min_threshold in np.arange(2., 0., -0.1):
                    if maximum - minimum > max_min_threshold and max_index > min_index:
                        learns.append(1)
                    else:
                        learns.append(0)
                learn_max_min.append(learns)

            if cfg.DRAW.EDA_FRAGMENTS:
                save = osp.join(
                    cfg.DRAW.PATH,
                    'eda/' + file_name + '/' + str(im_index) + '.png')
                draw_dataframe(a,
                               save, ('EDA', ),
                               im_size=(20, 10),
                               show_value=feature_dict)

            im_index += 1

        if cfg.EDA.FEATS_LEARN:
            learn_max_min = np.array(learn_max_min)
            learn_max_min_sum = np.sum(learn_max_min, axis=0)
            position = np.where(learn_max_min_sum > 0)[0]  # np.where returns a tuple
            if self.phase != 'test':
                if file_label == 0:
                    if len(position) > 0:
                        features_dict['unfear_TH'] = 0.1 * (20 - position[0] +
                                                            1)
                    else:
                        features_dict['unfear_TH'] = 0.1
                    features_dict['fear_TH'] = features_dict['unfear_TH'] + 0.2
                else:
                    if len(position) > 0:
                        features_dict['fear_TH'] = 0.1 * (20 - position[0] - 1)
                    else:
                        features_dict['fear_TH'] = 0.3
                    features_dict['unfear_TH'] = features_dict['fear_TH'] - 0.2
                    features_dict['unfear_TH'] = 0.1 \
                        if features_dict['unfear_TH'] < 0.1 \
                        else features_dict['unfear_TH']
                print('{}: {}, fear_TH: {}, unfear_TH: {}'.format(
                    file_id, learn_max_min_sum, features_dict['fear_TH'],
                    features_dict['unfear_TH']))

        if cfg.EDA.STAT_FILES_MEAN:
            mean = ts_feature.mean(data.values)
            if self.phase != 'test':
                if file_label == 1:
                    self.stat_files_mean['fear'].append(mean)
                elif file_label == 0:
                    self.stat_files_mean['nomal'].append(mean)

        if cfg.DRAW.EDA_FEATURES:
            features = pd.DataFrame(features_dict)
            save = osp.join(cfg.DRAW.PATH,
                            'eda/' + file_name + '/' + str(im_index) + '.png')
            draw_dataframe(features,
                           save, ('mean', 'maximum', 'minimum', 'max-min'),
                           im_size=(40, 10))

        if cfg.DRAW.EDA_WHOLE:
            if cfg.EDA.FEATS_MAX_MIN:  # color_list is computed from this feature
                # Annotate the fragments with different colors.
                save = osp.join(
                    cfg.DRAW.PATH,
                    'eda/' + file_name + '/' + str(im_index) + '.png')
                draw_dataframe_list(data_fragments,
                                    color_list,
                                    save, ('EDA', ),
                                    im_size=(im_index, 10),
                                    tick_spacing=100)
            save = osp.join(cfg.DRAW.PATH,
                            'eda/' + file_name + '/' + str(im_index) + '.png')
            draw_dataframe(data, save, ('EDA', ), im_size=(im_index, 10))

        for feat in features_dict:
            features_dict[feat] = np.array(features_dict[feat])
        return features_dict
示例#9
0
 def mean_diff(x: np.ndarray):
     """Mean change between consecutive values.

     :param x: a 1-d numeric vector
     :return: scalar feature
     """
     result = fc.mean_change(x)
     return result
def create_X(x,
             last_index=None,
             n_steps=N_STEPS,
             step_length=STEP_LENGTH,
             aug=0):
    """Build a (n_steps, n_features) matrix for one segment of signal *x*.

    Takes the trailing ``n_steps * step_length`` samples ending at
    ``last_index`` (defaults to ``len(x)``), optionally augments them,
    standardizes, and concatenates tsfresh/rolling/FFT features with the
    last raw column of each row.

    :param x: 1-d numeric array of raw signal
    :param last_index: exclusive end index of the segment (None -> len(x))
    :param n_steps: number of rows the segment is reshaped into
    :param step_length: samples per row
    :param aug: if 1, apply one randomly chosen augmentation
    :return: numpy array of shape (n_steps, n_features)
    """
    if last_index is None:
        last_index = len(x)

    assert last_index - n_steps * step_length >= 0

    # Reshaping and approximate standardization with mean 5 and std 3.
    per = x[(last_index - n_steps * step_length):last_index]

    # Data augmentation: one of integer noise, running mean,
    # Savitzky-Golay smoothing, or wavelet low-pass filtering.
    if aug == 1:
        flag = randint(0, 3)
        if flag == 0:
            s = np.random.normal(0, 1, per.shape[0])
            s = np.matrix.round(s, 0)
            per = per + s
        if flag == 1:
            per = running_mean(per)
        if flag == 2:
            per = savgol_filter(per, 5, polyorder=3)
            per = np.matrix.round(per, 0)
        if flag == 3:
            per = lowpassfilter(per, thresh=0.01, wavelet="db4")
            per = np.matrix.round(per, 0)

    temp = (per.reshape(n_steps, -1) - 5) / 3

    # Extracts features of sequences of full length 1000, of the last 100 values and finally also
    # of the last 10 observations.
    q05_roll_std_10 = np.zeros(n_steps)
    R_std = np.zeros(n_steps)
    av_change_abs_roll_mean_10 = np.zeros(n_steps)
    Imean = np.zeros(n_steps)

    mac = np.zeros(n_steps)
    mc = np.zeros(n_steps)

    for i in range(n_steps):
        # 5% quantile of the 10-sample rolling standard deviation.
        x_roll_std = rolling_window(temp[i, :], 10).std(axis=1)
        q05_roll_std_10[i] = np.quantile(x_roll_std, 0.05)

        # Std of the FFT's real part and mean of its imaginary part.
        zc = np.fft.fft(temp[i, :])
        realFFT = np.real(zc)
        R_std[i] = realFFT.std()
        imagFFT = np.imag(zc)
        Imean[i] = imagFFT.mean()

        # Average change of the 10-sample rolling mean.
        x_roll_mean = rolling_window(temp[i, :], 10).mean(axis=1)
        av_change_abs_roll_mean_10[i] = np.mean(np.diff(x_roll_mean))

        mac[i] = ts.mean_abs_change(temp[i, :])
        mc[i] = ts.mean_change(temp[i, :])

    return np.c_[extract_features(temp),
                 extract_features(temp[:, (step_length -
                                           int(round(step_length / DEL))):]),
                 extract_features(temp[:, (step_length -
                                           int(round(step_length /
                                                     (DEL * DEL)))):]),
                 q05_roll_std_10, R_std, av_change_abs_roll_mean_10, Imean,
                 mac, mc, temp[:, -1:]]
示例#11
0
    def features(self, x, prefix):
        """Build a dictionary of features for the 1-d signal ``x``.

        Every key is prefixed with ``prefix + '_'``. Feature groups:
        numpy summary statistics and percentiles, scipy moments and
        k-statistics, and a large battery of tsfresh
        ``feature_calculators`` (energies, change counts, range counts,
        autocorrelations, c3, FFT coefficients, entropies, peaks,
        spectral densities, trends, etc.).

        :param x: 1-d numeric sequence (signal segment)
        :param prefix: string prepended to every feature name
        :return: dict mapping feature name -> scalar feature value
        """
        feature_dict = dict()

        # create features here
        # numpy
        feature_dict[prefix + '_' + 'mean'] = np.mean(x)
        feature_dict[prefix + '_' + 'max'] = np.max(x)
        feature_dict[prefix + '_' + 'min'] = np.min(x)
        feature_dict[prefix + '_' + 'std'] = np.std(x)
        feature_dict[prefix + '_' + 'var'] = np.var(x)
        feature_dict[prefix + '_' + 'ptp'] = np.ptp(x)
        feature_dict[prefix + '_' + 'percentile_10'] = np.percentile(x, 10)
        feature_dict[prefix + '_' + 'percentile_20'] = np.percentile(x, 20)
        feature_dict[prefix + '_' + 'percentile_30'] = np.percentile(x, 30)
        feature_dict[prefix + '_' + 'percentile_40'] = np.percentile(x, 40)
        feature_dict[prefix + '_' + 'percentile_50'] = np.percentile(x, 50)
        feature_dict[prefix + '_' + 'percentile_60'] = np.percentile(x, 60)
        feature_dict[prefix + '_' + 'percentile_70'] = np.percentile(x, 70)
        feature_dict[prefix + '_' + 'percentile_80'] = np.percentile(x, 80)
        feature_dict[prefix + '_' + 'percentile_90'] = np.percentile(x, 90)

        # scipy
        feature_dict[prefix + '_' + 'skew'] = sp.stats.skew(x)
        feature_dict[prefix + '_' + 'kurtosis'] = sp.stats.kurtosis(x)
        feature_dict[prefix + '_' + 'kstat_1'] = sp.stats.kstat(x, 1)
        feature_dict[prefix + '_' + 'kstat_2'] = sp.stats.kstat(x, 2)
        feature_dict[prefix + '_' + 'kstat_3'] = sp.stats.kstat(x, 3)
        feature_dict[prefix + '_' + 'kstat_4'] = sp.stats.kstat(x, 4)
        feature_dict[prefix + '_' + 'moment_1'] = sp.stats.moment(x, 1)
        feature_dict[prefix + '_' + 'moment_2'] = sp.stats.moment(x, 2)
        feature_dict[prefix + '_' + 'moment_3'] = sp.stats.moment(x, 3)
        feature_dict[prefix + '_' + 'moment_4'] = sp.stats.moment(x, 4)

        # tsfresh
        feature_dict[prefix + '_' +
                     'abs_energy'] = feature_calculators.abs_energy(x)
        feature_dict[
            prefix + '_' +
            'abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(
                x)
        feature_dict[
            prefix + '_' +
            'count_above_mean'] = feature_calculators.count_above_mean(x)
        feature_dict[
            prefix + '_' +
            'count_below_mean'] = feature_calculators.count_below_mean(x)
        feature_dict[prefix + '_' +
                     'mean_abs_change'] = feature_calculators.mean_abs_change(
                         x)
        feature_dict[prefix + '_' +
                     'mean_change'] = feature_calculators.mean_change(x)
        feature_dict[
            prefix + '_' +
            'var_larger_than_std_dev'] = feature_calculators.variance_larger_than_standard_deviation(
                x)
        # Histogram-style counts of samples falling in fixed amplitude bands.
        feature_dict[prefix + '_' +
                     'range_minf_m4000'] = feature_calculators.range_count(
                         x, -np.inf, -4000)
        feature_dict[prefix + '_' +
                     'range_m4000_m3000'] = feature_calculators.range_count(
                         x, -4000, -3000)
        feature_dict[prefix + '_' +
                     'range_m3000_m2000'] = feature_calculators.range_count(
                         x, -3000, -2000)
        feature_dict[prefix + '_' +
                     'range_m2000_m1000'] = feature_calculators.range_count(
                         x, -2000, -1000)
        feature_dict[prefix + '_' +
                     'range_m1000_0'] = feature_calculators.range_count(
                         x, -1000, 0)
        feature_dict[prefix + '_' +
                     'range_0_p1000'] = feature_calculators.range_count(
                         x, 0, 1000)
        feature_dict[prefix + '_' +
                     'range_p1000_p2000'] = feature_calculators.range_count(
                         x, 1000, 2000)
        feature_dict[prefix + '_' +
                     'range_p2000_p3000'] = feature_calculators.range_count(
                         x, 2000, 3000)
        feature_dict[prefix + '_' +
                     'range_p3000_p4000'] = feature_calculators.range_count(
                         x, 3000, 4000)
        feature_dict[prefix + '_' +
                     'range_p4000_pinf'] = feature_calculators.range_count(
                         x, 4000, np.inf)

        feature_dict[
            prefix + '_' +
            'ratio_unique_values'] = feature_calculators.ratio_value_number_to_time_series_length(
                x)
        feature_dict[
            prefix + '_' +
            'first_loc_min'] = feature_calculators.first_location_of_minimum(x)
        feature_dict[
            prefix + '_' +
            'first_loc_max'] = feature_calculators.first_location_of_maximum(x)
        feature_dict[
            prefix + '_' +
            'last_loc_min'] = feature_calculators.last_location_of_minimum(x)
        feature_dict[
            prefix + '_' +
            'last_loc_max'] = feature_calculators.last_location_of_maximum(x)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 10)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 100)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_1000'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 1000)
        feature_dict[
            prefix + '_' +
            'autocorrelation_1'] = feature_calculators.autocorrelation(x, 1)
        feature_dict[
            prefix + '_' +
            'autocorrelation_2'] = feature_calculators.autocorrelation(x, 2)
        feature_dict[
            prefix + '_' +
            'autocorrelation_3'] = feature_calculators.autocorrelation(x, 3)
        feature_dict[
            prefix + '_' +
            'autocorrelation_4'] = feature_calculators.autocorrelation(x, 4)
        feature_dict[
            prefix + '_' +
            'autocorrelation_5'] = feature_calculators.autocorrelation(x, 5)
        feature_dict[
            prefix + '_' +
            'autocorrelation_6'] = feature_calculators.autocorrelation(x, 6)
        feature_dict[
            prefix + '_' +
            'autocorrelation_7'] = feature_calculators.autocorrelation(x, 7)
        feature_dict[
            prefix + '_' +
            'autocorrelation_8'] = feature_calculators.autocorrelation(x, 8)
        feature_dict[
            prefix + '_' +
            'autocorrelation_9'] = feature_calculators.autocorrelation(x, 9)
        feature_dict[
            prefix + '_' +
            'autocorrelation_10'] = feature_calculators.autocorrelation(x, 10)
        feature_dict[
            prefix + '_' +
            'autocorrelation_50'] = feature_calculators.autocorrelation(x, 50)
        feature_dict[
            prefix + '_' +
            'autocorrelation_100'] = feature_calculators.autocorrelation(
                x, 100)
        feature_dict[
            prefix + '_' +
            'autocorrelation_1000'] = feature_calculators.autocorrelation(
                x, 1000)
        feature_dict[prefix + '_' + 'c3_1'] = feature_calculators.c3(x, 1)
        feature_dict[prefix + '_' + 'c3_2'] = feature_calculators.c3(x, 2)
        feature_dict[prefix + '_' + 'c3_3'] = feature_calculators.c3(x, 3)
        feature_dict[prefix + '_' + 'c3_4'] = feature_calculators.c3(x, 4)
        feature_dict[prefix + '_' + 'c3_5'] = feature_calculators.c3(x, 5)
        feature_dict[prefix + '_' + 'c3_10'] = feature_calculators.c3(x, 10)
        feature_dict[prefix + '_' + 'c3_100'] = feature_calculators.c3(x, 100)
        # Real, imaginary and angle components of FFT coefficients 1..33.
        for c in range(1, 34):
            feature_dict[prefix + '_' + 'fft_{0}_real'.format(c)] = list(
                feature_calculators.fft_coefficient(x, [{
                    'coeff': c,
                    'attr': 'real'
                }]))[0][1]
            feature_dict[prefix + '_' + 'fft_{0}_imag'.format(c)] = list(
                feature_calculators.fft_coefficient(x, [{
                    'coeff': c,
                    'attr': 'imag'
                }]))[0][1]
            feature_dict[prefix + '_' + 'fft_{0}_ang'.format(c)] = list(
                feature_calculators.fft_coefficient(x, [{
                    'coeff': c,
                    'attr': 'angle'
                }]))[0][1]
        feature_dict[
            prefix + '_' +
            'long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(
                x)
        feature_dict[
            prefix + '_' +
            'long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(
                x)
        feature_dict[prefix + '_' + 'cid_ce_0'] = feature_calculators.cid_ce(
            x, 0)
        feature_dict[prefix + '_' + 'cid_ce_1'] = feature_calculators.cid_ce(
            x, 1)
        feature_dict[prefix + '_' +
                     'binned_entropy_5'] = feature_calculators.binned_entropy(
                         x, 5)
        feature_dict[prefix + '_' +
                     'binned_entropy_10'] = feature_calculators.binned_entropy(
                         x, 10)
        feature_dict[prefix + '_' +
                     'binned_entropy_20'] = feature_calculators.binned_entropy(
                         x, 20)
        feature_dict[prefix + '_' +
                     'binned_entropy_50'] = feature_calculators.binned_entropy(
                         x, 50)
        feature_dict[prefix + '_' +
                     'binned_entropy_80'] = feature_calculators.binned_entropy(
                         x, 80)
        feature_dict[
            prefix + '_' +
            'binned_entropy_100'] = feature_calculators.binned_entropy(x, 100)

        feature_dict[prefix + '_' +
                     'num_crossing_0'] = feature_calculators.number_crossing_m(
                         x, 0)
        feature_dict[prefix + '_' +
                     'num_peaks_1'] = feature_calculators.number_peaks(x, 1)
        feature_dict[prefix + '_' +
                     'num_peaks_3'] = feature_calculators.number_peaks(x, 3)
        feature_dict[prefix + '_' +
                     'num_peaks_5'] = feature_calculators.number_peaks(x, 5)
        feature_dict[prefix + '_' +
                     'num_peaks_10'] = feature_calculators.number_peaks(x, 10)
        feature_dict[prefix + '_' +
                     'num_peaks_50'] = feature_calculators.number_peaks(x, 50)
        feature_dict[prefix + '_' +
                     'num_peaks_100'] = feature_calculators.number_peaks(
                         x, 100)
        feature_dict[prefix + '_' +
                     'num_peaks_500'] = feature_calculators.number_peaks(
                         x, 500)

        feature_dict[prefix + '_' + 'spkt_welch_density_1'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 1
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_2'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 2
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_5'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 5
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_8'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 8
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_10'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 10
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_50'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 50
            }]))[0][1]
        feature_dict[prefix + '_' + 'spkt_welch_density_100'] = list(
            feature_calculators.spkt_welch_density(x, [{
                'coeff': 100
            }]))[0][1]

        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_1'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 1)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_2'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 2)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_3'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 3)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_4'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 4)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_10'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 10)
        feature_dict[
            prefix + '_' +
            'time_rev_asym_stat_100'] = feature_calculators.time_reversal_asymmetry_statistic(
                x, 100)

        for r in range(20):
            feature_dict[prefix + '_' + 'symmetry_looking_' +
                         str(r)] = feature_calculators.symmetry_looking(
                             x, [{
                                 'r': r * 0.05
                             }])[0][1]

        for r in range(1, 20):
            feature_dict[
                prefix + '_' + 'large_standard_deviation_' +
                str(r)] = feature_calculators.large_standard_deviation(
                    x, r * 0.05)

        for r in range(1, 10):
            feature_dict[prefix + '_' + 'quantile_' +
                         str(r)] = feature_calculators.quantile(x, r * 0.1)

        for r in ['mean', 'median', 'var']:
            feature_dict[prefix + '_' + 'agg_autocorr_' +
                         r] = feature_calculators.agg_autocorrelation(
                             x, [{
                                 'f_agg': r,
                                 'maxlag': 40
                             }])[0][-1]

        #for r in range(1, 6):
        #    feature_dict[prefix+'_'+'number_cwt_peaks_'+str(r)] = feature_calculators.number_cwt_peaks(x, r)

        # NOTE(review): q is passed as integers 1..9 here, but tsfresh's
        # index_mass_quantile expects a quantile in (0, 1] — possibly
        # r * 0.1 was intended (as in the quantile loop above); confirm
        # before changing, as it would alter feature values.
        for r in range(1, 10):
            feature_dict[prefix + '_' + 'index_mass_quantile_' +
                         str(r)] = feature_calculators.index_mass_quantile(
                             x, [{
                                 'q': r
                             }])[0][1]

        #for ql in [0., .2, .4, .6, .8]:
        #    for qh in [.2, .4, .6, .8, 1.]:
        #        if ql < qh:
        #            for b in [False, True]:
        #                for f in ["mean", "var"]:
        #                    feature_dict[prefix+'_'+'change_quantiles_'+str(ql)+'_'+str(qh)+'_'+str(b)+'_'+str(f)] = feature_calculators.change_quantiles(x, ql, qh, b, f)

        #for r in [.1, .3, .5, .7, .9]:
        #    feature_dict[prefix+'_'+'approximate_entropy_'+str(r)] = feature_calculators.approximate_entropy(x, 2, r)

        feature_dict[
            prefix + '_' +
            'max_langevin_fixed_point'] = feature_calculators.max_langevin_fixed_point(
                x, 3, 30)

        for r in ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr']:
            feature_dict[prefix + '_' + 'linear_trend_' +
                         str(r)] = feature_calculators.linear_trend(
                             x, [{
                                 'attr': r
                             }])[0][1]

        for r in ['pvalue', 'teststat', 'usedlag']:
            feature_dict[prefix + '_' + 'augmented_dickey_fuller_' +
                         r] = feature_calculators.augmented_dickey_fuller(
                             x, [{
                                 'attr': r
                             }])[0][1]

        for r in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10]:
            feature_dict[prefix + '_' + 'ratio_beyond_r_sigma_' +
                         str(r)] = feature_calculators.ratio_beyond_r_sigma(
                             x, r)

        #for attr in ["pvalue", "rvalue", "intercept", "slope", "stderr"]:
        #    feature_dict[prefix+'_'+'linear_trend_timewise_'+attr] = feature_calculators.linear_trend_timewise(x, [{'attr': attr}])[0][1]
        #for attr in ["rvalue", "intercept", "slope", "stderr"]:
        #    for i in [5, 10, 50]:
        #        for f in ["max", "min", "mean", "var"]:
        #            feature_dict[prefix+'_'+'agg_linear_trend_'+attr+'_'+str(i)+'_'+f] = feature_calculators.agg_linear_trend(x, [{'attr': attr, 'chunk_len': i, 'f_agg': f}])[0][-1]
        #for width in [2, 5, 10, 20]:
        #    for coeff in range(15):
        #        for w in [2, 5, 10, 20]:
        #            feature_dict[prefix+'_'+'cwt_coefficients_'+str(width)+'_'+str(coeff)+'_'+str(w)] = list(feature_calculators.cwt_coefficients(x, [{'widths': width, 'coeff': coeff, 'w': w}]))[0][1]
        #for r in range(10):
        #    feature_dict[prefix+'_'+'partial_autocorr_'+str(r)] = feature_calculators.partial_autocorrelation(x, [{'lag': r}])[0][1]
        # "ar_coefficient": [{"coeff": coeff, "k": k} for coeff in range(5) for k in [10]],
        # "fft_coefficient": [{"coeff": k, "attr": a} for a, k in product(["real", "imag", "abs", "angle"], range(100))],
        # "fft_aggregated": [{"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"]],
        # "value_count": [{"value": value} for value in [0, 1, -1]],
        # "range_count": [{"min": -1, "max": 1}, {"min": 1e12, "max": 0}, {"min": 0, "max": 1e12}],
        # "friedrich_coefficients": (lambda m: [{"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1)])(3),
        #  "energy_ratio_by_chunks": [{"num_segments": 10, "segment_focus": i} for i in range(10)],
        return feature_dict
def get_mean_change(arr):
    """Return the mean-change statistic of *arr* as a 1-element ndarray.

    A NaN result (e.g. from an input that is too short to diff) is
    coerced to 0 so the returned feature is always finite.
    """
    return np.nan_to_num(np.array([mean_change(arr)]))
示例#13
0
    def features(self, x, y, seg_id):
        """Compute a flat dict of statistical and tsfresh features for one segment.

        Parameters
        ----------
        x : array-like
            Raw signal values of the segment.
        y : scalar
            Target value, stored under 'target'.
        seg_id :
            Segment identifier, stored under 'seg_id'.

        Returns
        -------
        dict
            Mapping of feature name -> scalar value (insertion order matches
            the original hand-unrolled implementation).
        """
        feature_dict = dict()
        feature_dict['target'] = y
        feature_dict['seg_id'] = seg_id

        # numpy summary statistics
        feature_dict['mean'] = np.mean(x)
        feature_dict['max'] = np.max(x)
        feature_dict['min'] = np.min(x)
        feature_dict['std'] = np.std(x)
        feature_dict['var'] = np.var(x)
        feature_dict['ptp'] = np.ptp(x)
        for p in range(10, 100, 10):
            feature_dict[f'percentile_{p}'] = np.percentile(x, p)

        # scipy statistics
        feature_dict['skew'] = sp.stats.skew(x)
        feature_dict['kurtosis'] = sp.stats.kurtosis(x)
        for i in range(1, 5):
            feature_dict[f'kstat_{i}'] = sp.stats.kstat(x, i)
        for i in range(1, 5):
            feature_dict[f'moment_{i}'] = sp.stats.moment(x, i)

        # tsfresh feature calculators
        feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
        feature_dict['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
        feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
        feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
        feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
        feature_dict['mean_change'] = feature_calculators.mean_change(x)
        feature_dict['var_larger_than_std_dev'] = \
            feature_calculators.variance_larger_than_standard_deviation(x)

        def _edge(v):
            # Key fragment for a range border: minf/pinf for +/-inf,
            # mNNN for negatives, pNNN for positives, '0' for zero —
            # reproduces the original hand-written key names exactly.
            if np.isinf(v):
                return 'minf' if v < 0 else 'pinf'
            if v < 0:
                return 'm{}'.format(-v)
            return 'p{}'.format(v) if v > 0 else '0'

        range_borders = [-np.inf, -4000, -3000, -2000, -1000, 0,
                         1000, 2000, 3000, 4000, np.inf]
        for lo, hi in zip(range_borders, range_borders[1:]):
            feature_dict['range_{}_{}'.format(_edge(lo), _edge(hi))] = \
                feature_calculators.range_count(x, lo, hi)

        feature_dict['ratio_unique_values'] = \
            feature_calculators.ratio_value_number_to_time_series_length(x)
        feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
        feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
        feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
        feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)

        for lag in [10, 100, 1000]:
            feature_dict[f'time_rev_asym_stat_{lag}'] = \
                feature_calculators.time_reversal_asymmetry_statistic(x, lag)
        for lag in [5, 10, 50, 100, 1000]:
            feature_dict[f'autocorrelation_{lag}'] = feature_calculators.autocorrelation(x, lag)
        for lag in [5, 10, 100]:
            feature_dict[f'c3_{lag}'] = feature_calculators.c3(x, lag)

        for coeff in [1, 2, 3]:
            for attr in ['real', 'imag', 'angle']:
                # original key names abbreviate 'angle' to 'ang'
                key = f"fft_{coeff}_{'ang' if attr == 'angle' else attr}"
                feature_dict[key] = list(feature_calculators.fft_coefficient(
                    x, [{'coeff': coeff, 'attr': attr}]))[0][1]

        feature_dict['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
        feature_dict['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
        feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
        feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

        for nbins in [5, 10, 20, 50, 80, 100]:
            feature_dict[f'binned_entropy_{nbins}'] = feature_calculators.binned_entropy(x, nbins)

        feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
        for n in [10, 50, 100, 500]:
            feature_dict[f'num_peaks_{n}'] = feature_calculators.number_peaks(x, n)

        for c in [1, 10, 50, 100]:
            feature_dict[f'spkt_welch_density_{c}'] = list(
                feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]

        # The original recomputed time_rev_asym_stat_10/_100 here with
        # identical values — that redundant work is dropped; only the new
        # lag=1 statistic is added. Keys and ordering are unchanged.
        feature_dict['time_rev_asym_stat_1'] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, 1)

        return feature_dict
示例#14
0
    def features(self, x, y, seg_id, denoise=False):
        """Compute a large dict of statistical features for one signal segment.

        Parameters
        ----------
        x : pd.Series
            Raw signal of the segment (pandas API used: .agg, .mad, .rolling).
        y : scalar
            Target value, stored under 'target'.
        seg_id :
            Segment identifier, stored under 'seg_id'.
        denoise : bool, optional
            If True, high-pass filter then wavelet-denoise the signal first.

        Returns
        -------
        dict
            Mapping of feature name -> scalar value.
        """
        if denoise:
            # remove low-frequency drift, then wavelet-denoise
            x_hp = high_pass_filter(x, low_cutoff=10000, sample_rate=4000000)
            x = denoise_signal(x_hp, wavelet='haar', level=1)

        feature_dict = dict()
        feature_dict['target'] = y
        feature_dict['seg_id'] = seg_id

        # parameter grids iterated below
        percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
        hann_windows = [50, 150, 1500, 15000]
        spans = [300, 3000, 30000, 50000]
        windows = [10, 50, 100, 500, 1000, 10000]
        borders = list(range(-4000, 4001, 1000))
        peaks = [10, 20, 50, 100]
        coefs = [1, 5, 10, 50, 100]
        lags = [10, 100, 1000, 10000]
        autocorr_lags = [5, 10, 50, 100, 500, 1000, 5000, 10000]

        # basic stats
        feature_dict['mean'] = x.mean()
        feature_dict['std'] = x.std()
        feature_dict['max'] = x.max()
        feature_dict['min'] = x.min()

        # stats on differences / absolute values
        # NOTE(review): despite the name, this is the mean of the raw diffs,
        # not of their absolute values — kept for feature-name compatibility.
        feature_dict['mean_change_abs'] = np.mean(np.diff(x))
        feature_dict['abs_max'] = np.abs(x).max()
        feature_dict['abs_mean'] = np.abs(x).mean()
        feature_dict['abs_std'] = np.abs(x).std()

        # geometric and harmonic means (zeros excluded: undefined at 0)
        feature_dict['hmean'] = stats.hmean(np.abs(x[np.nonzero(x)[0]]))
        feature_dict['gmean'] = stats.gmean(np.abs(x[np.nonzero(x)[0]]))

        # k-statistics and central moments
        for i in range(1, 5):
            feature_dict['kstat_{}'.format(i)] = stats.kstat(x, i)
            feature_dict['moment_{}'.format(i)] = stats.moment(x, i)
        for i in [1, 2]:
            feature_dict['kstatvar_{}'.format(i)] = stats.kstatvar(x, i)

        # aggregations over leading / trailing slices
        for agg_type, slice_length, direction in product(
                ['std', 'min', 'max', 'mean'], [1000, 10000, 50000],
                ['first', 'last']):
            if direction == 'first':
                feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = \
                    x[:slice_length].agg(agg_type)
            elif direction == 'last':
                feature_dict['{}_{}_{}'.format(agg_type, direction, slice_length)] = \
                    x[-slice_length:].agg(agg_type)

        feature_dict['max_to_min'] = x.max() / np.abs(x.min())
        feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
        feature_dict['count_big'] = len(x[np.abs(x) > 500])
        feature_dict['sum'] = x.sum()

        # change rate, overall and on slices
        feature_dict['mean_change_rate'] = calc_change_rate(x)
        for slice_length, direction in product([1000, 10000, 50000],
                                               ['first', 'last']):
            if direction == 'first':
                feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = \
                    calc_change_rate(x[:slice_length])
            elif direction == 'last':
                feature_dict['mean_change_rate_{}_{}'.format(direction, slice_length)] = \
                    calc_change_rate(x[-slice_length:])

        # percentiles on original and absolute values
        for p in percentiles:
            feature_dict['percentile_{}'.format(p)] = np.percentile(x, p)
            feature_dict['abs_percentile_{}'.format(p)] = np.percentile(np.abs(x), p)

        feature_dict['trend'] = add_trend_feature(x)
        feature_dict['abs_trend'] = add_trend_feature(x, abs_values=True)

        feature_dict['mad'] = x.mad()
        feature_dict['kurt'] = x.kurtosis()
        feature_dict['skew'] = x.skew()
        feature_dict['med'] = x.median()

        feature_dict['Hilbert_mean'] = np.abs(hilbert(x)).mean()

        # Hann-window smoothed means
        for hw in hann_windows:
            feature_dict['Hann_window_mean_{}'.format(hw)] = (
                convolve(x, hann(hw), mode='same') / sum(hann(hw))).mean()

        # classic STA/LTA ratios for several (short, long) window pairs;
        # numbering 1..8 matches the original hand-written keys
        for k, (sta, lta) in enumerate(
                [(500, 10000), (5000, 100000), (3333, 6666), (10000, 25000),
                 (50, 1000), (100, 5000), (333, 666), (4000, 10000)],
                start=1):
            feature_dict['classic_sta_lta{}_mean'.format(k)] = \
                classic_sta_lta(x, sta, lta).mean()

        # exponential rolling statistics
        ewma = pd.Series.ewm
        for s in spans:
            feature_dict['exp_Moving_average_{}_mean'.format(s)] = (
                ewma(x, span=s).mean(skipna=True)).mean(skipna=True)
            feature_dict['exp_Moving_average_{}_std'.format(s)] = (
                ewma(x, span=s).mean(skipna=True)).std(skipna=True)
            feature_dict['exp_Moving_std_{}_mean'.format(s)] = (
                ewma(x, span=s).std(skipna=True)).mean(skipna=True)
            feature_dict['exp_Moving_std_{}_std'.format(s)] = (
                ewma(x, span=s).std(skipna=True)).std(skipna=True)

        feature_dict['iqr1'] = np.subtract(*np.percentile(x, [95, 5]))
        feature_dict['ave10'] = stats.trim_mean(x, 0.1)

        # counts above / below thresholds on trailing slices
        for slice_length, threshold in product([50000, 100000, 150000],
                                               [5, 10, 20, 50, 100]):
            feature_dict['count_big_{}_threshold_{}'.format(slice_length, threshold)] = \
                (np.abs(x[-slice_length:]) > threshold).sum()
            feature_dict['count_big_{}_less_threshold_{}'.format(slice_length, threshold)] = \
                (np.abs(x[-slice_length:]) < threshold).sum()

        # tsfresh feature calculators
        feature_dict['abs_energy'] = feature_calculators.abs_energy(x)
        feature_dict['abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
        feature_dict['count_above_mean'] = feature_calculators.count_above_mean(x)
        feature_dict['count_below_mean'] = feature_calculators.count_below_mean(x)
        feature_dict['mean_abs_change'] = feature_calculators.mean_abs_change(x)
        feature_dict['mean_change'] = feature_calculators.mean_change(x)
        feature_dict['var_larger_than_std_dev'] = \
            feature_calculators.variance_larger_than_standard_deviation(x)
        feature_dict['range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
        feature_dict['range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)
        for i, j in zip(borders, borders[1:]):
            feature_dict['range_{}_{}'.format(i, j)] = feature_calculators.range_count(x, i, j)

        feature_dict['ratio_unique_values'] = \
            feature_calculators.ratio_value_number_to_time_series_length(x)
        feature_dict['first_loc_min'] = feature_calculators.first_location_of_minimum(x)
        feature_dict['first_loc_max'] = feature_calculators.first_location_of_maximum(x)
        feature_dict['last_loc_min'] = feature_calculators.last_location_of_minimum(x)
        feature_dict['last_loc_max'] = feature_calculators.last_location_of_maximum(x)

        for lag in lags:
            feature_dict['time_rev_asym_stat_{}'.format(lag)] = \
                feature_calculators.time_reversal_asymmetry_statistic(x, lag)
        for autocorr_lag in autocorr_lags:
            feature_dict['autocorrelation_{}'.format(autocorr_lag)] = \
                feature_calculators.autocorrelation(x, autocorr_lag)
            feature_dict['c3_{}'.format(autocorr_lag)] = \
                feature_calculators.c3(x, autocorr_lag)

        for coeff, attr in product([1, 2, 3, 4, 5], ['real', 'imag', 'angle']):
            feature_dict['fft_{}_{}'.format(coeff, attr)] = list(
                feature_calculators.fft_coefficient(
                    x, [{'coeff': coeff, 'attr': attr}]))[0][1]

        feature_dict['long_strk_above_mean'] = feature_calculators.longest_strike_above_mean(x)
        feature_dict['long_strk_below_mean'] = feature_calculators.longest_strike_below_mean(x)
        feature_dict['cid_ce_0'] = feature_calculators.cid_ce(x, 0)
        feature_dict['cid_ce_1'] = feature_calculators.cid_ce(x, 1)

        for p in percentiles:
            feature_dict['binned_entropy_{}'.format(p)] = \
                feature_calculators.binned_entropy(x, p)

        feature_dict['num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)

        # BUG FIX: the key was formatted with the whole `peaks` list
        # ('num_peaks_[10, 20, 50, 100]'), so every iteration overwrote the
        # same entry; format with the current `peak` so each window size
        # yields its own feature (num_peaks_10 ... num_peaks_100).
        for peak in peaks:
            feature_dict['num_peaks_{}'.format(peak)] = \
                feature_calculators.number_peaks(x, peak)

        for c in coefs:
            feature_dict['spkt_welch_density_{}'.format(c)] = list(
                feature_calculators.spkt_welch_density(x, [{'coeff': c}]))[0][1]
            # c=10 and c=100 overlap `lags` above and recompute identical
            # values — kept for key/ordering compatibility
            feature_dict['time_rev_asym_stat_{}'.format(c)] = \
                feature_calculators.time_reversal_asymmetry_statistic(x, c)

        # statistics on rolling windows of various sizes
        for w in windows:
            x_roll_std = x.rolling(w).std().dropna().values
            x_roll_mean = x.rolling(w).mean().dropna().values

            feature_dict['ave_roll_std_{}'.format(w)] = x_roll_std.mean()
            feature_dict['std_roll_std_{}'.format(w)] = x_roll_std.std()
            feature_dict['max_roll_std_{}'.format(w)] = x_roll_std.max()
            feature_dict['min_roll_std_{}'.format(w)] = x_roll_std.min()
            for p in percentiles:
                feature_dict['percentile_roll_std_{}_window_{}'.format(p, w)] = \
                    np.percentile(x_roll_std, p)
            feature_dict['av_change_abs_roll_std_{}'.format(w)] = np.mean(np.diff(x_roll_std))
            # mean of the *indices* where the relative change is nonzero
            # (odd, but kept as in the original kernels)
            feature_dict['av_change_rate_roll_std_{}'.format(w)] = np.mean(
                np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
            feature_dict['abs_max_roll_std_{}'.format(w)] = np.abs(x_roll_std).max()

            feature_dict['ave_roll_mean_{}'.format(w)] = x_roll_mean.mean()
            feature_dict['std_roll_mean_{}'.format(w)] = x_roll_mean.std()
            feature_dict['max_roll_mean_{}'.format(w)] = x_roll_mean.max()
            feature_dict['min_roll_mean_{}'.format(w)] = x_roll_mean.min()
            for p in percentiles:
                feature_dict['percentile_roll_mean_{}_window_{}'.format(p, w)] = \
                    np.percentile(x_roll_mean, p)
            feature_dict['av_change_abs_roll_mean_{}'.format(w)] = np.mean(np.diff(x_roll_mean))
            feature_dict['av_change_rate_roll_mean_{}'.format(w)] = np.mean(
                np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
            feature_dict['abs_max_roll_mean_{}'.format(w)] = np.abs(x_roll_mean).max()

        return feature_dict
示例#15
0
def create_features2(seg, ):
    """Build a dict of features for one signal segment *seg*.

    Features: per-band FFT magnitude/phase stats, stats on the raw signal
    plus band-filtered / rolling / exponentially-weighted versions of it,
    and a battery of tsfresh feature calculators.

    Relies on module-level helpers/constants defined elsewhere in this
    file: des_filter, CUTOFF, MAX_FREQ, FREQ_STEP, add_trend_feature,
    classic_sta_lta.  `seg` is expected to be a pandas Series (uses
    .rolling/.ewm/.mad) — TODO confirm against callers.
    """
    data_row = {}

    # band-limited copy of the signal for the FFT features
    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    # sanitize phase: clamp infinities and replace NaNs
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    # per-frequency-band magnitude/phase statistics
    for freq in freq_bands:
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.99)

        data_row['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_min%d' % freq] = np.min(magFFT[freq: freq + FREQ_STEP])

        data_row['FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_max%d' % freq] = np.max(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_min%d' % freq] = np.min(phzFFT[freq: freq + FREQ_STEP])

    # global stats of real/imaginary FFT parts
    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()

    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd__first_6000'] = realFFT[:6000].std()
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()

    # free the large intermediates before building the signal variants
    del xcz
    del zc
    # gc.collect()

    # build a list of signal variants: raw, band-filtered, rolling and
    # exponentially-weighted versions; the big loop below computes the same
    # feature battery over each one
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))

    for window in [50, 200, 1000]:
        roll_mean = seg.rolling(window).mean().dropna()
        roll_std = seg.rolling(window).std().dropna()
        sigs.append(pd.Series(roll_mean))
        sigs.append(pd.Series(roll_std))

    for span in [30, 300, 3000]:
        exp_mean = seg.ewm(span).mean().dropna()
        exp_std = seg.ewm(span).std().dropna()
        sigs.append(pd.Series(exp_mean))
        sigs.append(pd.Series(exp_std))

    # feature battery per signal variant; feature names are suffixed with
    # the variant index i
    for i, sig in enumerate(sigs):

        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()

        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()

        data_row['std_first_50000_%d' % i] = sig[:50000].std()
        data_row['std_last_50000_%d' % i] = sig[-50000:].std()
        data_row['std_first_10000_%d' % i] = sig[:10000].std()
        data_row['std_last_10000_%d' % i] = sig[-10000:].std()

        data_row['avg_first_50000_%d' % i] = sig[:50000].mean()
        data_row['avg_last_50000_%d' % i] = sig[-50000:].mean()
        data_row['avg_first_10000_%d' % i] = sig[:10000].mean()
        data_row['avg_last_10000_%d' % i] = sig[-10000:].mean()

        data_row['min_first_50000_%d' % i] = sig[:50000].min()
        data_row['min_last_50000_%d' % i] = sig[-50000:].min()
        data_row['min_first_10000_%d' % i] = sig[:10000].min()
        data_row['min_last_10000_%d' % i] = sig[-10000:].min()

        data_row['max_first_50000_%d' % i] = sig[:50000].max()
        data_row['max_last_50000_%d' % i] = sig[-50000:].max()
        data_row['max_first_10000_%d' % i] = sig[:10000].max()
        data_row['max_last_10000_%d' % i] = sig[-10000:].max()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()

        # NOTE(review): these are means of the *indices* where the relative
        # change is nonzero, not of the change values — kept as-is
        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])

        for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
            data_row['percentile_p{}_{}'.format(p, i)] = np.percentile(sig, p)
            data_row['abd_percentile_p{}_{}'.format(p, i)] = np.percentile(np.abs(sig), p)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()

        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        # Hann-window smoothed means
        # data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        data_row['Hann_window50_%d' % i] = (convolve(sig, hann(50), mode='same') / sum(hann(50))).mean()
        data_row['Hann_window500_%d' % i] = (convolve(sig, hann(500), mode='same') / sum(hann(500))).mean()

        # STA/LTA ratios for several (short, long) window pairs
        data_row['classic_sta_lta0_mean_%d' % i] = classic_sta_lta(sig, 50, 1000).mean()
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        # Bollinger-style rolling bands at +/- 2 std
        no_of_std = 2
        for w in [10, 100, 500]:
            signal_mean = sig.rolling(window=w).mean()
            signal_std = sig.rolling(window=w).std()
            data_row['high_bound_mean_win{}_{}'.format(w, i)] = (signal_mean + no_of_std * signal_std).mean()
            data_row['low_bound_mean_win{}_{}'.format(w, i)] = (signal_mean - no_of_std * signal_std).mean()

        # value-range occupancy counts
        data_row['range_inf_4000_%d' % i] = feature_calculators.range_count(sig, -np.inf, -4000)
        data_row['range_4000_inf_%d' % i] = feature_calculators.range_count(sig, 4000, np.inf)
        for l, h in [[-4000, -2000], [-2000, 0], [0, 2000], [2000, 4000]]:
            data_row['range_{}_{}_{}'.format(np.abs(l), np.abs(h), i)] = feature_calculators.range_count(sig, l, h)

        # tsfresh feature calculators
        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['iqr1_%d' % i] = np.subtract(*np.percentile(sig, [95, 5]))
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        # data_row['var_larger_than_std_dev_%d' % i] = feature_calculators.variance_larger_than_standard_deviation(sig)
        data_row['ratio_unique_values_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        data_row['abs_energy_%d' % i] = feature_calculators.abs_energy(sig)
        data_row['abs_sum_of_changes_%d' % i] = feature_calculators.absolute_sum_of_changes(sig)
        data_row['count_above_mean_%d' % i] = feature_calculators.count_above_mean(sig)
        data_row['count_below_mean_%d' % i] = feature_calculators.count_below_mean(sig)
        data_row['mean_abs_change_%d' % i] = feature_calculators.mean_abs_change(sig)
        data_row['mean_change_%d' % i] = feature_calculators.mean_change(sig)
        data_row['first_loc_min_%d' % i] = feature_calculators.first_location_of_minimum(sig)
        data_row['first_loc_max_%d' % i] = feature_calculators.first_location_of_maximum(sig)
        data_row['last_loc_min_%d' % i] = feature_calculators.last_location_of_minimum(sig)
        data_row['last_loc_max_%d' % i] = feature_calculators.last_location_of_maximum(sig)
        data_row['long_strk_above_mean_%d' % i] = feature_calculators.longest_strike_above_mean(sig)
        data_row['long_strk_below_mean_%d' % i] = feature_calculators.longest_strike_below_mean(sig)
        # data_row['cid_ce_0_%d' % i] = feature_calculators.cid_ce(sig, 0)
        # data_row['cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)

        for j in [10, 50, ]:
            data_row['peak_num_p{}_{}'.format(j, i)] = feature_calculators.number_peaks(sig, j)
        for j in [1, 10, 50, 100]:
            data_row['spkt_welch_density_coeff{}_{}'.format(j, i)] = \
            list(feature_calculators.spkt_welch_density(sig, [{'coeff': j}]))[0][1]
        for j in [5, 10, 100]:
            data_row['c3_c{}_{}'.format(j, i)] = feature_calculators.c3(sig, j)
        for j in [5, 10, 50, 100, 1000]:
            data_row['autocorrelation_auto{}_{}'.format(j, i)] = feature_calculators.autocorrelation(sig, j)
        for j in [10, 100, 1000]:
            data_row['time_rev_asym_stat_t{}_{}'.format(j, i)] = feature_calculators.time_reversal_asymmetry_statistic(
                sig, j)
        for j in range(1, 5):
            data_row['kstat_k{}_{}'.format(j, i)] = stats.kstat(sig, j)
            data_row['moment_m{}_{}'.format(j, i)] = stats.moment(sig, j)
        for j in range(1, 3):
            data_row['kstatvar_k{}_{}'.format(j, i)] = stats.kstatvar(sig, j)
        for j in [5, 10, 50, 100]:
            data_row['binned_entropy_b{}_{}'.format(j, i)] = feature_calculators.binned_entropy(sig, j)

    return data_row
# quantile feat
# BUG FIX: np.percentie does not exist (raised AttributeError);
# the function is np.percentile.
feature_dict['quantile_1'] = np.percentile(x, 25)
feature_dict['quantile_2'] = np.percentile(x, 50)
feature_dict['quantile_3'] = np.percentile(x, 75)
feature_dict['quantile_4'] = np.percentile(x, 99)

# ts fresh can be used for time series features when we have a list of features
# in a given period of time
from tsfresh.feature_extraction import feature_calculators as fc

# NOTE: fixed feature-name typo 'abs_energey' -> 'abs_energy' so the
# column name matches the tsfresh calculator it comes from.
feature_dict['abs_energy'] = fc.abs_energy(x)
feature_dict['count_above_mean'] = fc.count_above_mean(x)
feature_dict['count_below_mean'] = fc.count_below_mean(x)
feature_dict['mean_abs_change'] = fc.mean_abs_change(x)
feature_dict['mean_change'] = fc.mean_change(x)

# polynomial features: expand the two random columns into all degree-2
# monomials (f1, f2, f1^2, f1*f2, f2^2); no bias column is added
import numpy as np
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame(np.random.rand(100, 2), columns=['f1', 'f2'])
pf = preprocessing.PolynomialFeatures(
    degree=2, interaction_only=False, include_bias=False)
# fit_transform == fit followed by transform on the same data
poly_ft = pf.fit_transform(df)
poly_df = pd.DataFrame(