def build_targets():
    """Build the full target dataset.

    Loads the normalized data, smooths it with a Savitzky-Golay filter,
    then slides a window of length ``seq_len`` over the target column.

    :return: np.ndarray of shape (samples_len, seq_len, 1)
    """
    # Load the normalized source data.
    raw = pd.read_csv('../tmp/total_implemented_normalized_data.csv')

    # Smooth the series before windowing.
    smoothed = savitzky_golay_filtering(raw)

    # Windowing parameters come from the shared config.
    window = config.conf['model_params']['seq_len']
    target_col = config.conf['model_params']['target_column']

    column_values = np.array(smoothed[target_col]).reshape(-1, 1)

    # Zero-pad the tail so every start position yields a full window.
    padded = np.vstack((column_values, np.zeros([window - 1, 1])))

    num_windows = padded.shape[0] - window + 1
    # targets.shape = (samples_len, seq_len, 1)
    targets = np.array(
        [padded[start:start + window, :] for start in range(num_windows)])

    return targets
def build_samples():
    """Build the full sample dataset.

    Loads the normalized data, smooths it with a Savitzky-Golay filter,
    then slides a window of length ``seq_len`` over the selected feature
    columns.

    :return: np.ndarray of shape (samples_len, seq_len, features)
    """
    # Load the normalized source data.
    raw = pd.read_csv('../tmp/total_implemented_normalized_data.csv')

    # Smooth the series before windowing.
    smoothed = savitzky_golay_filtering(raw)

    # Windowing parameters come from the shared config.
    window = config.conf['model_params']['seq_len']
    feature_cols = config.conf['model_params']['selected_columns']

    feature_values = np.array(smoothed[feature_cols])

    # Zero-pad the head so the LSTM sample construction below always has
    # a full window available.
    pad = np.zeros([window - 1, feature_values.shape[1]])
    padded = np.vstack((pad, feature_values))

    num_windows = padded.shape[0] - window + 1
    # samples.shape = (samples_len, seq_len, features)
    samples = np.array(
        [padded[start:start + window, :] for start in range(num_windows)])

    return samples
예제 #3
0
def build_train_samples_dict():
    """Build a dict of per-feature training sample arrays.

    :return:
        samples_dict: dict, {'pm10': np.ndarray(shape = (samples_len, embed_dim, 1)), ...}
    """
    # Pull the anchor timestamp and sampling parameters from the config.
    record_time = config.conf['exist_record_time']
    anchor_ts = int(
        time.mktime(time.strptime(str(record_time), '%Y%m%d%H')))
    feature_columns = config.conf['model_params']['selected_columns']
    n_samples = config.conf['model_params']['samples_len']
    step = config.conf['model_params']['hr']

    # Load and smooth the normalized data.
    raw = pd.read_csv('../tmp/total_implemented_normalized_data.csv')
    smoothed = savitzky_golay_filtering(raw)

    # Expand into the per-lag sample frame.
    frame = build_samples_data_frame(smoothed)
    frame_columns = frame.columns

    samples_dict = {}
    for feature in feature_columns:
        # All generated columns whose name contains this feature.
        related = [name for name in frame_columns if feature in name]
        subset = frame[['time_stamp'] + related]

        # Keep only the n_samples steps that end one step before the anchor.
        window_start = anchor_ts - n_samples * step
        window_end = anchor_ts - step
        in_window = (subset.time_stamp >= window_start) & \
                    (subset.time_stamp <= window_end)
        subset = subset[in_window]

        # Drop the time_stamp column and add a trailing singleton axis.
        samples_dict[feature] = np.array(subset.iloc[:, 1:])[:, :, np.newaxis]

    return samples_dict
예제 #4
0
    plt.tight_layout()
    plt.show()
    plt.pause(1.0)


if __name__ == '__main__':
    # Load the normalized data set.
    data = pd.read_csv('../tmp/total_implemented_normalized_data.csv')

    # Target and exogenous columns come from the shared config.
    target_column = config.conf['model_params']['target_column']
    selected_columns = config.conf['model_params']['selected_columns']

    # Band-pass (Savitzky-Golay) filtering before the analysis.
    data_filtered = savitzky_golay_filtering(data)

    # Measure the influence of the exogenous variables (from index 5 on)
    # against the target via cross-correlation.
    cross_correlation_analysis(target_column, selected_columns[5:],
                               data_filtered)
                    cols[x].append('{}_{}'.format(x, i))
    for k,v in cols.items():
        columns = v
        dfn = pd.DataFrame(data=None, columns=columns, index=df.index)
        i = 1
        for c in columns:
            dfn[c] = df[k].shift(periods=i)
            i+=1
        df = pd.concat([df, dfn], axis=1, join_axes=[df.index])
    return df

if __name__=="__main__":
    file_name = "../tmp/total_implemented_normalized_data.csv"
    data = pd.read_csv(file_name)

    data = data_filtering.savitzky_golay_filtering(data)
    NON_DER = ['aqi', ]
    df_new = df_derived_by_shift(data, 6, NON_DER)

    """
    可视化
    """
    colormap = plt.cm.RdBu
    plt.figure(figsize=(15, 10))
    plt.title(u'6 days', y=1.05, size=16)

    mask = np.zeros_like(df_new.corr())
    mask[np.triu_indices_from(mask)] = True

    svm = sns.heatmap(df_new.corr(), mask=mask, linewidths=0.1, vmax=1.0,
                      square=True, cmap=colormap, linecolor='white', annot=True)
        fig = plt.figure('acf & pacf test for %s' % columns[i],
                         figsize=[12, 10])
        acf_fig = fig.add_subplot(2, 1, 1)
        sm.graphics.tsa.plot_acf(samples[:, i], lags=200, ax=acf_fig)
        pacf_fig = fig.add_subplot(2, 1, 2)
        sm.graphics.tsa.plot_pacf(samples[:, i], lags=200, ax=pacf_fig)
        plt.tight_layout()


if __name__ == '__main__':
    # 载入数据
    file_name = '../tmp/total_implemented_normalized_data.csv'
    data = pd.read_csv(file_name)

    # 数据滤波
    data = savitzky_golay_filtering(data)

    # 计算自相关函数
    columns = list(
        set([config.conf['model_params']['target_column']] +
            config.conf['model_params']['selected_columns']))
    data = data[columns]

    for col in columns:
        acf = []
        start_locs = range(1000, 25000, 100)
        for loc in start_locs:
            time_series = data.loc[loc:loc + 1000, col]
            acf.append(stattools.acf(time_series, nlags=60))

        acf = np.array(acf).T