min(10, len(df_feature_full['max_15'].unique())), duplicates='drop').cat.codes
    # Every statistic column below is overwritten in place with its
    # quantile-bin code: pd.qcut is asked for at most 10 bins (fewer when the
    # column has fewer distinct values), duplicates='drop' collapses repeated
    # bin edges, and .cat.codes turns the resulting categories into integer
    # codes (pandas uses -1 for missing values).

    # 15-step window statistics.
    df_feature_full['sum_15'] = pd.qcut(
        df_feature_full['sum_15'],
        min(10, len(df_feature_full['sum_15'].unique())),
        duplicates='drop').cat.codes

    # 7-step window statistics.
    df_feature_full['min_7'] = pd.qcut(
        df_feature_full['min_7'],
        min(10, len(df_feature_full['min_7'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['std_7'] = pd.qcut(
        df_feature_full['std_7'],
        min(10, len(df_feature_full['std_7'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['var_7'] = pd.qcut(
        df_feature_full['var_7'],
        min(10, len(df_feature_full['var_7'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['max_7'] = pd.qcut(
        df_feature_full['max_7'],
        min(10, len(df_feature_full['max_7'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['sum_7'] = pd.qcut(
        df_feature_full['sum_7'],
        min(10, len(df_feature_full['sum_7'].unique())),
        duplicates='drop').cat.codes

    # 3-step window statistics.
    df_feature_full['min_3'] = pd.qcut(
        df_feature_full['min_3'],
        min(10, len(df_feature_full['min_3'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['std_3'] = pd.qcut(
        df_feature_full['std_3'],
        min(10, len(df_feature_full['std_3'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['var_3'] = pd.qcut(
        df_feature_full['var_3'],
        min(10, len(df_feature_full['var_3'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['max_3'] = pd.qcut(
        df_feature_full['max_3'],
        min(10, len(df_feature_full['max_3'].unique())),
        duplicates='drop').cat.codes
    df_feature_full['sum_3'] = pd.qcut(
        df_feature_full['sum_3'],
        min(10, len(df_feature_full['sum_3'].unique())),
        duplicates='drop').cat.codes
    # print(df_feature_full.head)
    return df_feature_full


if __name__ == '__main__':
    # Script entry point: read the feature table, bin it, and write the
    # binned table back through the same file_operation helper.
    df_feature_full = file_operation.read_feature_full()
    df_feature_full = binner_engine(df_feature_full)
    file_operation.write_feature_full(df_feature_full)
df_goods = df_goods.sort_values(by=['rundate']) mean_result = df_goods[column].mean() std_result = df_goods[column].std() for k in range(3,df_goods[column].shape[0]): i = df_goods.index[k] if (df_feature_full[column][i] > mean_result + std_result) or (df_feature_full[column][i] < mean_result - std_result): df_feature_full[column][i] = round(df_goods[column][df_goods.index[k-3:k]].mean()) return df_feature_full """ def outlier_processing(df_feature_full, column): df_feature_full = df_feature_full.sort_values(by=['rundate']) #df_goods = df_feature_full mean_result = df_feature_full[column].mean() std_result = df_feature_full[column].std() for k in range(3, df_feature_full[column].shape[0]): i = df_feature_full.index[k] if (df_feature_full[column][i] > mean_result + 5 * std_result) or ( df_feature_full[column][i] < mean_result - 5 * std_result): df_feature_full[column][ i] = mean_result + std_result #round(df_feature_full[column][df_feature_full.index[k-3:k]].mean()) return df_feature_full if __name__ == '__main__': df_feature_full = file_operation.read_feature_full() df_processed = outlier_processing(df_feature_full) file_operation.write_feature_full(df_processed)
def remove_useless(df_cleaned):
    """Return ``df_cleaned`` unchanged.

    This cleaning step is currently a no-op: every column drop it used to
    perform is switched off.
    """
    # Columns whose removal is disabled at the moment:
    #   goodsn, name, barcode, daleicode, zhongleicode, xiaoleicode,
    #   storageattr, vendorid, specification, dt_x, dt_y
    return df_cleaned


def data_clean(df_origin_full):
    """Run the cleaning steps in order and report the remaining null cells.

    Applies ``remove_nan``, ``remove_illegal`` and ``remove_useless`` one
    after another, lifts pandas' displayed-column limit, prints how many null
    cells are left, and returns the cleaned frame.
    """
    cleaned = df_origin_full
    for cleaning_step in (remove_nan, remove_illegal, remove_useless):
        cleaned = cleaning_step(cleaned)

    # Show every column when a frame is printed.
    pd.set_option('display.max_columns', None)
    # print(df_cleaned[df_cleaned.isnull().any(axis=1)])
    print('df_cleaned contains invalid value:')
    print(np.count_nonzero(cleaned.isnull()))
    return cleaned


if __name__ == '__main__':
    # Script entry point: read, clean, and write back the feature table.
    file_operation.write_feature_full(
        data_clean(file_operation.read_feature_full()))
# Per store/goods exponential smoothing of 'saleqty'.
    # Accumulators for the three smoothed series. Each appended piece keeps
    # the index labels of the rows it was computed from.
    # NOTE(review): pd.Series([]) carries no explicit dtype; depending on the
    # pandas version this can warn or influence the dtype of the concatenated
    # result - consider passing dtype=float. Confirm against the pandas
    # version in use.
    s1_result = pd.Series([])
    s2_result = pd.Series([])
    s3_result = pd.Series([])
    for goodsn in goodsn_list:
        for store_id in store_ids:
            # Rows belonging to this store/goods pair.
            df_goods = df_feature_full[
                (df_feature_full['storeid'] == store_id) & (df_feature_full['goodscode'] == goodsn)]
            # Pairs with fewer than 14 rows are skipped.
            # NOTE(review): presumably two full periods of the length-7
            # season passed to triple_exponential_smoothing - confirm.
            if len(df_goods.index) < 2 * 7:
                continue
            # Sale quantities as a plain list, in the frame's row order.
            y = [v for i, v in df_goods['saleqty'].items()]
            # NOTE(review): the numeric arguments are presumably smoothing
            # factors (and 7 the season length); check the helper definitions.
            s1 = holt_winters_first_order_ewma(y, 0.3)
            s2 = holt_winters_second_order_ewma(y, 0.3, 0.3)
            s3 = triple_exponential_smoothing(y, 7, 0.3, 0.3, 0.1)
            # Re-attach the original index labels; this assumes each helper
            # returns exactly one value per input value.
            s1_result = pd.concat(
                [s1_result, pd.Series(s1, index=df_goods.index)])
            s2_result = pd.concat(
                [s2_result, pd.Series(s2, index=df_goods.index)])
            s3_result = pd.concat(
                [s3_result, pd.Series(s3, index=df_goods.index)])
    # Column assignment aligns on the index, so rows of skipped pairs end up
    # as NaN in the new columns.
    df_feature_full['smoothing1'] = s1_result
    df_feature_full['smoothing2'] = s2_result
    df_feature_full['smoothing3'] = s3_result
    return df_feature_full


if __name__ == '__main__':
    # Script entry point: read the feature table, add the smoothing columns,
    # and write the table back.
    df_sale_feature = file_operation.read_feature_full()
    df_sale_feature = smoothing(df_sale_feature)
    file_operation.write_feature_full(df_sale_feature)