예제 #1
0
                                        min(10, len(df_feature_full['max_15'].unique())), duplicates='drop').cat.codes
    df_feature_full['sum_15'] = pd.qcut(df_feature_full['sum_15'],
                                        min(10, len(df_feature_full['sum_15'].unique())), duplicates='drop').cat.codes
    df_feature_full['min_7'] = pd.qcut(df_feature_full['min_7'],
                                        min(10, len(df_feature_full['min_7'].unique())), duplicates='drop').cat.codes
    df_feature_full['std_7'] = pd.qcut(df_feature_full['std_7'],
                                        min(10, len(df_feature_full['std_7'].unique())), duplicates='drop').cat.codes
    df_feature_full['var_7'] = pd.qcut(df_feature_full['var_7'],
                                        min(10, len(df_feature_full['var_7'].unique())), duplicates='drop').cat.codes
    df_feature_full['max_7'] = pd.qcut(df_feature_full['max_7'],
                                        min(10, len(df_feature_full['max_7'].unique())), duplicates='drop').cat.codes
    df_feature_full['sum_7'] = pd.qcut(df_feature_full['sum_7'],
                                        min(10, len(df_feature_full['sum_7'].unique())), duplicates='drop').cat.codes
    df_feature_full['min_3'] = pd.qcut(df_feature_full['min_3'],
                                        min(10, len(df_feature_full['min_3'].unique())), duplicates='drop').cat.codes
    df_feature_full['std_3'] = pd.qcut(df_feature_full['std_3'],
                                        min(10, len(df_feature_full['std_3'].unique())), duplicates='drop').cat.codes
    df_feature_full['var_3'] = pd.qcut(df_feature_full['var_3'],
                                        min(10, len(df_feature_full['var_3'].unique())), duplicates='drop').cat.codes
    df_feature_full['max_3'] = pd.qcut(df_feature_full['max_3'],
                                        min(10, len(df_feature_full['max_3'].unique())), duplicates='drop').cat.codes
    df_feature_full['sum_3'] = pd.qcut(df_feature_full['sum_3'],
                                        min(10, len(df_feature_full['sum_3'].unique())), duplicates='drop').cat.codes
    # print(df_feature_full.head)
    return df_feature_full

if __name__ == '__main__':
    # Script entry point: read the full feature table, run it through
    # binner_engine, and hand the result to file_operation.write_feature_full.
    raw_features = file_operation.read_feature_full()
    binned_features = binner_engine(raw_features)
    file_operation.write_feature_full(binned_features)
예제 #2
0
        df_goods = df_goods.sort_values(by=['rundate'])

        mean_result = df_goods[column].mean()
        std_result = df_goods[column].std()
        for k in range(3,df_goods[column].shape[0]):
            i = df_goods.index[k]
            if (df_feature_full[column][i] > mean_result + std_result) or (df_feature_full[column][i] < mean_result - std_result):
                df_feature_full[column][i] = round(df_goods[column][df_goods.index[k-3:k]].mean())
    return df_feature_full
"""


def outlier_processing(df_feature_full, column):
    """Replace extreme values of ``column`` after sorting the frame by ``rundate``.

    A value counts as an outlier when it lies more than five standard
    deviations above or below the column mean (mean and standard deviation
    are computed over the whole column). Every outlier is overwritten with
    ``mean + std``; the same replacement value is used for low outliers as
    well as high ones.

    The first three rows of the date-sorted frame are never modified. That
    exemption is carried over from the original loop, which started at
    position 3.

    Args:
        df_feature_full: DataFrame containing a ``rundate`` column and
            ``column``.
        column: Name of the numeric column to process.

    Returns:
        A new DataFrame sorted by ``rundate`` with the outliers replaced.
        The frame passed in is not modified.
    """
    df_sorted = df_feature_full.sort_values(by=['rundate'])
    values = df_sorted[column]
    mean_result = values.mean()
    std_result = values.std()

    is_outlier = (values > mean_result + 5 * std_result) | (
        values < mean_result - 5 * std_result)
    # Positions 0-2 are exempt, matching the original ``range(3, n)`` loop.
    is_outlier.iloc[:3] = False

    if is_outlier.any():
        # Build the corrected column and assign it in a single step. A chained
        # ``df[column][i] = ...`` write is not guaranteed to reach the frame
        # (and never does under pandas Copy-on-Write), and the label-based
        # scalar lookup it relied on breaks when index labels are duplicated.
        # ``mask`` also upcasts integer columns to float where needed.
        df_sorted[column] = values.mask(is_outlier, mean_result + std_result)
    return df_sorted


if __name__ == '__main__':
    # Script entry point: read the full feature table, run outlier processing
    # on it, and pass the result to file_operation.write_feature_full.
    df_feature_full = file_operation.read_feature_full()
    # NOTE(review): outlier_processing is defined as
    # outlier_processing(df_feature_full, column) with no default for `column`,
    # so this one-argument call raises TypeError as written. The intended
    # column name is not visible here -- TODO supply it.
    df_processed = outlier_processing(df_feature_full)
    file_operation.write_feature_full(df_processed)
예제 #3
0
def remove_useless(df_cleaned):
    """Return ``df_cleaned`` as-is; no columns are removed at present.

    The drop calls that used to live here were all commented out. They
    targeted these columns, one ``drop`` per column:
    goodsn, name, barcode, daleicode, zhongleicode, xiaoleicode,
    storageattr, vendorid, specification, dt_x, dt_y.
    """
    return df_cleaned


def data_clean(df_origin_full):
    """Run the cleaning steps in order and report the remaining null cells.

    Applies ``remove_nan``, ``remove_illegal`` and ``remove_useless`` in
    sequence, switches pandas to display every column, prints how many cells
    of the result are still null, and returns the cleaned frame.
    """
    df_cleaned = df_origin_full
    for cleaning_step in (remove_nan, remove_illegal, remove_useless):
        df_cleaned = cleaning_step(df_cleaned)

    # Lift the column display limit so printed frames show every column.
    pd.set_option('display.max_columns', None)

    print('df_cleaned contains invalid value:')
    null_flags = df_cleaned.isnull()
    print(np.count_nonzero(null_flags))
    return df_cleaned


if __name__ == '__main__':
    # Script entry point: read the full feature table, clean it with
    # data_clean, and hand the result to file_operation.write_feature_full.
    raw_features = file_operation.read_feature_full()
    cleaned_features = data_clean(raw_features)
    file_operation.write_feature_full(cleaned_features)
예제 #4
0
    s1_result = pd.Series([])
    s2_result = pd.Series([])
    s3_result = pd.Series([])
    for goodsn in goodsn_list:
        for store_id in store_ids:
            df_goods = df_feature_full[
                (df_feature_full['storeid'] == store_id)
                & (df_feature_full['goodscode'] == goodsn)]
            if len(df_goods.index) < 2 * 7:
                continue
            y = [v for i, v in df_goods['saleqty'].items()]
            s1 = holt_winters_first_order_ewma(y, 0.3)
            s2 = holt_winters_second_order_ewma(y, 0.3, 0.3)
            s3 = triple_exponential_smoothing(y, 7, 0.3, 0.3, 0.1)
            s1_result = pd.concat(
                [s1_result, pd.Series(s1, index=df_goods.index)])
            s2_result = pd.concat(
                [s2_result, pd.Series(s2, index=df_goods.index)])
            s3_result = pd.concat(
                [s3_result, pd.Series(s3, index=df_goods.index)])
    df_feature_full['smoothing1'] = s1_result
    df_feature_full['smoothing2'] = s2_result
    df_feature_full['smoothing3'] = s3_result
    return df_feature_full


if __name__ == '__main__':
    # Script entry point: read the full feature table, add the smoothing
    # columns via smoothing(), and hand the result to
    # file_operation.write_feature_full.
    sale_features = file_operation.read_feature_full()
    smoothed_features = smoothing(sale_features)
    file_operation.write_feature_full(smoothed_features)