Exemplo n.º 1
0
    median_abs_error = median_absolute_error(y_true=verify_golden, y_pred=predict_)
    r2score = r2_score(y_true=verify_golden, y_pred=predict_)
    # RECORD_LOG('使用sklearn的打分评价得到explained_var_score={}, mean_abs_error={}, mean_sqr_error={}, median_abs_error={}, r2score={}'
    #             .format(explained_var_score, mean_abs_error, mean_sqr_error, median_abs_error, r2score))
    return predict_, [explained_var_score, mean_abs_error, mean_sqr_error, median_abs_error, r2score]


if __name__ == "__main__":
    start_time = time.time()  # wall-clock anchor for the elapsed-time log messages below
    # 1. Get sample and last validation data.
    # Get Data include some pre-process.
    # Initial get fillna dataframe
    # cat_fill_type= "fill_paulnull" or "base_name" or "base_brand"
    # brand_fill_type= "fill_paulnull" or "base_other_cols" or "base_NB" or "base_GRU"
    # item_desc_fill_type= 'fill_' or 'fill_paulnull' or 'base_name'
    # NOTE(review): DataReader, LOCAL_FLAG and RECORD_LOG are defined elsewhere in this project.
    data_reader = DataReader(local_flag=LOCAL_FLAG, cat_fill_type='fill_paulnull', brand_fill_type='base_other_cols', item_desc_fill_type='fill_')
    RECORD_LOG('[{:.4f}s] Finished handling missing data...'.format(time.time() - start_time))

    # Drop columns that are no longer needed after the fill step.
    data_reader.del_redundant_cols()

    # PROCESS CATEGORICAL DATA
    RECORD_LOG("Handling categorical variables...")
    data_reader.le_encode()  # label-encode the categorical columns in place
    RECORD_LOG('[{:.4f}s] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))
    # Temporarily lift pandas display limits so the head() preview is not truncated.
    # NOTE(review): 'display.height' was removed in modern pandas — confirm the pinned version still accepts it.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None,
                           'display.height', None):
        RECORD_LOG('\n{}'.format(data_reader.train_df.head(3)))

    # PROCESS TEXT: RAW
    RECORD_LOG("Text to seq process...")
    RECORD_LOG("   Fitting tokenizer...")
Exemplo n.º 2
0
def start_logging():
    """Configure logging from the shared project config and return the default logger."""
    # Pull in the project's standard logging configuration.
    from ProjectCodes.logging_config import ConfigLogginfDict
    logging.config.dictConfig(ConfigLogginfDict(__file__).LOGGING)
    # Fetch the "default" logger declared by that configuration.
    default_logger = logging.getLogger("default")
    default_logger.info('\n\n#################\n~~~~~~Start~~~~~~\n#################')
    print(type(default_logger))
    return default_logger


# Initialize the module-level Logger only once, even if this file is executed
# again in a session where 'Logger' already exists (e.g. interactive reruns).
if 'Logger' not in dir():
    Logger = start_logging()

# Load the data with every fill strategy set to the simple placeholder variants.
# NOTE(review): DataReader is a project class defined elsewhere in this project.
data_reader = DataReader(local_flag=True,
                         cat_fill_type='fill_paulnull',
                         brand_fill_type='fill_paulnull',
                         item_desc_fill_type='fill_')

# Columns whose text is concatenated to build the per-row feature string
# (consumed by construct_feature_text below).
predict_by_cols = [
    'name', 'item_description', 'cat_name_main', 'cat_name_sub',
    'cat_name_sub2'
]


def construct_feature_text(row, cols=None):
    """Concatenate the text of the selected columns of *row* into one string.

    Each column's value is followed by a single space, so the result keeps a
    trailing space — identical output to the original ``+=`` loop.

    :param row: mapping (e.g. a pandas Series) from column name to string value.
    :param cols: iterable of column names to concatenate; defaults to the
        module-level ``predict_by_cols`` list for backward compatibility.
    :return: the concatenated feature text.
    """
    if cols is None:
        cols = predict_by_cols
    # ''.join avoids the quadratic cost of repeated string concatenation.
    return ''.join(row[col] + ' ' for col in cols)

Exemplo n.º 3
0
import gc
import time
import pandas as pd
import numpy as np
import platform

start_time = time.time()  # wall-clock anchor for the elapsed-time prints below

# TODO: Need modify when run on Kaggle kernel.
# Local development is assumed to happen on Windows; any other OS is treated
# as the remote (Kaggle/Linux) environment.
if platform.system() == 'Windows':
    LOCAL_FLAG = True
else:
    LOCAL_FLAG = False
# Load the data and fill missing values with the chosen strategies.
# NOTE(review): DataReader is a project class defined elsewhere in this project.
data_reader = DataReader(local_flag=LOCAL_FLAG,
                         cat_fill_type='base_name',
                         brand_fill_type='base_other_cols',
                         item_desc_fill_type='fill_')
# data_reader = DataReader(local_flag=LOCAL_FLAG, cat_fill_type='base_brand', brand_fill_type='base_name', item_desc_fill_type='base_name')
# Initial get fillna dataframe
print(data_reader.train_df.shape)
print(data_reader.test_df.shape)
print('[{}] Finished handling missing data...'.format(time.time() -
                                                      start_time))

# PROCESS CATEGORICAL DATA
# TODO: Need to change the classification rules and then re-encode to try the results.
print("Handling categorical variables...")
data_reader.le_encode()  # label-encode the categorical columns in place
print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() -
                                                            start_time))
with pd.option_context('display.max_rows', None, 'display.max_columns', None,
Exemplo n.º 4
0
    return predict_, [
        explained_var_score, mean_abs_error, mean_sqr_error, median_abs_error,
        r2score
    ]


if __name__ == "__main__":
    start_time = time.time()  # wall-clock anchor for the elapsed-time log messages below
    # 1. Get sample and last validation data.
    # Get Data include some pre-process.
    # Initial get fillna dataframe
    # cat_fill_type= "fill_paulnull" or "base_name" or "base_brand"
    # brand_fill_type= "fill_paulnull" or "base_other_cols" or "base_NB" or "base_GRU"
    # item_desc_fill_type= 'fill_' or 'fill_paulnull' or 'base_name'
    # NOTE(review): DataReader, LOCAL_FLAG and RECORD_LOG are defined elsewhere in this project.
    data_reader = DataReader(local_flag=LOCAL_FLAG,
                             cat_fill_type='fill_paulnull',
                             brand_fill_type='base_other_cols',
                             item_desc_fill_type='fill_')
    RECORD_LOG(
        '[{:.4f}s] Finished handling missing data...'.format(time.time() -
                                                             start_time))

    # Drop columns that are no longer needed after the fill step.
    data_reader.del_redundant_cols()

    # FIT FEATURES TRANSFORMERS
    RECORD_LOG("Fitting features pipeline and get train and test ...")
    # Split into training sample, hold-out validation set, and test features
    # (sparse matrices, per the method name — presumably scipy sparse; confirm).
    sample_X, last_valida_X, sample_y, last_valida_y, test_X = data_reader.get_split_sparse_data(
    )
    RECORD_LOG(
        '[{:.4f}s] Finished FIT FEATURES TRANSFORMERS & SPLIT...'.format(
            time.time() - start_time))
    print('sample_X.shape={}'.format(sample_X.shape))