Example #1
import logging

import pandas as pd

# NOTE: values_include / values_not_include are assumed to be helper
# functions provided elsewhere in kkutils (their import is not shown here).


def check_dataframe_diff(df1: pd.DataFrame,
                         df2: pd.DataFrame,
                         logger: logging.Logger = None,
                         max_count: int = 10,
                         ignore_same: bool = False):
    if logger is None:
        from kkutils.util.com import set_logger  # imported here because it is not always needed
        logger = set_logger(__name__)
    # nan == nan evaluates to False inside a Series, so fill NaN values first
    df1, df2 = df1.copy().fillna(-999), df2.copy().fillna(-999)

    logger.info("check dataframe shape.", color=["BOLD", "GREEN"])
    logger.info(f"df1 shape: {df1.shape}")
    logger.info(f"df2 shape: {df2.shape}")

    logger.info("check dataframe index.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.index, df2.index
    if (ndf1.shape[0] != ndf2.shape[0]) or ((~(ndf1 == ndf2)).sum() > 0):
        logger.warning("index is different.")
        same_index = values_include(ndf1, ndf2)
        logger.debug(f"same index: {same_index}")
        logger.warning(f"only df1 index: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 index: {values_not_include(ndf1, ndf2)}")
    else:
        if not ignore_same:
            logger.info("index is same.", color=["BOLD", "BLUE"])

    logger.info("check dataframe columns.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.columns, df2.columns
    same_columns = values_include(ndf1, ndf2)
    if (ndf1.shape[0] != ndf2.shape[0]) or ((~(ndf1 == ndf2)).sum() > 0):
        logger.warning("columns are different.")
        logger.debug(f"same columns: {same_columns}")
        logger.warning(f"only df1 columns: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 columns: {values_not_include(ndf1, ndf2)}")
    else:
        if not ignore_same:
            logger.info("columns are same.", color=["BOLD", "BLUE"])

    logger.info("we check only same indexes and same columns",
                color=["BOLD", "GREEN"])
    df1 = df1.loc[df1.index.isin(df2.index), df1.columns.isin(df2.columns)]
    df2 = df2.loc[df1.index, df1.columns]

    logger.info("check whole data.", color=["BOLD", "GREEN"])
    for x in same_columns:
        sebool = (df1[x] == df2[x])
        if (~sebool).sum() > 0:
            diff_values = [(_x, _y) for _x, _y in zip(
                df1.loc[~sebool, x].iloc[:max_count].values,
                df2.loc[~sebool, x].iloc[:max_count].values)]
            logger.warning(
                f'"{x}" is different. different count: {(~sebool).sum()}. '
                f'different index: {df1.index[~sebool]}. different values: {diff_values}'
            )
        else:
            if not ignore_same:
                logger.info(f'"{x}" is same.', color=["BOLD", "BLUE"])
Example #2
def __init__(self, connection_string: str, max_disp_len: int=100, log_level="info", logfilepath: str=None):
    """
    Establish a connection to a PostgreSQL database.
    Params::
        connection_string: connection string
            ex) host=172.18.10.2 port=5432 dbname=boatrace user=postgres password=postgres
    """
    self.con = None if connection_string is None else psycopg2.connect(connection_string)
    self.max_disp_len = max_disp_len
    self.sql_list = []  # insert/update/delete statements are collected here and executed in one batch later
    self.logger = set_logger(_logname+".Psgre."+str(id(self.con)), log_level=log_level, internal_log=False, logfilepath=logfilepath)
    if connection_string is None:
        self.logger.info("dummy connection is established.")
    else:
        self.logger.info(f'connection is established. {connection_string[:connection_string.find("password")]}')
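
A hedged usage sketch for the constructor above. The class name Psgre is inferred from the logger name, psycopg2 must be installed, and the host/credentials are placeholders for a reachable PostgreSQL server.

# Placeholder connection string; replace host, dbname, user and password.
db = Psgre("host=127.0.0.1 port=5432 dbname=mydb user=postgres password=postgres", log_level="info")

# Passing None skips psycopg2.connect and logs a "dummy connection".
dummy = Psgre(None)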
Example #3
import numpy as np
import pandas as pd
from typing import List

# local package
from kkutils.lib.ml.procs import MyAsType
from kkutils.util.com import check_type, is_callable, set_logger
logger = set_logger(__name__)

__all__ = [
    "ProcRegistry",
]


class ProcRegistry(object):
    def __init__(self, colname_explain: np.ndarray,
                 colname_answer: np.ndarray):
        super().__init__()
        self.processing = {}
        self.default_proc(colname_explain, colname_answer)

    def default_proc(self, colname_explain: np.ndarray,
                     colname_answer: np.ndarray):
        logger.info("START")
        check_type(colname_explain, [np.ndarray])
        check_type(colname_answer, [np.ndarray])
        self.processing["default_x"] = {}
        self.processing["default_x"]["type"] = "x"
        self.processing["default_x"]["cols"] = colname_explain
        self.processing["default_x"]["proc"] = []
        self.processing["default_y"] = {}