示例#1
0
def check_dataframe_diff(df1: pd.DataFrame,
                         df2: pd.DataFrame,
                         logger: logging.Logger = None,
                         max_count: int = 10,
                         ignore_same: bool = False):
    """
    Log the differences between two DataFrames: shape, index, columns and values.

    Params::
        df1, df2: DataFrames to compare. Both are copied and their NaN values
            are replaced with -999 before any comparison.
        logger: logger used for the report; its info() is called with a
            `color` keyword. When None, one is created with
            kkpackage.util.logger.set_logger(__name__).
        max_count: maximum number of differing value pairs shown per column.
        ignore_same: when True, the "... is same." messages are suppressed.
    """
    if logger is None:
        # Imported lazily because the logger package is only needed when the
        # caller does not supply a logger.
        from kkpackage.util.logger import set_logger
        logger = set_logger(__name__)
    # nan == nan is False inside a Series, so fill NaN with a sentinel first.
    df1, df2 = df1.copy().fillna(-999), df2.copy().fillna(-999)

    logger.info("check dataframe shape.", color=["BOLD", "GREEN"])
    logger.info(f"df1 shape: {df1.shape}")
    logger.info(f"df2 shape: {df2.shape}")

    logger.info("check dataframe index.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.index, df2.index
    if _labels_differ(ndf1, ndf2):
        logger.warning("index is different.")
        same_index = values_include(ndf1, ndf2)
        logger.debug(f"same index: {same_index}")
        logger.warning(f"only df1 index: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 index: {values_not_include(ndf1, ndf2)}")
    elif not ignore_same:
        logger.info("index is same.", color=["BOLD", "BLUE"])

    logger.info("check dataframe columns.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.columns, df2.columns
    # Needed below for the value comparison even when the columns match.
    same_columns = values_include(ndf1, ndf2)
    if _labels_differ(ndf1, ndf2):
        logger.warning("columns is different.")
        logger.debug(f"same columns: {same_columns}")
        logger.warning(f"only df1 columns: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 columns: {values_not_include(ndf1, ndf2)}")
    elif not ignore_same:
        logger.info("columns is same.", color=["BOLD", "BLUE"])

    logger.info("we check only same indexes and same columns",
                color=["BOLD", "GREEN"])
    # Restrict df1 to the labels df2 also has, then align df2 to df1.
    df1 = df1.loc[df1.index.isin(df2.index), df1.columns.isin(df2.columns)]
    df2 = df2.loc[df1.index, df1.columns]

    logger.info("check whole data.", color=["BOLD", "GREEN"])
    for x in same_columns:
        is_same = (df1[x] == df2[x])
        n_diff = (~is_same).sum()
        if n_diff > 0:
            # Show at most max_count (df1 value, df2 value) pairs.
            diff_pairs = list(
                zip(df1.loc[~is_same, x].iloc[:max_count].values,
                    df2.loc[~is_same, x].iloc[:max_count].values))
            logger.warning(
                f'"{x}" is different. different count: {n_diff}. '
                f'different index: {df1.index[~is_same]}. '
                f'different values: {diff_pairs}')
        elif not ignore_same:
            logger.info(f'"{x}" is same.', color=["BOLD", "BLUE"])


def _labels_differ(labels1: pd.Index, labels2: pd.Index) -> bool:
    """Return True when two label sets differ in length or at any position."""
    # The length check must come first: comparing Index objects of different
    # lengths element-wise raises.
    if labels1.shape[0] != labels2.shape[0]:
        return True
    return bool((labels1 != labels2).any())
示例#2
0
 def __init__(self, model: torch.nn.Module):
     """
     Store the model and reset every training setting to its initial value.

     Params::
         model: network to wrap; a deep copy of it is kept in `model_init`.
     """
     log = set_logger(_logname + ".MyTorch", log_level="info")
     self.logger = log
     log.info("START")

     # The model and a deep copy of it taken at construction time.
     self.model = model
     self.model_init = copy.deepcopy(self.model)

     # 1: classification, 2: regression. One entry per loss function.
     self.model_type_list = []
     self.is_cuda = False

     # Training settings; nothing is configured yet.
     self.learning_rate = self.n_epoch = self.batch_size = None

     # With multiple outputs, multiple loss functions are considered
     # (multi-task learning).
     self.criterion_list = []
     self.optimizer = self.optimizer_init = self.classes_ = None

     log.info("END")
示例#3
0
from functools import partial
import os, json
import numpy as np
import pandas as pd
import optuna
import sqlite3
# local package
from kkpackage.util.learning import evalate, split_data_balance, conv_validdata_in_fitparmas, is_classification_model
from kkpackage.util.common import is_callable
from kkpackage.util.logger import set_logger
# Module-level logger, created with this module's __name__.
logger = set_logger(__name__)


# optuna に埋め込むベース関数
# X, Y はそれぞれ1種類を想定する
def optuna_base_function(X: np.ndarray, Y: np.ndarray, X_test: np.ndarray,
                         Y_test: np.ndarray, model: object, dict_param: dict,
                         tuning_eval: str, split_params: dict,
                         fit_params: dict, eval_params: dict, trial):
    """
    optuna に埋め込むためのBaseFunction
    X: 訓練データ
    Y: 訓練正解ラベル
    X_test: テストデータ
    Y_test: テスト正解ラベル
    model: ハイパーパラメータサーチしたいモデル
    dict_param = {"learning_rate":["category",0.05,0.1,0.2,0.3,0.5], "n_estimators":["int",500,1500], "max_depth":["int",3,10], 
                  "gamma":["float",0.,0.3], "min_child_weight":["int",1,20], "subsample":["step", 0.5, 0.9, 0.1], 
                  "colsample_bytree":["step", 0.1, 0.9, 0.1], "random_state":["const",1], "n_jobs":["const", -1]}
    tuning_eval: kkpackage.util.features.evalate にあるtuning_eval
    split_params:
示例#4
0
    def __init__(self, *layers):
        """
        Validate the layer definitions and register one module per layer.

        Params::
            *layers: layer definitions, each unpacked as
                (layer_type, layer_name, nodes, outjoin, *param); a trailing
                dict in param is passed on as keyword arguments.
                layer_type: must appear in the order
                    Input1.., Input2.., ..., Common.., Output1.., Output2.., ...
                    Any other order is reported through logger.raise_error.
                nodes: (), (out_nodes,) or (in_nodes, out_nodes).
                outjoin: stored in outjoin_list when it is a str; otherwise
                    "" is stored.
        """
        self.logger = set_logger(_logname + ".TorchNN", log_level="info")
        self.logger.info("START")
        # Parent class constructor.
        super().__init__()
        self.num_modules = 0
        self.num_inputs = 0
        self.num_outputs = 0
        self.outjoin_list = []

        def split_params(extra):
            """Separate a trailing keyword dict from the positional extras."""
            if extra and isinstance(extra[-1], dict):
                return extra[:-1], extra[-1]
            return extra, {}

        def trailing_number(name):
            """Return the int formed by the trailing digits of name, or None."""
            head = name.rstrip("0123456789")
            digits = name[len(head):]
            return int(digits) if digits else None

        def is_next_group(prev, cur):
            """True when cur's group number is exactly prev's number + 1."""
            # Whole trailing numbers are compared (not just the last
            # character) so that e.g. Input9 -> Input10 is accepted.
            prev_no, cur_no = trailing_number(prev), trailing_number(cur)
            return (prev_no is not None and cur_no is not None
                    and prev_no + 1 == cur_no)

        def unexpected_type(value):
            self.logger.raise_error(f"unexpected value. layer_type: {value}")

        # First pass: check the layer structure. The expected order is
        # Input1, Input1, ..., Input2, ..., Common, Common, ..., Output1, Output1, ..., Output2, ..
        prev_type, prev_nodes = None, 0
        # None until a Common layer is seen, so that no layer index matches
        # by accident when the definition has no Common layer.
        ilayer_first_common, ilayer_last_common = None, []
        fi_size_common, lo_size_common = 0, 0
        for i, layer in enumerate(layers):
            # Only the leading fields are needed here.
            layer_type, _layer_name, nodes, *param = layer
            # Pick up the keyword dict, if any.
            _, params = split_params(param)
            if prev_type is None:
                if layer_type.startswith("Input1"):
                    prev_type = layer_type
                    self.num_inputs += 1
                else:
                    unexpected_type(layer_type)
            elif prev_type.startswith("Input"):
                if layer_type.startswith("Common"):
                    # Switching to Common; no Input may follow from here on.
                    prev_type = layer_type
                    fi_size_common += prev_nodes
                    ilayer_first_common = i
                elif prev_type == layer_type:
                    # The same InputX continues.
                    prev_type = layer_type
                elif (layer_type.startswith("Input")
                      and is_next_group(prev_type, layer_type)):
                    # Moving from InputX to InputX+1.
                    prev_type = layer_type
                    fi_size_common += prev_nodes
                    self.num_inputs += 1
                else:
                    unexpected_type(layer_type)
            elif prev_type.startswith("Common"):
                if layer_type.startswith("Common"):
                    prev_type = layer_type
                elif layer_type.startswith("Output1"):
                    # Common ends and the first OutputX begins.
                    prev_type = layer_type
                    lo_size_common += prev_nodes
                    ilayer_last_common.append(i)
                    self.num_outputs += 1
                else:
                    unexpected_type(layer_type)
            elif prev_type.startswith("Output"):
                if prev_type == layer_type:
                    # The same OutputX continues.
                    prev_type = layer_type
                elif (layer_type.startswith("Output")
                      and is_next_group(prev_type, layer_type)):
                    # Moving from OutputX to OutputX+1.
                    prev_type = layer_type
                    ilayer_last_common.append(i)
                    self.num_outputs += 1
                else:
                    unexpected_type(layer_type)

            # Record the number of output nodes of this layer.
            if len(nodes) == 0:
                continue
            elif len(nodes) == 1:
                prev_nodes = nodes[0]
            elif len(nodes) == 2:
                prev_nodes = nodes[1]
            else:
                self.logger.raise_error(f"unexpected value. nodes: {nodes}")
            # Some options change the number of output nodes; adjust for them.
            # (`== True` is kept on purpose: it also matches a value of 1.)
            if params.get("bidirectional") == True:
                prev_nodes = prev_nodes * 2

        # Second pass: add the layers.
        for i, layer in enumerate(layers):
            # Break the definition apart.
            layer_type, layer_name, nodes, outjoin, *param = layer
            # Pick up the keyword dict, if any.
            param, params = split_params(param)

            # Set the input / output node counts.
            # The InputX -> Common and Common -> OutputX boundaries are
            # special and are handled just below.
            if len(nodes) == 0:
                pass
            elif len(nodes) == 1:
                self.out_size = nodes[0]
            elif len(nodes) == 2:
                self.in_size = nodes[0]
                self.out_size = nodes[1]
            else:
                self.logger.raise_error(f"unexpected value. nodes: {nodes}")

            # First Common layer, or first layer of an OutputX group.
            if i == ilayer_first_common:
                self.in_size = fi_size_common
            elif i in ilayer_last_common:
                self.in_size = lo_size_common

            # Record where this layer's output is joined (to express
            # ResNet-like structures). Appended once per layer so the list
            # stays in step with the modules being added.
            self.outjoin_list.append(outjoin if isinstance(outjoin, str) else "")

            # Add the layer.
            self.__AddModule(layer_type, layer_name, *param, **params)

        # Compile the computation.
        self.__Compile()
        self.logger.info("END")
示例#5
0
import copy
from typing import List
import torch
import pandas as pd
import numpy as np
from scipy import stats

# local package
from kkpackage.util.learning import evalate
from kkpackage.util.common import check_type
from kkpackage.util.logger import set_logger
# Module name; used as the prefix when building per-class logger names.
_logname = __name__
# NOTE(review): set_logger is called without a name here, unlike the
# per-class logger built from _logname — confirm this is intended.
logger = set_logger()


class TorchNN(torch.nn.Module):
    """
    nn.Moduleを継承して新しいクラスを作る
    Layerの名前を固定して、複数入力ゆや複数出力に対応できるようにする
    *layers  : ((layer1), (layer2), (layer3), ...)
    ※layer1 : (layer_type, layer_name, () or (出力ノード数) or (入力ノード数, 出力ノード数), *param, **params)
    ※layer_type(string) : 基本的に定義の上から順に同typeのlayerを計算する
                         : InputX(Xは1から始まる番号. 別特徴量の入力を指す)
                         : Common(全結合層. InputXが横並びで入力される層)
                         : OutputX(Xは1から始まる番号. マルチタスク用に別出力層を定義)
    """
    def __init__(self, *layers):
        self.logger = set_logger(_logname + ".TorchNN", log_level="info")
        self.logger.info("START")
        # 親クラスのコンストラクタ
        super().__init__()
示例#6
0
import time
from urllib import request
import requests
import subprocess
import psutil
from time import sleep
import signal

# local package
from kkpackage.util.logger import set_logger
# Module-level logger, created with this module's __name__.
logger = set_logger(name=__name__)


def download(url: str, savepath: str, proxies: dict = None):
    """
    Download the resource at `url` and save it to `savepath`.
    Params::
        url: URL to download from
        savepath: path the downloaded file is written to
        proxies: optional mapping handed to urllib.request.ProxyHandler, e.g.
            proxies = {
                "http": "http://<user>:<password>@<proxy-host>:8080"
            }
            When given, an opener using these proxies is installed through
            request.install_opener before the download starts.
    """
    if proxies is not None:
        # Build a proxy-aware opener and make it the one urlretrieve uses.
        proxy_opener = request.build_opener(request.ProxyHandler(proxies))
        request.install_opener(proxy_opener)
    logger.info(f"download url: {url}")
    request.urlretrieve(url, savepath)