def check_dataframe_diff(df1: pd.DataFrame, df2: pd.DataFrame, logger: logging.Logger = None, max_count: int = 10, ignore_same: bool = False):
    """
    Log the differences between two DataFrames: shape, index, columns,
    and cell values (values are only compared at shared index/column positions).

    Params::
        df1, df2: DataFrames to compare.
        logger: logger to report to. A default logger is created when None.
        max_count: maximum number of differing value pairs shown per column.
        ignore_same: when True, suppress the "... is same." info messages.
    """
    if logger is None:
        from kkpackage.util.logger import set_logger  # imported lazily on purpose (avoid mandatory dependency at import time)
        logger = set_logger(__name__)
    # nan == nan evaluates to False in pandas, so fill NaN with a sentinel before comparing
    df1, df2 = df1.copy().fillna(-999), df2.copy().fillna(-999)
    logger.info("check dataframe shape.", color=["BOLD", "GREEN"])
    logger.info(f"df1 shape: {df1.shape}")
    logger.info(f"df2 shape: {df2.shape}")
    logger.info("check dataframe index.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.index, df2.index
    # BUGFIX: original wrote (~(ndf1 == ndf2).sum() > 0). "~" bound to the int
    # sum, which is always negative for a non-negative count, so element-wise
    # index differences were never detected. The shape check short-circuits the
    # element-wise comparison when lengths differ (which would raise otherwise).
    if (ndf1.shape[0] != ndf2.shape[0]) or ((ndf1 != ndf2).sum() > 0):
        logger.warning("index is different.")
        same_index = values_include(ndf1, ndf2)
        logger.debug(f"same index: {same_index}")
        logger.warning(f"only df1 index: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 index: {values_not_include(ndf1, ndf2)}")
    else:
        if ignore_same == False:
            logger.info("index is same.", color=["BOLD", "BLUE"])
    logger.info("check dataframe columns.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.columns, df2.columns
    same_columns = values_include(ndf1, ndf2)
    # Same precedence fix as the index check above.
    if (ndf1.shape[0] != ndf2.shape[0]) or ((ndf1 != ndf2).sum() > 0):
        logger.warning("columns is different.")
        logger.debug(f"same columns: {same_columns}")
        # BUGFIX: these two messages previously said "index" while reporting columns
        logger.warning(f"only df1 columns: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 columns: {values_not_include(ndf1, ndf2)}")
    else:
        if ignore_same == False:
            logger.info("columns is same.", color=["BOLD", "BLUE"])
    logger.info("we check only same indexes and same columns", color=["BOLD", "GREEN"])
    # Restrict both frames to the shared index/column positions before the value scan
    df1 = df1.loc[df1.index.isin(df2.index), df1.columns.isin(df2.columns)]
    df2 = df2.loc[df1.index, df1.columns]
    logger.info("check whole data.", color=["BOLD", "GREEN"])
    for x in same_columns:
        sebool = (df1[x] == df2[x])
        if (~sebool).sum() > 0:
            logger.warning(
                f'"{x}" is different. different count: {(~sebool).sum()}. different index: {df1.index[~sebool]}. {(~sebool).sum()}. different values: {[(_x, _y, ) for _x, _y in zip(df1.loc[~sebool, x].iloc[:max_count].values, df2.loc[~sebool, x].iloc[:max_count].values)]}'
            )
        else:
            if ignore_same == False:
                logger.info(f'"{x}" is same.', color=["BOLD", "BLUE"])
def __init__(self, model: torch.nn.Module):
    """Store the given torch module plus a deep copy used for later re-initialization."""
    self.logger = set_logger(_logname + ".MyTorch", log_level="info")
    self.logger.info("START")
    self.model = model
    # pristine copy of the network, kept so training state can be reset
    self.model_init = copy.deepcopy(self.model)
    # one entry per loss function: 1 = classification, 2 = regression
    self.model_type_list = []
    # one criterion per output head (multi-task learning uses several)
    self.criterion_list = []
    self.is_cuda = False
    self.learning_rate = None
    self.n_epoch = None
    self.batch_size = None
    self.optimizer = None
    self.optimizer_init = None
    self.classes_ = None
    self.logger.info("END")
from functools import partial import os, json import numpy as np import pandas as pd import optuna import sqlite3 # local package from kkpackage.util.learning import evalate, split_data_balance, conv_validdata_in_fitparmas, is_classification_model from kkpackage.util.common import is_callable from kkpackage.util.logger import set_logger logger = set_logger(__name__) # optuna に埋め込むベース関数 # X, Y はそれぞれ1種類を想定する def optuna_base_function(X: np.ndarray, Y: np.ndarray, X_test: np.ndarray, Y_test: np.ndarray, model: object, dict_param: dict, tuning_eval: str, split_params: dict, fit_params: dict, eval_params: dict, trial): """ optuna に埋め込むためのBaseFunction X: 訓練データ Y: 訓練正解ラベル X_test: テストデータ Y_test: テスト正解ラベル model: ハイパーパラメータサーチしたいモデル dict_param = {"learning_rate":["category",0.05,0.1,0.2,0.3,0.5], "n_estimators":["int",500,1500], "max_depth":["int",3,10], "gamma":["float",0.,0.3], "min_child_weight":["int",1,20], "subsample":["step", 0.5, 0.9, 0.1], "colsample_bytree":["step", 0.1, 0.9, 0.1], "random_state":["const",1], "n_jobs":["const", -1]} tuning_eval: kkpackage.util.features.evalate にあるtuning_eval split_params:
def __init__(self, *layers):
    """
    Build the network from layer tuples.

    Each element of *layers is a tuple:
        (layer_type, layer_name, nodes, outjoin, *param[, params_dict])
    where ``nodes`` is () or (out,) or (in, out).  ``layer_type`` must follow
    the ordering  Input1.., Input2.., Common.., Output1.., Output2.. ; the
    first pass below validates that ordering and computes the input sizes of
    the first Common layer and of each first OutputX layer, the second pass
    actually registers the modules.
    """
    self.logger = set_logger(_logname + ".TorchNN", log_level="info")
    self.logger.info("START")
    # parent-class constructor
    super().__init__()
    self.num_modules = 0
    self.num_inputs = 0
    self.num_outputs = 0
    self.outjoin_list = []
    # First pass: validate the layer ordering. The expected sequence is:
    # Input1, Input1, ..., Input2, ..., Common, Common, ..., Output1, Output1, ..., Output2, ..
    prev_type, prev_nodes = None, 0
    # ilayer_fist_common: index of the first Common layer;
    # ilayer_last_common: indices of the first layer of each OutputX group.
    ilayer_fist_common, ilayer_last_common = 0, []
    # fi_size_common: total input width of the first Common layer (sum of the
    # last node count of every InputX group); lo_size_common: input width of
    # the first OutputX layer (last Common layer's node count).
    fi_size_common, lo_size_common = 0, 0
    for i, layer in enumerate(layers):
        # unpack; only layer_type/nodes are needed in this pass
        layer_type, layer_name, nodes, *param = layer
        # an optional trailing dict carries keyword params (e.g. bidirectional)
        params = {}
        if (len(param) > 0) and (type(param[-1]) == dict):
            params = param[-1]
            param = param[:-1]
        if prev_type is None:
            # the very first layer must be Input1
            if layer_type.find("Input1") == 0:
                prev_type = layer_type
                self.num_inputs += 1
            else:
                self.logger.raise_error(f"unexpected value. layer_type: {layer_type}")
        elif prev_type.find("Input") == 0:
            if layer_type.find("Common") == 0:
                # transition to Common; no further Input layers may appear
                prev_type = layer_type
                fi_size_common += prev_nodes
                ilayer_fist_common = i
            elif prev_type == layer_type:
                # same InputX group continues
                prev_type = layer_type
            elif (layer_type.find("Input") == 0) and ((int(prev_type[-1]) + 1) == int(layer_type[-1])):
                # transition from InputX to InputX+1 (X is the last char — TODO confirm only single-digit groups are intended)
                prev_type = layer_type
                fi_size_common += prev_nodes
                self.num_inputs += 1
            else:
                self.logger.raise_error(f"unexpected value. layer_type: {layer_type}")
        elif prev_type.find("Common") == 0:
            if layer_type.find("Common") == 0:
                prev_type = layer_type
            elif (layer_type.find("Output1") == 0):
                # Common group ends, OutputX groups begin
                prev_type = layer_type
                lo_size_common += prev_nodes
                ilayer_last_common.append(i)
                self.num_outputs += 1
            else:
                self.logger.raise_error(f"unexpected value. layer_type: {layer_type}")
        elif prev_type.find("Output") == 0:
            if prev_type == layer_type:
                # same OutputX group continues
                prev_type = layer_type
            elif (layer_type.find("Output") == 0) and ((int(prev_type[-1]) + 1) == int(layer_type[-1])):
                # transition from OutputX to OutputX+1
                prev_type = layer_type
                ilayer_last_common.append(i)
                self.num_outputs += 1
            else:
                self.logger.raise_error(f"unexpected value. layer_type: {layer_type}")
        # record this layer's output node count for the next iteration
        if len(nodes) == 0:
            continue
        elif len(nodes) == 1:
            prev_nodes = nodes[0]
        elif len(nodes) == 2:
            prev_nodes = nodes[1]
        else:
            self.logger.raise_error(f"unexpected value. nodes: {nodes}")
        # some options change the effective output width: a bidirectional
        # layer emits twice its nominal node count
        if (params.get("bidirectional") is not None) and (params.get("bidirectional") == True):
            prev_nodes = prev_nodes * 2
    # Second pass: register the layers as modules
    for i, layer in enumerate(layers):
        # NOTE(review): this pass also unpacks ``outjoin``, which the first
        # pass folded into *param — confirm the tuple format is consistent.
        layer_type, layer_name, nodes, outjoin, *param = layer
        # an optional trailing dict carries keyword params
        params = {}
        if (len(param) > 0) and (type(param[-1]) == dict):
            params = param[-1]
            param = param[:-1]
        # record in/out sizes for the module about to be added;
        # the InputX -> Common and Common -> OutputX boundaries are special-cased below
        if len(nodes) == 0:
            pass
        elif len(nodes) == 1:
            self.out_size = nodes[0]
        elif len(nodes) == 2:
            self.in_size = nodes[0]
            self.out_size = nodes[1]
        else:
            self.logger.raise_error(f"unexpected value. nodes: {nodes}")
        # first Common layer / first layer of each OutputX group get the
        # aggregated input widths computed in the first pass
        if i == ilayer_fist_common:
            self.in_size = fi_size_common
        elif i in ilayer_last_common:
            self.in_size = lo_size_common
        # record where this layer's output joins (to express ResNet-style
        # skip connections); kept in sync with the added modules
        self.outjoin_list.append(outjoin if type(outjoin) == type("") else "")
        # add the layer as a module
        self.__AddModule(layer_type, layer_name, *param, **params)
    # compile the computation plan
    self.__Compile()
    self.logger.info("END")
import copy from typing import List import torch import pandas as pd import numpy as np from scipy import stats # local package from kkpackage.util.learning import evalate from kkpackage.util.common import check_type from kkpackage.util.logger import set_logger _logname = __name__ logger = set_logger() class TorchNN(torch.nn.Module): """ nn.Moduleを継承して新しいクラスを作る Layerの名前を固定して、複数入力ゆや複数出力に対応できるようにする *layers : ((layer1), (layer2), (layer3), ...) ※layer1 : (layer_type, layer_name, () or (出力ノード数) or (入力ノード数, 出力ノード数), *param, **params) ※layer_type(string) : 基本的に定義の上から順に同typeのlayerを計算する : InputX(Xは1から始まる番号. 別特徴量の入力を指す) : Common(全結合層. InputXが横並びで入力される層) : OutputX(Xは1から始まる番号. マルチタスク用に別出力層を定義) """ def __init__(self, *layers): self.logger = set_logger(_logname + ".TorchNN", log_level="info") self.logger.info("START") # 親クラスのコンストラクタ super().__init__()
import time
import signal
import subprocess
from time import sleep
from urllib import request

import requests
import psutil

# local package
from kkpackage.util.logger import set_logger
logger = set_logger(name=__name__)


def download(url: str, savepath: str, proxies: dict = None):
    """
    Download the file at ``url`` and save it to ``savepath``.

    Params::
        url: download url
        savepath: destination path on disk
        proxies: optional proxy mapping for urllib, e.g.
            proxies = {"http": "http://user:[email protected]:8080"}
    """
    if proxies is not None:
        # register the proxies on urllib's process-wide opener
        handler = request.ProxyHandler(proxies)
        request.install_opener(request.build_opener(handler))
    logger.info(f"download url: {url}")
    request.urlretrieve(url, savepath)