def create_partition(self, partition_type: str, **kwargs) -> tuple[str, str]:
    """Create the single train/validation partition and remember it.

    Args:
        partition_type: Either ``'random'`` or ``'manual'``.
        **kwargs: Options for the chosen type. ``'random'`` reads
            ``trn_ratio`` (required) and ``val_ratio`` (optional);
            ``'manual'`` reads ``trn_idx_id`` (required) and ``val_idx_id``
            (optional). The complete mapping is stored on the resulting
            ``Partition`` as ``param``.

    Returns:
        The pair ``(trn_idx_id, val_idx_id)``.

    Raises:
        RuntimeError: If a partition was already created.
        ValueError: If ``partition_type`` is neither of the known types.
        KeyError: If a required keyword argument is missing.
    """
    # The seed is reset before anything else, including the guard below.
    set_seed(self.seed)
    if self.partition is not None:
        raise RuntimeError('Partition already created.')

    # The table is loaded for every partition type, although only the
    # random split actually consumes it.
    table = self.catalog.load_table(self.data_id)

    if partition_type not in ('random', 'manual'):
        raise ValueError('Unknown partition_type.')

    if partition_type == 'manual':
        trn_idx_id = kwargs['trn_idx_id']
        val_idx_id = kwargs.get('val_idx_id')
        names = self._register_partition(trn_idx_id, val_idx_id=val_idx_id)
        # NOTE(review): when ``val_idx_id`` is omitted the value returned
        # below is ``(trn_idx_id, None)`` rather than ``names`` -- confirm
        # that this is intended.
    else:
        train_index, valid_index = split_index(
            table,
            kwargs['trn_ratio'],
            b_ratio=kwargs.get('val_ratio'),
        )
        trn_idx_id = self.catalog.save_index(self.data_id, train_index)
        val_idx_id = self.catalog.save_index(self.data_id, valid_index)
        names = (trn_idx_id, val_idx_id)

    self.partition = Partition(names=names, type=partition_type, param=kwargs)
    return trn_idx_id, val_idx_id
def create_cv_partition(self, cv_type: str, **kwargs) -> list[tuple[str, str]]:
    """Create the cross-validation partition and remember it.

    ``cv_type`` is matched case-insensitively, both against the keys of
    ``SELECTORS`` and against the special value ``'manual'``.

    Args:
        cv_type: Key of a selector in ``SELECTORS``, or ``'manual'``.
        **kwargs: For a selector, the keyword arguments forwarded to the
            selector's constructor. For ``'manual'``, must contain
            ``partition_idx_ids``, which is used as the list of
            ``(trn_idx_id, val_idx_id)`` pairs. The complete mapping is
            stored on the resulting ``CVPartition`` as ``param``.

    Returns:
        The list of ``(trn_idx_id, val_idx_id)`` pairs, one per fold.

    Raises:
        RuntimeError: If a CV partition was already created.
        ValueError: If ``cv_type`` is neither a known selector nor
            ``'manual'``.
        KeyError: If ``cv_type`` is ``'manual'`` and ``partition_idx_ids``
            is missing.
    """
    if self.cv_partition is not None:
        raise RuntimeError('CV partition already created.')

    set_seed(self.seed)
    data = self.catalog.load_table(self.data_id)

    # Normalise once so that selector names and 'manual' are treated the
    # same way; previously only the selector lookup ignored case.
    cv_key = cv_type.lower()
    selector_class = SELECTORS.get(cv_key)

    if selector_class is not None:
        selector: BaseSelector = selector_class(**kwargs)
        cv_names = []
        for idx in selector.split(data):
            trn_idx = pd.Index(idx[0])
            val_idx = pd.Index(idx[1])
            # Persist both halves of the fold and keep their catalog IDs.
            idx_ids = (
                self.catalog.save_index(self.data_id, trn_idx),
                self.catalog.save_index(self.data_id, val_idx),
            )
            cv_names.append(idx_ids)
    elif cv_key == 'manual':
        # The supplied ID pairs are used as given; they are not passed
        # through self._register_partition.
        cv_names = kwargs['partition_idx_ids']
    else:
        raise ValueError('Unknown cv_type.')

    # The caller's original spelling of cv_type is what gets recorded.
    self.cv_partition = CVPartition(cv_names=cv_names,
                                    type=cv_type,
                                    param=kwargs)
    return cv_names
def __init__(self,
             target_type: str,
             predictor_class: Type[BasePredictor],
             target: str,
             catalog: Catalog,
             data_id: str,
             metrics: Optional[list[Type[BaseMetric]]] = None,
             train_param: Optional[dict] = None,
             seed: Optional[int] = None) -> None:
    """Initialize object.

    Seeds the random state and stores the configuration. No data is
    loaded and no model is built here.

    Args:
        target_type: Stored as ``self.target_type``.
        predictor_class: Predictor class, stored as
            ``self.predictor_class``.
        target: Stored as ``self.target``.
        catalog: Catalog instance, stored as ``self.catalog``.
        data_id: Stored as ``self.data_id``.
        metrics: Metric classes, stored as given (``None`` is kept as
            ``None``).
        train_param: Training parameters; ``None`` becomes a new empty
            dict.
        seed: Seed to apply via ``set_seed``. When ``None``,
            ``set_seed_random()`` is called and its return value is used
            as the seed.
    """
    # Either apply the caller's seed or take the one reported by
    # set_seed_random(), so that self.seed always holds a value.
    if seed is None:
        seed = set_seed_random()
    else:
        set_seed(seed)
    self.seed = seed

    # Configuration supplied by the caller.
    self.target_type = target_type
    self.target = target
    self.catalog = catalog
    self.predictor_class = predictor_class
    self.metrics = metrics
    self.train_param = {} if train_param is None else train_param
    self.data_id = data_id

    # Results and state, empty until filled in elsewhere.
    self.score: dict = {}
    self.perm: dict = {}
    self.model_id: Optional[str] = None
    self.partition: Optional[Partition] = None
    self.cv_partition: Optional[CVPartition] = None
    self.cv_model_ids: Optional[list[str]] = None
    self.column_stats: Optional[pd.DataFrame] = None
def _calc_perm(self, model_id: str, idx_id: str, **kwargs) -> dict[str, pd.DataFrame]:
    """Run the stored model's ``calc_perm`` on the rows selected by an index.

    Args:
        model_id: Catalog ID of the model to load.
        idx_id: Catalog ID of the index; it is applied positionally
            (``.iloc``) to the table identified by ``self.data_id``.
        **kwargs: Forwarded unchanged to ``model.calc_perm``.

    Returns:
        Whatever ``model.calc_perm`` returns.
    """
    set_seed(self.seed)

    fitted = self.catalog.load_model(model_id)
    positions = self.catalog.load_index(idx_id)
    table = self.catalog.load_table(self.data_id)
    rows = table.iloc[positions]

    # Separate the target column from the remaining columns.
    features = rows.drop(columns=[self.target])
    labels = rows[self.target]

    # TODO: Save perm
    return fitted.calc_perm(features, labels, **kwargs)
def _validate(self, model_id: str, val_idx_id: str) -> dict:
    """Score a stored model on the rows selected by a validation index.

    Args:
        model_id: Catalog ID of the model to load.
        val_idx_id: Catalog ID of the index; it is applied positionally
            (``.iloc``) to the table identified by ``self.data_id``.

    Returns:
        The first element of the pair returned by ``model.validate``.
    """
    set_seed(self.seed)

    table = self.catalog.load_table(self.data_id)
    fitted = self.catalog.load_model(model_id)
    positions = self.catalog.load_index(val_idx_id)
    rows = table.iloc[positions]

    # Separate the target column from the remaining columns.
    features = rows.drop(columns=[self.target])
    labels = rows[self.target]

    # TODO: Save y_pred?
    score, _y_pred = fitted.validate(features, labels)
    return score
def split(self, x: X_TYPE, y: Y_TYPE = None) -> Iterable[tuple[pd.Index, pd.Index]]:
    """Yield one (train index, validation index) pair per seed.

    For every seed in ``self.seeds`` the seed is applied, then
    ``round(len(x) * self.trn_ratio)`` rows are sampled for training. The
    validation index is sampled from the remaining rows when
    ``self.val_ratio`` is truthy; otherwise it is all remaining rows.

    Args:
        x: Data to split; must provide ``shape``, ``sample`` and ``drop``.
        y: Not used.

    Yields:
        ``(trn_idx, val_idx)`` as index labels of ``x``.
    """
    for current_seed in self.seeds:
        set_seed(current_seed)
        n_rows = x.shape[0]

        train_index = x.sample(round(n_rows * self.trn_ratio)).index
        # Rows that were not picked for training.
        remainder = x.drop(train_index)

        if not self.val_ratio:
            yield train_index, remainder.index
            continue

        n_valid = round(n_rows * self.val_ratio)
        yield train_index, remainder.sample(n_valid).index
def _train(self, trn_idx_id: str) -> str:
    """Train a model on the rows selected by an index, unless one is stored.

    The model ID is derived from ``model.hash_model(x, y)``. Training and
    saving only happen when loading that ID from the catalog raises
    ``IndexError``.

    Args:
        trn_idx_id: Catalog ID of the index; it is applied positionally
            (``.iloc``) to the table identified by ``self.data_id``.

    Returns:
        The model ID.
    """
    set_seed(self.seed)

    table = self.catalog.load_table(self.data_id)
    positions = self.catalog.load_index(trn_idx_id)
    rows = table.iloc[positions]

    # Separate the target column from the remaining columns.
    features = rows.drop(columns=[self.target])
    labels = rows[self.target]

    candidate = self._get_model()
    model_id = candidate.hash_model(features, labels)

    try:
        # Only used as an existence probe; the loaded model is not needed.
        self.catalog.load_model(model_id)
    except IndexError:
        candidate.train(features, labels)
        saved_id = self.catalog.save_model(candidate)
        # The ID computed before training must match the one assigned on save.
        assert saved_id == model_id

    return model_id
def test_set_seed():
    """Test set_seed().

    Re-applying the same seed must reproduce the same ``random.random()``
    draw, and a different seed must produce a different one.
    """
    base_seed = 4

    seed.set_seed(base_seed)
    expected = random.random()

    # Same seed again: the draw repeats.
    seed.set_seed(base_seed)
    repeated = random.random()
    assert repeated == expected

    # Different seed: the draw differs.
    seed.set_seed(1)
    different = random.random()
    assert different != expected
from power_ml.ai.predictor.sklearn_predictor import SklearnPredictor
from power_ml.util.seed import set_seed

# Build the feature frame and the target from the loaded dataset.
dataset = load_boston()
feature_names = dataset['feature_names']
x = pd.DataFrame(dataset['data'], columns=feature_names)
y = dataset['target']

# Train model.
predictor = SklearnPredictor(LinearRegression, 'regression')
predictor.train(x, y)

# Initialize.
perm = PermutationImportance(predictor, MAE, x, y, n=20)

# Recommend to set seed.
set_seed(3)

# Get single column shuffled metric.
col = 'DIS'
metric = perm.shuffle_and_evaluate(col)
print('{}: {}'.format(col, metric))

# Iterate permutation importance; each item unpacks to
# (column, weight, score).
for col, weight, score in perm.iter_perm():
    print('{:10}, {:.4f}, {:.4f}'.format(col, weight, score))

# Analyzed permutation importance
# n_jobs=-1: Use all CPU cores.
perms = perm.calc(n_jobs=-1)
print(perms)
"""Example of model.""" import pandas as pd from sklearn.datasets import load_boston from power_ml.ai.metrics import MAE, MAPE from power_ml.ai.model import Model from power_ml.ai.predictor.light_gbm import LightGBM from power_ml.util.seed import set_seed # Recommend to set seed. set_seed(82) dataset = load_boston() x = pd.DataFrame(dataset['data'], columns=dataset['feature_names']) y = dataset['target'] x_trn, y_trn = x[:400], y[:400] x_tst, y_tst = x[400:], y[400:] param = { 'objective': 'regression', } predictor = LightGBM('regression', param=param) model = Model(predictor, metrics=[MAE, MAPE]) model.train(x_trn, y_trn) # Permutation Importance perms = model.calc_perm(x_trn, y_trn, n=5, n_jobs=1) for metric, perm in model.calc_perm(x_trn, y_trn, n=10, n_jobs=1).items(): print(metric) print(perm)
"""Example of seed.""" import random from power_ml.util.seed import set_seed, set_seed_random for v in [5, 5, 6]: set_seed(v) print('Seed: {}'.format(v)) print(random.random()) print(random.random()) v = set_seed_random() print('Seed: {}'.format(v)) print(random.random())