def test_select_model(train_data, grid_search, parallel_over_dict):
    """Check that ``select_model`` yields one ``good_dummy`` result per partition.

    When ``parallel_over_dict`` is truthy, only the rows matching its first
    ``column == value`` pair are used and the ``Region`` column is dropped
    from the frame handed to ``select_model``; otherwise the full
    ``train_data`` is passed through unchanged.
    """
    partition_columns = ["Region", "Product"]

    # Pick the frame that is actually fed to the model selection.
    if parallel_over_dict:
        col, value = next(iter(parallel_over_dict.items()))
        model_input = train_data[train_data[col] == value].drop(columns="Region")
    else:
        model_input = train_data

    results = select_model(
        model_input,
        target_col_name="Quantity",
        partition_columns=partition_columns,
        parallel_over_dict=parallel_over_dict,
        grid_search=grid_search,
        country_code_column="Holidays_code",
    )

    # Expected partitions are derived from the original frame, restricted to
    # the same rows that were selected above (if any restriction applied).
    if parallel_over_dict:
        expected_rows = train_data.loc[train_data[col] == value, partition_columns]
    else:
        expected_rows = train_data[partition_columns]
    partitions = expected_rows.drop_duplicates().to_dict(orient="records")

    assert len(results) == len(partitions)
    for result in results:
        assert result.best_model_name == "good_dummy"
        assert result.partition in partitions
def test_persist_experts_in_physical_partition(train_data, grid_search, tmp_path):
    """Persist every expert artifact and verify each file can be read back.

    Runs ``select_model`` partitioned by ``Product``, writes all six artifact
    kinds into ``tmp_path`` and then, for each result, reloads the files named
    ``<partition_hash>.<artifact>`` and compares them with the in-memory
    counterparts.
    """
    results = select_model(
        train_data,
        target_col_name="Quantity",
        partition_columns=["Product"],
        grid_search=grid_search,
    )

    persist_flags = {
        "persist_cv_results": True,
        "persist_cv_data": True,
        "persist_model_reprs": True,
        "persist_best_model": True,
        "persist_partition": True,
        "persist_model_selector_results": True,
    }
    persist_experts_in_physical_partition(tmp_path, results, **persist_flags)

    assert len(os.listdir(tmp_path)) == 18

    def artifact_path(res, suffix):
        # Files are named after the partition hash plus an artifact suffix.
        return os.path.join(tmp_path, res.partition_hash + suffix)

    def read_pickle(res, suffix):
        with open(artifact_path(res, suffix), "rb") as handle:
            return pickle.load(handle)

    def read_json(res, suffix):
        with open(artifact_path(res, suffix)) as handle:
            return json.load(handle)

    for result in results:
        # Pickled frames: same type and same columns as the originals.
        for frame_attr in ("cv_results", "cv_data"):
            loaded_frame = read_pickle(result, "." + frame_attr)
            expected_frame = getattr(result, frame_attr)
            assert isinstance(loaded_frame, type(expected_frame))
            assert all(loaded_frame.columns == expected_frame.columns)

        loaded_reprs = read_json(result, ".model_reprs")
        assert isinstance(loaded_reprs, type(result.model_reprs))
        assert loaded_reprs == result.model_reprs

        # Models are compared through the string form of their parameters.
        loaded_model = read_pickle(result, ".best_model")
        assert isinstance(loaded_model, type(result.best_model))
        assert str(loaded_model.get_params()) == str(result.best_model.get_params())

        loaded_partition = read_json(result, ".partition")
        assert loaded_partition == result.partition

        loaded_result = read_pickle(result, ".model_selector_result")
        assert isinstance(loaded_result, type(result))
        assert str(loaded_result.__dict__) == str(result.__dict__)
def test_load_expert(train_data, grid_search, tmp_path):
    """Persist all expert artifacts and verify ``_load_file`` retrieves them.

    Each artifact kind is loaded by ``partition_label`` and compared with the
    in-memory result; the best model is additionally loaded by
    ``partition_hash``. Finally, ``_load_file`` is expected to raise
    ``ValueError`` when both or neither of the two identifiers are supplied.
    """
    results = select_model(
        train_data,
        target_col_name="Quantity",
        partition_columns=["Product"],
        grid_search=grid_search,
    )

    persist_flags = {
        "persist_cv_results": True,
        "persist_cv_data": True,
        "persist_model_reprs": True,
        "persist_best_model": True,
        "persist_partition": True,
        "persist_model_selector_results": True,
    }
    persist_experts_in_physical_partition(tmp_path, results, **persist_flags)

    assert len(os.listdir(tmp_path)) == 18

    def load_expert(expert_type, **identifier):
        # Thin wrapper fixing the directory; ``identifier`` carries
        # ``partition_label`` and/or ``partition_hash``.
        return _load_file(path=tmp_path, expert_type=expert_type, **identifier)

    for result in results:
        label = result.partition

        # Frame artifacts: same type and same columns as the originals.
        for frame_attr in ("cv_results", "cv_data"):
            loaded_frame = load_expert(frame_attr, partition_label=label)
            expected_frame = getattr(result, frame_attr)
            assert isinstance(loaded_frame, type(expected_frame))
            assert all(loaded_frame.columns == expected_frame.columns)

        loaded_reprs = load_expert("model_reprs", partition_label=label)
        assert isinstance(loaded_reprs, type(result.model_reprs))
        assert loaded_reprs == result.model_reprs

        expected_params = str(result.best_model.get_params())

        model_by_label = load_expert("best_model", partition_label=label)
        assert isinstance(model_by_label, type(result.best_model))
        assert str(model_by_label.get_params()) == expected_params

        loaded_partition = load_expert("partition", partition_label=label)
        assert isinstance(loaded_partition, type(result.partition))
        assert loaded_partition == result.partition

        loaded_result = load_expert("model_selector_result", partition_label=label)
        assert isinstance(loaded_result, type(result))
        assert str(loaded_result.__dict__) == str(result.__dict__)

        # Same model, this time addressed with partition_hash.
        model_by_hash = load_expert("best_model", partition_hash=result.partition_hash)
        assert isinstance(model_by_hash, type(result.best_model))
        assert str(model_by_hash.get_params()) == str(result.best_model.get_params())

    # NOTE(review): the error checks below use the last ``result`` left over
    # from the loop above — confirm they were not meant to run per result.
    # Supplying both identifiers at once is rejected.
    with pytest.raises(ValueError):
        load_expert(
            "best_model",
            partition_label=result.partition,
            partition_hash=result.partition_hash,
        )

    # Supplying neither identifier is rejected as well.
    with pytest.raises(ValueError):
        load_expert("best_model")