def test_iris_one_student(self):
    """Build an Oracle with one student and check a persist/load round trip.

    Steps:
      1. Build an ``Oracle`` (``RuleModel`` teacher, one
         ``RandomForestModeler`` student) on the training data.
      2. Predict on the test data and sanity-check the predictions.
      3. Persist the oracle into a temporary model repository, load the
         persisted version into a fresh ``Oracle`` and verify that both
         oracles produce identical predictions.
    """
    # Function-scope import so this test does not depend on the module's
    # import block.
    from unittest import mock

    data = self.load_data()
    test_data = data['test_data']

    oracle = Oracle(teacher=RuleModel(),
                    student_modelers=[RandomForestModeler()])
    oracle.build(data['training_data'])

    pred = oracle.predict(test_data)['predictions']
    assert len(pred) == len(test_data['y'])
    # NOTE(review): this requires at least one prediction to differ from the
    # ground truth -- confirm that this is the intended sanity check.
    assert any(pred != test_data['y'])

    with tempfile.TemporaryDirectory() as path:
        # patch.dict restores os.environ on exit, so the repository path does
        # not leak into other tests once the temporary directory is removed.
        with mock.patch.dict(os.environ, {'H1ST_MODEL_REPO_PATH': path}):
            version = oracle.persist()

            oracle_2 = Oracle(teacher=RuleModel(),
                              student_modelers=[RandomForestModeler()])
            oracle_2.load_params(version)
            assert 'sklearn' in str(type(oracle_2.students[0].base_model))

            pred_2 = oracle_2.predict(test_data)['predictions']

            # Put both prediction sets side by side and require that no row
            # differs between the original and the reloaded oracle.
            pred_df = pd.concat((pred, pred_2), axis=1)
            pred_df.columns = ['predictions', 'pred_2']
            mismatches = pred_df[pred_df['predictions'] != pred_df['pred_2']]
            assert len(mismatches) == 0
def test_rule_based_ensemble_one_equipment(self):
    """Build a time-series oracle with a rule-based ensembler and no id column.

    The ``machineID`` column is dropped from the training features, the
    oracle is built from unlabeled data with only ``ts_col='date'``, and the
    test then checks that a persisted and reloaded ``TimeSeriesOracle``
    produces the same predictions as the original.
    """
    # Function-scope import so this test does not depend on the module's
    # import block.
    from unittest import mock

    data = self.load_data()
    training_data = data['training_data']
    # The drop is done in place on purpose: the same (mutated) training_data
    # is passed to both predict() calls below.
    training_data['X'].drop('machineID', axis=1, inplace=True)

    oracle_modeler = TimeseriesOracleModeler(
        teacher=RuleModel(),
        student_modelers=[RandomForestModeler(), AdaBoostModeler()],
        ensembler_modeler=RuleBasedModeler(
            model_class=RuleBasedClassificationModel))
    oracle = oracle_modeler.build_model(
        {'unlabeled_data': training_data['X']},
        ts_col='date')

    pred = oracle.predict(training_data)['predictions']
    assert len(pred) == 2

    with tempfile.TemporaryDirectory() as path:
        # patch.dict restores os.environ on exit, so the repository path does
        # not leak into other tests once the temporary directory is removed.
        with mock.patch.dict(os.environ, {'H1ST_MODEL_REPO_PATH': path}):
            version = oracle.persist()

            oracle_2 = TimeSeriesOracle(
                teacher=RuleModel(),
                students=[RandomForestModel(), AdaBoostModel()],
                ensembler=RuleBasedClassificationModel())
            oracle_2.load_params(version)
            assert 'sklearn' in str(type(oracle_2.students[0].base_model))

            pred_2 = oracle_2.predict(training_data)['predictions']

            # No row may differ between the original and reloaded oracle.
            pred_df = pd.DataFrame({'pred': pred, 'pred_2': pred_2})
            mismatches = pred_df[pred_df['pred'] != pred_df['pred_2']]
            assert len(mismatches) == 0
def __init__(self,
             teacher: PredictiveModel,
             ensembler_modeler: Modeler,
             student_modelers: List = None,
             model_class = Oracle
             ):
    """Store the components this modeler is configured with.

    Args:
        teacher: The teacher model.
        ensembler_modeler: The modeler for the ensembler.
        student_modelers: The student modelers. When omitted (``None``), a
            fresh ``[RandomForestModeler(), AdaBoostModeler()]`` list is
            created for this instance.
        model_class: The model class associated with this modeler; defaults
            to ``Oracle``.
    """
    # The defaults are created per call instead of in the signature: a list
    # literal in the signature is evaluated once at definition time and
    # would be shared (together with the modeler objects inside it) by every
    # instance that relies on the default.
    if student_modelers is None:
        student_modelers = [RandomForestModeler(), AdaBoostModeler()]

    self.teacher = teacher
    self.student_modelers = student_modelers
    self.ensembler_modeler = ensembler_modeler
    self.model_class = model_class
    self.stats = {}
def test_ml_based_ensemble(self):
    """Build a time-series oracle with an ML ensembler and check persistence.

    The oracle is built from unlabeled data plus a small labeled set (whose
    labels are simply ``0 .. num_samples - 1``), using ``machineID`` as the
    id column and ``date`` as the timestamp column. The test then checks
    that a persisted and reloaded ``TimeSeriesOracle`` produces the same
    predictions as the original.
    """
    # Function-scope import so this test does not depend on the module's
    # import block.
    from unittest import mock

    data = self.load_data()
    training_data = data['training_data']

    oracle_modeler = TimeseriesOracleModeler(
        teacher=RuleModel(),
        student_modelers=[RandomForestModeler(), AdaBoostModeler()],
        ensembler_modeler=MyMLModeler())

    num_samples = 2
    oracle = oracle_modeler.build_model(
        {
            'unlabeled_data': training_data['X'],
            'labeled_data': {
                'X_train': training_data['X'],
                'y_train': np.arange(num_samples),
                'X_test': training_data['X'],
                'y_test': np.arange(num_samples),
            },
        },
        id_col='machineID',
        ts_col='date')

    pred = oracle.predict(training_data)['predictions']
    assert len(pred) == num_samples

    with tempfile.TemporaryDirectory() as path:
        # patch.dict restores os.environ on exit, so the repository path does
        # not leak into other tests once the temporary directory is removed.
        with mock.patch.dict(os.environ, {'H1ST_MODEL_REPO_PATH': path}):
            version = oracle.persist()

            oracle_2 = TimeSeriesOracle(
                teacher=RuleModel(),
                students=[RandomForestModel(), AdaBoostModel()],
                ensembler=MyMLModel())
            oracle_2.load_params(version)
            assert 'sklearn' in str(type(oracle_2.students[0].base_model))

            pred_2 = oracle_2.predict(training_data)['predictions']

            # No row may differ between the original and reloaded oracle.
            pred_df = pd.DataFrame({'pred': pred, 'pred_2': pred_2})
            mismatches = pred_df[pred_df['pred'] != pred_df['pred_2']]
            assert len(mismatches) == 0