def test_save_evolution(self):
    """Run a short SwarmIntelligence optimization, save all evolution
    artifacts and verify each expected pickle file exists in 'data/'.
    """
    _feature_engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                         target_feature='AveragePrice',
                                                         temp_dir=DATA_DIR
                                                         )
    _feature_engineer.set_predictors(exclude_original_data=False)
    _si: SwarmIntelligence = SwarmIntelligence(mode='model',
                                               target=_feature_engineer.get_target(),
                                               input_file_path=None,
                                               train_data_file_path=None,
                                               test_data_file_path=None,
                                               valid_data_file_path=None,
                                               df=None,
                                               data_set=None,
                                               features=_feature_engineer.get_predictors(),
                                               re_split_data=False,
                                               re_sample_cases=False,
                                               re_sample_features=False,
                                               re_populate=True,
                                               max_trials=2,
                                               max_features=-1,
                                               labels=None,
                                               models=['cat'],
                                               model_params=None,
                                               burn_in_adjustments=-1,
                                               warm_start=True,
                                               warm_start_strategy='monotone',
                                               warm_start_constant_hidden_layers=0,
                                               warm_start_constant_category='very_small',
                                               max_adjustments=5,
                                               pop_size=64,
                                               adjustment_rate=0.1,
                                               adjustment_prob=0.85,
                                               early_stopping=0,
                                               convergence=True,
                                               convergence_measure='median',
                                               timer_in_seconds=43200,
                                               force_target_type=None,
                                               plot=False,
                                               output_file_path='data',
                                               deep_learning_type='batch',
                                               deep_learning_output_size=None,
                                               log=False,
                                               feature_engineer=_feature_engineer,
                                               sampling_function=None
                                               )
    _si.optimize()
    _si.save_evolution(si=True,
                       model=True,
                       evolution_history=True,
                       adjustment_history=True,
                       final_adjustment=True
                       )
    # NOTE(review): 'GeneticAlgorithm' looks copy-pasted from the GA test —
    # confirm SwarmIntelligence really pickles itself under that file name.
    _expected_pickles: List[str] = ['model',
                                    'GeneticAlgorithm',
                                    'evolution_history',
                                    'adjustment_history',
                                    'final_adjustment'
                                    ]
    # Generator + all() instead of a manual bool-append loop; the old loop
    # variable also shadowed the stdlib 'pickle' module name.
    self.assertTrue(expr=all(os.path.isfile('data/{}.p'.format(file_name)) for file_name in _expected_pickles))
def test_merge_engineer(self):
    """Drop the categorical features from a re-loaded engineer, merge the
    previously saved engineer back in and check that the complete feature
    list is restored.
    """
    _baseline_features: List[str] = FEATURE_ENGINEER.get_features()
    _baseline_features.sort(reverse=False)
    _temp_engineer: FeatureEngineer = FeatureEngineer(df=None,
                                                      file_path=DATA_FILE_PATH,
                                                      target_feature='AveragePrice'
                                                      )
    _cat_features: List[str] = _temp_engineer.get_feature_types().get('categorical')
    _temp_engineer.save(file_path='data/feature_learning_cat.p',
                        cls_obj=True,
                        overwrite=True,
                        create_dir=False
                        )
    del _temp_engineer
    _restored_engineer: FeatureEngineer = FeatureEngineer(feature_engineer_file_path='data/feature_learning_cat.p')
    _restored_engineer.clean(markers=dict(features=_cat_features))
    _restored_engineer.merge_engineer(feature_engineer_file_path='data/feature_learning_cat.p')
    _merged_features: List[str] = _restored_engineer.get_features()
    _merged_features.sort(reverse=False)
    self.assertListEqual(list1=_baseline_features, list2=_merged_features)
def test_optimize_modeling(self):
    """Run a short GeneticAlgorithm model optimization and check that the
    maximum fitness gradient did not deteriorate across generations.
    """
    _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                 target_feature='AveragePrice',
                                                 temp_dir=DATA_DIR,
                                                 auto_typing=False
                                                 )
    _engineer.set_predictors(exclude_original_data=False)
    _genetic_algorithm: GeneticAlgorithm = GeneticAlgorithm(mode='model',
                                                            target=_engineer.get_target(),
                                                            input_file_path=None,
                                                            train_data_file_path=None,
                                                            test_data_file_path=None,
                                                            valid_data_file_path=None,
                                                            df=None,
                                                            data_set=None,
                                                            features=_engineer.get_predictors(),
                                                            re_split_data=False,
                                                            re_sample_cases=False,
                                                            re_sample_features=False,
                                                            re_populate=True,
                                                            max_trials=2,
                                                            max_features=-1,
                                                            labels=None,
                                                            models=['cat'],
                                                            model_params=None,
                                                            burn_in_generations=-1,
                                                            warm_start=True,
                                                            warm_start_strategy='monotone',
                                                            warm_start_constant_hidden_layers=0,
                                                            warm_start_constant_category='very_small',
                                                            max_generations=2,
                                                            pop_size=4,
                                                            mutation_rate=0.1,
                                                            mutation_prob=0.85,
                                                            parents_ratio=0.5,
                                                            early_stopping=0,
                                                            convergence=True,
                                                            convergence_measure='median',
                                                            timer_in_seconds=43200,
                                                            force_target_type=None,
                                                            plot=False,
                                                            output_file_path='data',
                                                            deep_learning_type='batch',
                                                            deep_learning_output_size=None,
                                                            log=False,
                                                            verbose=False,
                                                            checkpoint=True,
                                                            feature_engineer=_engineer,
                                                            sampling_function=None
                                                            )
    _genetic_algorithm.optimize()
    _max_gradient: list = _genetic_algorithm.evolution_gradient.get('max')
    self.assertTrue(expr=_max_gradient[0] <= _max_gradient[-1])
def test_optimize_continue(self):
    """Optimize with SwarmIntelligence and verify both a non-decreasing max
    fitness gradient and that more gradient entries than max_adjustments
    were recorded.

    NOTE(review): despite the test name, no explicit "continue" call is
    visible here — presumably optimize() itself extends the run past
    max_adjustments; confirm against the SwarmIntelligence API.
    """
    _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                 target_feature='AveragePrice',
                                                 temp_dir=DATA_DIR
                                                 )
    _engineer.set_predictors(exclude_original_data=False)
    _adjustment_limit: int = 5
    _swarm: SwarmIntelligence = SwarmIntelligence(mode='model',
                                                  target=_engineer.get_target(),
                                                  input_file_path=None,
                                                  train_data_file_path=None,
                                                  test_data_file_path=None,
                                                  valid_data_file_path=None,
                                                  df=None,
                                                  data_set=None,
                                                  features=_engineer.get_predictors(),
                                                  re_split_data=False,
                                                  re_sample_cases=False,
                                                  re_sample_features=False,
                                                  re_populate=True,
                                                  max_trials=2,
                                                  max_features=-1,
                                                  labels=None,
                                                  models=['cat'],
                                                  model_params=None,
                                                  burn_in_adjustments=-1,
                                                  warm_start=True,
                                                  warm_start_strategy='monotone',
                                                  warm_start_constant_hidden_layers=0,
                                                  warm_start_constant_category='very_small',
                                                  max_adjustments=_adjustment_limit,
                                                  pop_size=64,
                                                  adjustment_rate=0.1,
                                                  adjustment_prob=0.85,
                                                  early_stopping=0,
                                                  convergence=True,
                                                  convergence_measure='median',
                                                  timer_in_seconds=43200,
                                                  force_target_type=None,
                                                  plot=False,
                                                  output_file_path='data',
                                                  deep_learning_type='batch',
                                                  deep_learning_output_size=None,
                                                  log=False,
                                                  feature_engineer=_engineer,
                                                  sampling_function=None
                                                  )
    _swarm.optimize()
    _max_gradient: list = _swarm.evolution_gradient.get('max')
    self.assertTrue(expr=_max_gradient[0] <= _max_gradient[-1] and len(_max_gradient) > _adjustment_limit)
def test_supervised_reg(self):
    """Run a DataMiner supervised regression pipeline end-to-end and verify
    that every expected output artifact was written to 'data/'.
    """
    _feature_engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET, target_feature='AveragePrice')
    _feature_engineer.set_predictors(exclude_original_data=False)
    DataMiner(df=_feature_engineer.get_data(dask_df=True),
              file_path=None,
              target=_feature_engineer.get_target(),
              predictors=_feature_engineer.get_predictors(),
              feature_engineer=_feature_engineer,
              feature_generator=True,
              train_critic=True,
              plot=True,
              output_path='data',
              **dict(max_generations=2)
              ).supervised(models=['cat', 'xgb', 'svm'],
                           feature_selector='shapley',
                           top_features=0.5,
                           optimizer='ga',
                           force_target_type=None,
                           train=True,
                           train_size=0.8,
                           random=True,
                           clf_eval_metric='auc',
                           reg_eval_metric='rmse_norm',
                           save_train_test_data=True,
                           save_ga=True,
                           **dict(engineer_categorical=False)
                           )
    _expected_results: List[str] = ['feature_learning_data.parquet',
                                    'feature_learning.p',
                                    'feature_importance_shapley.html',
                                    'feature_tournament_game_size.html',
                                    'genetic.p',
                                    'model.p'
                                    ]
    # Generator + all() instead of the manual True/False append loop.
    self.assertTrue(expr=all(os.path.isfile('data/{}'.format(result)) for result in _expected_results))
def test_visualize_reg(self):
    """Optimize with SwarmIntelligence, render the full visualization suite
    and verify that each (non-disabled) expected plot file exists.
    """
    _feature_engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                         target_feature='AveragePrice',
                                                         temp_dir=DATA_DIR
                                                         )
    _feature_engineer.set_predictors(exclude_original_data=False)
    _si: SwarmIntelligence = SwarmIntelligence(mode='model',
                                               target=_feature_engineer.get_target(),
                                               input_file_path=None,
                                               train_data_file_path=None,
                                               test_data_file_path=None,
                                               valid_data_file_path=None,
                                               df=None,
                                               data_set=None,
                                               features=_feature_engineer.get_predictors(),
                                               re_split_data=False,
                                               re_sample_cases=False,
                                               re_sample_features=False,
                                               re_populate=True,
                                               max_trials=2,
                                               max_features=-1,
                                               labels=None,
                                               models=None,
                                               model_params=None,
                                               burn_in_adjustments=-1,
                                               warm_start=True,
                                               warm_start_strategy='monotone',
                                               warm_start_constant_hidden_layers=0,
                                               warm_start_constant_category='very_small',
                                               max_adjustments=5,
                                               pop_size=4,
                                               adjustment_rate=0.1,
                                               adjustment_prob=0.85,
                                               early_stopping=0,
                                               convergence=True,
                                               convergence_measure='median',
                                               timer_in_seconds=43200,
                                               force_target_type=None,
                                               plot=False,
                                               output_file_path='data',
                                               deep_learning_type='batch',
                                               deep_learning_output_size=None,
                                               log=False,
                                               feature_engineer=_feature_engineer,
                                               sampling_function=None
                                               )
    _si.optimize()
    _si.visualize(results_table=True,
                  model_distribution=True,
                  model_evolution=True,
                  param_distribution=True,
                  train_time_distribution=True,
                  breeding_map=True,
                  breeding_graph=True,
                  fitness_distribution=True,
                  fitness_evolution=True,
                  fitness_dimensions=True,
                  per_adjustment=True,
                  prediction_of_best_model=True,
                  epoch_stats=True
                  )
    # NOTE(review): plot file names are prefixed 'ga_' although this test
    # drives SwarmIntelligence — presumably the visualizer reuses the GA
    # naming; confirm against the library.
    _expected_plots: List[str] = ['ga_metadata_table',
                                  'ga_model_evolution',
                                  'ga_model_distribution',
                                  #'ga_parameter_treemap',
                                  'ga_training_time_distribution',
                                  #'ga_breeding_heatmap',
                                  #'ga_breeding_graph',
                                  'ga_fitness_score_distribution_per_adjustment',
                                  'ga_metadata_evolution_coords_actor_only',
                                  'ga_evolution_fitness_score',
                                  'ga_prediction_evaluation_coords',
                                  'ga_prediction_scatter_contour'
                                  ]
    # Generator + all() instead of the manual True/False append loop.
    self.assertTrue(expr=all(os.path.isfile('data/{}.html'.format(plot)) for plot in _expected_plots))
def test_data_import(self):
    """Load the avocado csv data set and check that it contains at least one case."""
    _engineer = FeatureEngineer(file_path='data/avocado.csv')
    _n_cases: int = _engineer.get_n_cases()
    self.assertTrue(expr=_n_cases > 0)
import unittest from happy_learning.feature_engineer import FeatureEngineer, PROCESSING_ACTION_SPACE, SUPPORTED_TYPES from typing import Dict, List DATA_FILE_PATH: str = 'data/avocado.csv' FEATURE_ENGINEER_FILE_PATH: str = 'data/feature_engineer.p' FEATURE_ENGINEER: FeatureEngineer = FeatureEngineer( df=None, file_path=DATA_FILE_PATH, target_feature='AveragePrice', generate_new_feature=True, keep_original_data=True, unify_invalid_values=True, encode_missing_data=False, max_level_processing=5, activate_actor=False, missing_value_analysis=True, auto_cleaning=False, auto_typing=True, auto_engineering=False, auto_text_mining=False, seed=1234, partitions=4) def _check_feature_orchestra(meth: str, features: List[str]) -> bool: """ Check internal FeatureOrchestra decorator for the FeatureEngineer class :param meth: str
def test_ga_clf(self):
    """Compare a baseline GeneticAlgorithm classification run against a run
    using features generated by FeatureLearning; the learned-feature run is
    expected to reach at least the baseline best fitness score.
    """
    _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET, target_feature='type')
    _engineer.set_predictors(exclude=None, exclude_original_data=False)
    _base_ga: GeneticAlgorithm = GeneticAlgorithm(mode='model',
                                                  target=_engineer.get_target(),
                                                  input_file_path=None,
                                                  train_data_file_path=None,
                                                  test_data_file_path=None,
                                                  valid_data_file_path=None,
                                                  df=None,
                                                  data_set=None,
                                                  features=_engineer.get_predictors(),
                                                  re_split_data=False,
                                                  re_sample_cases=False,
                                                  re_sample_features=False,
                                                  re_populate=True,
                                                  max_trials=2,
                                                  max_features=-1,
                                                  labels=None,
                                                  models=['cat'],
                                                  model_params=None,
                                                  burn_in_generations=-1,
                                                  warm_start=True,
                                                  warm_start_strategy='monotone',
                                                  warm_start_constant_hidden_layers=0,
                                                  warm_start_constant_category='very_small',
                                                  max_generations=10,
                                                  pop_size=64,
                                                  mutation_rate=0.1,
                                                  mutation_prob=0.15,
                                                  parents_ratio=0.5,
                                                  early_stopping=0,
                                                  convergence=True,
                                                  convergence_measure='min',
                                                  timer_in_seconds=43200,
                                                  force_target_type=None,
                                                  plot=False,
                                                  output_file_path='data',
                                                  deep_learning_type='batch',
                                                  deep_learning_output_size=None,
                                                  log=False,
                                                  feature_engineer=_engineer,
                                                  sampling_function=None
                                                  )
    _base_ga.optimize()
    # Generate new features via reinforcement-style feature learning.
    _learning: FeatureLearning = FeatureLearning(feature_engineer=_engineer,
                                                 df=None,
                                                 file_path=None,
                                                 target=_engineer.get_target(),
                                                 force_target_type=None,
                                                 max_features=-1,
                                                 keep_fittest_only=True,
                                                 train_categorical_critic=False,
                                                 train_continuous_critic=False,
                                                 engineer_time_disparity=True,
                                                 engineer_categorical=True,
                                                 engineer_text=True,
                                                 output_path='data'
                                                 )
    _learned_engineer = _learning.ga()
    _ga_new_features: GeneticAlgorithm = GeneticAlgorithm(mode='model',
                                                          target=_learned_engineer.get_target(),
                                                          input_file_path=None,
                                                          train_data_file_path=None,
                                                          test_data_file_path=None,
                                                          valid_data_file_path=None,
                                                          df=None,
                                                          data_set=None,
                                                          features=_learned_engineer.get_predictors(),
                                                          re_split_data=False,
                                                          re_sample_cases=False,
                                                          re_sample_features=False,
                                                          re_populate=True,
                                                          max_trials=2,
                                                          max_features=-1,
                                                          labels=None,
                                                          models=['cat'],
                                                          model_params=None,
                                                          burn_in_generations=-1,
                                                          warm_start=True,
                                                          warm_start_strategy='monotone',
                                                          warm_start_constant_hidden_layers=0,
                                                          warm_start_constant_category='very_small',
                                                          max_generations=10,
                                                          pop_size=64,
                                                          mutation_rate=0.1,
                                                          mutation_prob=0.15,
                                                          parents_ratio=0.5,
                                                          early_stopping=0,
                                                          convergence=True,
                                                          convergence_measure='min',
                                                          timer_in_seconds=43200,
                                                          force_target_type=None,
                                                          plot=False,
                                                          output_file_path='data',
                                                          deep_learning_type='batch',
                                                          deep_learning_output_size=None,
                                                          log=False,
                                                          feature_engineer=_learned_engineer,
                                                          sampling_function=None
                                                          )
    _ga_new_features.optimize()
    _new_best_score = _ga_new_features.final_generation[_ga_new_features.best_individual_idx]['fitness_score']
    _base_best_score = _base_ga.final_generation[_base_ga.best_individual_idx]['fitness_score']
    self.assertTrue(expr=_new_best_score >= _base_best_score)