def test_save_evolution(self):
    """Run a short SwarmIntelligence optimization, persist all artifacts
    via save_evolution() and assert every expected pickle file exists.

    NOTE(review): the expected file list contains 'GeneticAlgorithm'
    although the optimizer here is SwarmIntelligence — confirm the
    library really pickles under that name.
    """
    _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                 target_feature='AveragePrice',
                                                 temp_dir=DATA_DIR)
    _engineer.set_predictors(exclude_original_data=False)
    _swarm: SwarmIntelligence = SwarmIntelligence(
        mode='model',
        target=_engineer.get_target(),
        input_file_path=None,
        train_data_file_path=None, test_data_file_path=None,
        valid_data_file_path=None,
        df=None, data_set=None,
        features=_engineer.get_predictors(),
        re_split_data=False, re_sample_cases=False, re_sample_features=False,
        re_populate=True,
        max_trials=2, max_features=-1,
        labels=None,
        models=['cat'], model_params=None,
        burn_in_adjustments=-1,
        warm_start=True, warm_start_strategy='monotone',
        warm_start_constant_hidden_layers=0,
        warm_start_constant_category='very_small',
        max_adjustments=5, pop_size=64,
        adjustment_rate=0.1, adjustment_prob=0.85,
        early_stopping=0,
        convergence=True, convergence_measure='median',
        timer_in_seconds=43200,
        force_target_type=None,
        plot=False,
        output_file_path='data',
        deep_learning_type='batch', deep_learning_output_size=None,
        log=False,
        feature_engineer=_engineer,
        sampling_function=None)
    _swarm.optimize()
    _swarm.save_evolution(si=True,
                          model=True,
                          evolution_history=True,
                          adjustment_history=True,
                          final_adjustment=True)
    _expected_pickles: List[str] = [
        'model', 'GeneticAlgorithm', 'evolution_history',
        'adjustment_history', 'final_adjustment'
    ]
    # One flag per expected artifact: True when the pickle exists on disk.
    _found_pickles: List[bool] = [
        os.path.isfile('data/{}.p'.format(_name))
        for _name in _expected_pickles
    ]
    self.assertTrue(expr=all(_found_pickles))
 def test_merge_engineer(self):
     """Save a FeatureEngineer, drop its categorical features, merge the
     saved state back in, and assert the full feature set is restored.
     """
     # Snapshot of the complete (sorted) feature list from the shared fixture.
     _expected: List[str] = sorted(FEATURE_ENGINEER.get_features())
     _engineer: FeatureEngineer = FeatureEngineer(df=None,
                                                  file_path=DATA_FILE_PATH,
                                                  target_feature='AveragePrice')
     _cat_features: List[str] = _engineer.get_feature_types().get('categorical')
     _engineer.save(file_path='data/feature_learning_cat.p',
                    cls_obj=True,
                    overwrite=True,
                    create_dir=False)
     del _engineer
     # Reload the persisted engineer, remove the categorical features,
     # then merge the saved copy back in to recover them.
     _restored: FeatureEngineer = FeatureEngineer(
         feature_engineer_file_path='data/feature_learning_cat.p')
     _restored.clean(markers=dict(features=_cat_features))
     _restored.merge_engineer(
         feature_engineer_file_path='data/feature_learning_cat.p')
     self.assertListEqual(list1=_expected,
                          list2=sorted(_restored.get_features()))
# Example #3
 def test_optimize_modeling(self):
     """Short GeneticAlgorithm model optimization on the avocado data.

     Runs two generations with a tiny population and asserts that the
     best ('max') fitness did not degrade between the first and the last
     generation recorded in the evolution gradient.
     """
     _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                  target_feature='AveragePrice',
                                                  temp_dir=DATA_DIR,
                                                  auto_typing=False)
     _engineer.set_predictors(exclude_original_data=False)
     _optimizer: GeneticAlgorithm = GeneticAlgorithm(
         mode='model',
         target=_engineer.get_target(),
         input_file_path=None,
         train_data_file_path=None, test_data_file_path=None,
         valid_data_file_path=None,
         df=None, data_set=None,
         features=_engineer.get_predictors(),
         re_split_data=False, re_sample_cases=False, re_sample_features=False,
         re_populate=True,
         max_trials=2, max_features=-1,
         labels=None,
         models=['cat'], model_params=None,
         burn_in_generations=-1,
         warm_start=True, warm_start_strategy='monotone',
         warm_start_constant_hidden_layers=0,
         warm_start_constant_category='very_small',
         max_generations=2, pop_size=4,
         mutation_rate=0.1, mutation_prob=0.85,
         parents_ratio=0.5,
         early_stopping=0,
         convergence=True, convergence_measure='median',
         timer_in_seconds=43200,
         force_target_type=None,
         plot=False,
         output_file_path='data',
         deep_learning_type='batch', deep_learning_output_size=None,
         log=False, verbose=False,
         checkpoint=True,
         feature_engineer=_engineer,
         sampling_function=None)
     _optimizer.optimize()
     _max_gradient = _optimizer.evolution_gradient.get('max')
     self.assertTrue(expr=_max_gradient[0] <= _max_gradient[-1])
 def test_optimize_continue(self):
     """Optimize with SwarmIntelligence and assert the fitness gradient
     both improved (or held) and was extended beyond max_adjustments.

     NOTE(review): despite the name, no explicit "continue" call is made
     here — the length assertion is what checks continuation behavior.
     """
     _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                  target_feature='AveragePrice',
                                                  temp_dir=DATA_DIR)
     _engineer.set_predictors(exclude_original_data=False)
     _max_adjustments: int = 5
     _swarm: SwarmIntelligence = SwarmIntelligence(
         mode='model',
         target=_engineer.get_target(),
         input_file_path=None,
         train_data_file_path=None, test_data_file_path=None,
         valid_data_file_path=None,
         df=None, data_set=None,
         features=_engineer.get_predictors(),
         re_split_data=False, re_sample_cases=False, re_sample_features=False,
         re_populate=True,
         max_trials=2, max_features=-1,
         labels=None,
         models=['cat'], model_params=None,
         burn_in_adjustments=-1,
         warm_start=True, warm_start_strategy='monotone',
         warm_start_constant_hidden_layers=0,
         warm_start_constant_category='very_small',
         max_adjustments=_max_adjustments, pop_size=64,
         adjustment_rate=0.1, adjustment_prob=0.85,
         early_stopping=0,
         convergence=True, convergence_measure='median',
         timer_in_seconds=43200,
         force_target_type=None,
         plot=False,
         output_file_path='data',
         deep_learning_type='batch', deep_learning_output_size=None,
         log=False,
         feature_engineer=_engineer,
         sampling_function=None)
     _swarm.optimize()
     _max_gradient = _swarm.evolution_gradient.get('max')
     self.assertTrue(expr=_max_gradient[0] <= _max_gradient[-1]
                     and len(_max_gradient) > _max_adjustments)
# Example #5
 def test_supervised_reg(self):
     """End-to-end supervised regression pipeline via DataMiner.

     Triggers feature generation, a tiny GA optimization and artifact
     export, then asserts every expected output file exists in 'data'.
     """
     _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                  target_feature='AveragePrice')
     _engineer.set_predictors(exclude_original_data=False)
     _miner: DataMiner = DataMiner(df=_engineer.get_data(dask_df=True),
                                   file_path=None,
                                   target=_engineer.get_target(),
                                   predictors=_engineer.get_predictors(),
                                   feature_engineer=_engineer,
                                   feature_generator=True,
                                   train_critic=True,
                                   plot=True,
                                   output_path='data',
                                   max_generations=2)
     _miner.supervised(models=['cat', 'xgb', 'svm'],
                       feature_selector='shapley',
                       top_features=0.5,
                       optimizer='ga',
                       force_target_type=None,
                       train=True,
                       train_size=0.8,
                       random=True,
                       clf_eval_metric='auc',
                       reg_eval_metric='rmse_norm',
                       save_train_test_data=True,
                       save_ga=True,
                       engineer_categorical=False)
     _expected_artifacts: List[str] = [
         'feature_learning_data.parquet', 'feature_learning.p',
         'feature_importance_shapley.html',
         'feature_tournament_game_size.html', 'genetic.p', 'model.p'
     ]
     # One flag per expected artifact: True when the file exists on disk.
     _found_results: List[bool] = [
         os.path.isfile('data/{}'.format(_artifact))
         for _artifact in _expected_artifacts
     ]
     self.assertTrue(expr=all(_found_results))
 def test_visualize_reg(self):
     """Optimize with SwarmIntelligence, render every visualization and
     assert that each expected HTML plot file was written to 'data'.
     """
     _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                  target_feature='AveragePrice',
                                                  temp_dir=DATA_DIR)
     _engineer.set_predictors(exclude_original_data=False)
     _swarm: SwarmIntelligence = SwarmIntelligence(
         mode='model',
         target=_engineer.get_target(),
         input_file_path=None,
         train_data_file_path=None, test_data_file_path=None,
         valid_data_file_path=None,
         df=None, data_set=None,
         features=_engineer.get_predictors(),
         re_split_data=False, re_sample_cases=False, re_sample_features=False,
         re_populate=True,
         max_trials=2, max_features=-1,
         labels=None,
         models=None, model_params=None,
         burn_in_adjustments=-1,
         warm_start=True, warm_start_strategy='monotone',
         warm_start_constant_hidden_layers=0,
         warm_start_constant_category='very_small',
         max_adjustments=5, pop_size=4,
         adjustment_rate=0.1, adjustment_prob=0.85,
         early_stopping=0,
         convergence=True, convergence_measure='median',
         timer_in_seconds=43200,
         force_target_type=None,
         plot=False,
         output_file_path='data',
         deep_learning_type='batch', deep_learning_output_size=None,
         log=False,
         feature_engineer=_engineer,
         sampling_function=None)
     _swarm.optimize()
     _swarm.visualize(results_table=True,
                      model_distribution=True,
                      model_evolution=True,
                      param_distribution=True,
                      train_time_distribution=True,
                      breeding_map=True,
                      breeding_graph=True,
                      fitness_distribution=True,
                      fitness_evolution=True,
                      fitness_dimensions=True,
                      per_adjustment=True,
                      prediction_of_best_model=True,
                      epoch_stats=True)
     # Commented-out names below were deliberately excluded from the check.
     _expected_plots: List[str] = [
         'ga_metadata_table',
         'ga_model_evolution',
         'ga_model_distribution',
         #'ga_parameter_treemap',
         'ga_training_time_distribution',
         #'ga_breeding_heatmap',
         #'ga_breeding_graph',
         'ga_fitness_score_distribution_per_adjustment',
         'ga_metadata_evolution_coords_actor_only',
         'ga_evolution_fitness_score',
         'ga_prediction_evaluation_coords',
         'ga_prediction_scatter_contour'
     ]
     _found_plot: List[bool] = [
         os.path.isfile('data/{}.html'.format(_plot))
         for _plot in _expected_plots
     ]
     self.assertTrue(expr=all(_found_plot))
 def test_data_import(self):
     """Import the avocado CSV file and verify at least one case was read.

     Uses the module-level DATA_FILE_PATH constant (same literal value,
     'data/avocado.csv') instead of repeating the path, keeping this test
     consistent with the rest of the module.
     """
     _feature_engineer: FeatureEngineer = FeatureEngineer(file_path=DATA_FILE_PATH)
     self.assertTrue(expr=_feature_engineer.get_n_cases() > 0)
import unittest

from happy_learning.feature_engineer import FeatureEngineer, PROCESSING_ACTION_SPACE, SUPPORTED_TYPES
from typing import Dict, List

# Raw avocado data set used by every test in this module.
DATA_FILE_PATH: str = 'data/avocado.csv'
# Target path for persisting a pickled FeatureEngineer instance.
FEATURE_ENGINEER_FILE_PATH: str = 'data/feature_engineer.p'
# Shared FeatureEngineer fixture built once from the CSV file.
# NOTE(review): constructed at import time, so file I/O and feature
# generation run before any test starts — confirm that is intended.
FEATURE_ENGINEER: FeatureEngineer = FeatureEngineer(
    df=None,
    file_path=DATA_FILE_PATH,
    target_feature='AveragePrice',
    generate_new_feature=True,
    keep_original_data=True,
    unify_invalid_values=True,
    encode_missing_data=False,
    max_level_processing=5,
    activate_actor=False,
    missing_value_analysis=True,
    auto_cleaning=False,
    auto_typing=True,
    auto_engineering=False,
    auto_text_mining=False,
    seed=1234,  # fixed seed for reproducibility across test runs
    partitions=4)


def _check_feature_orchestra(meth: str, features: List[str]) -> bool:
    """
    Check internal FeatureOrchestra decorator for the FeatureEngineer class

    :param meth: str
 def test_ga_clf(self):
     """Compare GA classification fitness before and after feature learning.

     Runs a baseline GeneticAlgorithm on the raw predictors, then learns
     new features via FeatureLearning.ga() and re-optimizes, asserting
     the engineered-feature run scores at least as well as the baseline.
     """
     _engineer: FeatureEngineer = FeatureEngineer(df=DATA_SET,
                                                  target_feature='type')
     _engineer.set_predictors(exclude=None,
                              exclude_original_data=False)
     # Baseline optimization on the original predictor set.
     _baseline_ga: GeneticAlgorithm = GeneticAlgorithm(
         mode='model',
         target=_engineer.get_target(),
         input_file_path=None,
         train_data_file_path=None, test_data_file_path=None,
         valid_data_file_path=None,
         df=None, data_set=None,
         features=_engineer.get_predictors(),
         re_split_data=False, re_sample_cases=False, re_sample_features=False,
         re_populate=True,
         max_trials=2, max_features=-1,
         labels=None,
         models=['cat'], model_params=None,
         burn_in_generations=-1,
         warm_start=True, warm_start_strategy='monotone',
         warm_start_constant_hidden_layers=0,
         warm_start_constant_category='very_small',
         max_generations=10, pop_size=64,
         mutation_rate=0.1, mutation_prob=0.15,
         parents_ratio=0.5,
         early_stopping=0,
         convergence=True, convergence_measure='min',
         timer_in_seconds=43200,
         force_target_type=None,
         plot=False,
         output_file_path='data',
         deep_learning_type='batch', deep_learning_output_size=None,
         log=False,
         feature_engineer=_engineer,
         sampling_function=None)
     _baseline_ga.optimize()
     # Learn additional features with the evolutionary feature learner.
     _learning: FeatureLearning = FeatureLearning(
         feature_engineer=_engineer,
         df=None,
         file_path=None,
         target=_engineer.get_target(),
         force_target_type=None,
         max_features=-1,
         keep_fittest_only=True,
         train_categorical_critic=False,
         train_continuous_critic=False,
         engineer_time_disparity=True,
         engineer_categorical=True,
         engineer_text=True,
         output_path='data')
     _learned_engineer = _learning.ga()
     # Second optimization using the engineered feature set.
     _engineered_ga: GeneticAlgorithm = GeneticAlgorithm(
         mode='model',
         target=_learned_engineer.get_target(),
         input_file_path=None,
         train_data_file_path=None, test_data_file_path=None,
         valid_data_file_path=None,
         df=None, data_set=None,
         features=_learned_engineer.get_predictors(),
         re_split_data=False, re_sample_cases=False, re_sample_features=False,
         re_populate=True,
         max_trials=2, max_features=-1,
         labels=None,
         models=['cat'], model_params=None,
         burn_in_generations=-1,
         warm_start=True, warm_start_strategy='monotone',
         warm_start_constant_hidden_layers=0,
         warm_start_constant_category='very_small',
         max_generations=10, pop_size=64,
         mutation_rate=0.1, mutation_prob=0.15,
         parents_ratio=0.5,
         early_stopping=0,
         convergence=True, convergence_measure='min',
         timer_in_seconds=43200,
         force_target_type=None,
         plot=False,
         output_file_path='data',
         deep_learning_type='batch', deep_learning_output_size=None,
         log=False,
         feature_engineer=_learned_engineer,
         sampling_function=None)
     _engineered_ga.optimize()
     _engineered_score = _engineered_ga.final_generation[
         _engineered_ga.best_individual_idx]['fitness_score']
     _baseline_score = _baseline_ga.final_generation[
         _baseline_ga.best_individual_idx]['fitness_score']
     self.assertTrue(expr=_engineered_score >= _baseline_score)