def test_get_feature_types(self):
    self.assertDictEqual(d1={'continuous': ['C', 'G', 'H'],
                             'categorical': ['A', 'B', 'F', 'I', 'J', 'K'],
                             'ordinal': [],
                             'date': ['D'],
                             'text': ['E']
                             },
                         d2=EasyExploreUtils().get_feature_types(df=DATA_SET,
                                                                 features=list(DATA_SET.keys()),
                                                                 dtypes=DATA_SET.dtypes.tolist()
                                                                 )
                         )

def test_get_pairs(self):
    self.assertListEqual(list1=[tuple(['A', 'B']), tuple(['A', 'C']), tuple(['B', 'C'])],
                         list2=EasyExploreUtils().get_pairs(features=['A', 'B', 'C'],
                                                            max_features_each_pair=2
                                                            )
                         )

def test_get_duplicates(self):
    self.assertDictEqual(d1=dict(cases=[], features=['K']),
                         d2=EasyExploreUtils().get_duplicates(df=DATA_SET,
                                                              cases=True,
                                                              features=True
                                                              )
                         )
def test_convert_jupyter(self):
    EasyExploreUtils().convert_jupyter(notebook_name=os.path.join(OUTPUT_PATH, 'test_notebook.ipynb'), to='html')
    # the conversion should produce an html file next to the source notebook
    self.assertTrue(expr=os.path.isfile(os.path.join(OUTPUT_PATH, 'test_notebook.html')))
def test_check_dtypes(self):
    self.assertDictEqual(d1={'B': 'int', 'D': 'date', 'F': 'int', 'I': 'int', 'J': 'int', 'K': 'int'},
                         d2=EasyExploreUtils().check_dtypes(df=DATA_SET, date_edges=None).get('conversion')
                         )
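# --- Hedged usage sketch (illustration only, not part of the test suite) ---
# Shows how the EasyExploreUtils helpers exercised by the tests above might be called
# directly; the import path and the DATA_SET fixture are assumptions taken from this
# test module rather than confirmed API documentation.
#
#   from easyexplore.utils import EasyExploreUtils
#
#   _utils = EasyExploreUtils()
#   _pairs = _utils.get_pairs(features=['A', 'B', 'C'], max_features_each_pair=2)
#   # -> [('A', 'B'), ('A', 'C'), ('B', 'C')]
#   _types = _utils.get_feature_types(df=DATA_SET, features=list(DATA_SET.keys()), dtypes=DATA_SET.dtypes.tolist())
#   # -> dict with keys 'continuous', 'categorical', 'ordinal', 'date', 'text'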
def __init__(self,
             df: Union[dd.DataFrame, pd.DataFrame],
             n_chains: int = 3,
             n_iter: int = 15,
             n_burn_in_iter: int = 3,
             ml_meth: dict = None,
             predictors: dict = None,
             imp_sequence: List[str] = None,
             cor_threshold_for_predictors: float = None,
             pool_eval_meth: str = 'std',
             impute_hard_missing: bool = False,
             soft_missing_values: list = None
             ):
    """
    :param df: Pandas or dask DataFrame
        Data set
    :param n_chains: int
        Number of Markov chains
    :param n_iter: int
        Number of iterations
    :param n_burn_in_iter: int
        Number of burn-in iterations (warm start)
    :param ml_meth: dict
        Name of the supervised machine learning algorithm to use for each data type (cat / cont / date)
    :param predictors: dict
        Pre-defined predictors for each feature imputation
    :param imp_sequence: List[str]
        Pre-defined sequence in which the features are imputed
    :param cor_threshold_for_predictors: float
        Correlation threshold used to select predictors automatically
    :param pool_eval_meth: str
        Method for evaluating and pooling the Markov chains (std / var / aic / bic)
    :param impute_hard_missing: bool
        Whether to impute hard missing values as well
    :param soft_missing_values: list
        Values to be interpreted as (soft) missing values
    """
    if isinstance(df, pd.DataFrame):
        self.df: dd.DataFrame = dd.from_pandas(data=df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        self.df: dd.DataFrame = df
    self.feature_types: dict = EasyExploreUtils().get_feature_types(df=self.df,
                                                                    features=list(self.df.columns),
                                                                    dtypes=self.df.dtypes.tolist()
                                                                    )
    self.n_chains: int = 3 if n_chains <= 1 else n_chains
    self.chains: dict = {m: {} for m in range(0, self.n_chains, 1)}
    self.n_burn_in_iter: int = 3 if n_burn_in_iter <= 0 else n_burn_in_iter
    self.n_iter: int = (15 if n_iter <= 1 else n_iter) + self.n_burn_in_iter
    self.data_types: List[str] = ['cat', 'cont', 'date']
    _encoder = LabelEncoder()
    for ft in self.df.columns:
        if str(self.df[ft].dtype).find('object') >= 0:
            self.df[ft] = self.df[ft].fillna('NaN')
            #self.df.loc[self.df[ft].isnull().compute(), ft] = 'NaN'
            self.df[ft] = dd.from_array(x=_encoder.fit_transform(y=self.df[ft].values))
    self.ml_meth: dict = ml_meth
    if self.ml_meth is not None:
        for meth in self.ml_meth:
            if meth.find('cat') >= 0:
                pass
    else:
        # fall back to default (xgboost) methods for each data type if no methods are given
        self.ml_meth = dict(cat='xgb', cont='xgb', date='xgb')
    self.predictors: dict = predictors
    self.impute_hard_missing: bool = impute_hard_missing
    self.mis_freq: dict = MissingDataAnalysis(df=self.df, other_mis=soft_missing_values).freq_nan_by_features()
    self.nan_idx: dict = MissingDataAnalysis(df=self.df, other_mis=soft_missing_values).get_nan_idx_by_features()
    self.imp_sequence: List[str] = [] if imp_sequence is None else imp_sequence
    if len(self.imp_sequence) == 0:
        #self.imp_sequence = [mis_freq[0] for mis_freq in sorted(self.mis_freq.items(), key=lambda x: x[1], reverse=False)]
        for mis_freq in sorted(self.mis_freq.items(), key=lambda x: x[1], reverse=False):
            if mis_freq[1] > 0:
                self.imp_sequence.append(mis_freq[0])
    if self.predictors is None:
        self.predictors = {}
        if cor_threshold_for_predictors is None:
            for ft in self.mis_freq.keys():
                self.predictors.update({ft: list(set(list(self.df.columns)).difference([ft]))})
        else:
            if (cor_threshold_for_predictors > 0.0) and (cor_threshold_for_predictors < 1.0):
                _cor: pd.DataFrame = StatsUtils(data=self.df, features=list(self.df.columns)).correlation()
                for ft in self.df.columns:
                    self.predictors.update({ft: _cor.loc[_cor[ft] >= cor_threshold_for_predictors, ft].index.values.tolist()})
                    if len(self.predictors[ft]) == 0:
                        raise MultipleImputationException('No predictors found to impute feature "{}" based on given correlation threshold (>={})'.format(ft, cor_threshold_for_predictors))
            else:
                for ft in self.df.columns:
                    self.predictors.update({ft: list(set(list(self.df.columns)).difference([ft]))})
    if pool_eval_meth not in ['std', 'var', 'aic', 'bic']:
        raise MultipleImputationException('Method for pooling chain evaluation ({}) not supported'.format(pool_eval_meth))
    self.pool_eval_meth: str = pool_eval_meth
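# --- Hedged usage sketch (illustration only) ---
# A minimal example of constructing the multiple-imputation class defined above,
# assuming the class is named MultipleImputation (only the exception name is visible
# in this excerpt) and that a pandas DataFrame is accepted as shown in __init__.
# The example data are made up for illustration.
#
#   import numpy as np
#   import pandas as pd
#
#   _df = pd.DataFrame({'age': [23, np.nan, 41, 35], 'income': [1000, 2000, np.nan, 4000]})
#   _mi = MultipleImputation(df=_df,
#                            n_chains=3,
#                            n_iter=15,
#                            n_burn_in_iter=3,
#                            pool_eval_meth='std'
#                            )
#   # the imputation sequence is derived from the per-feature missing-value frequency
#   print(_mi.imp_sequence)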
def supervised(self,
               models: List[str] = None,
               feature_selector: str = 'shapley',
               top_features: float = 0.5,
               optimizer: str = 'ga',
               force_target_type: str = None,
               train: bool = True,
               train_size: float = 0.8,
               random: bool = True,
               stratification: bool = False,
               clf_eval_metric: str = 'auc',
               reg_eval_metric: str = 'rmse_norm',
               save_train_test_data: bool = True,
               save_ga: bool = True,
               **kwargs
               ):
    """
    Run supervised machine learning models

    :param models: List[str]
        Names of the supervised machine learning models to use
    :param feature_selector: str
        Feature selection method:
            -> shapley: Shapley Value based on the FeatureTournament framework
    :param top_features: float
        Proportion of top features to select
    :param optimizer: str
        Model optimizer method:
            -> ga: Genetic Algorithm
            -> None: Develop model manually using pre-defined parameter config without optimization
    :param force_target_type: str
        Name of the target type to force (useful if target type is ordinal)
            -> reg: define target type as regression instead of multi classification
            -> clf_multi: define target type as multi classification instead of regression
    :param train: bool
        Whether to train or to predict from supervised machine learning models
    :param train_size: float
        Proportion of cases in the training data set
    :param random: bool
        Whether to sample randomly or by index
    :param stratification: bool
        Whether to stratify train and test data sets
    :param clf_eval_metric: str
        Name of the metric used to evaluate classification models
    :param reg_eval_metric: str
        Name of the metric used to evaluate regression models
    :param save_train_test_data: bool
        Whether to save the train-test data split or not
    :param save_ga: bool
        Whether to save the "Genetic" object or not
    :param kwargs: dict
        Key-word arguments of classes FeatureSelector / DataExporter / Genetic / MLSampler / DataVisualizer
    """
    self.force_target_type = force_target_type
    if train:
        _train_size: float = train_size if (train_size > 0) and (train_size < 1) else 0.8
        if self.feature_generator:
            self.feature_engineer = FeatureLearning(feature_engineer=self.feature_engineer,
                                                    target=self.feature_engineer.get_target(),
                                                    force_target_type=force_target_type,
                                                    max_features=0,
                                                    keep_fittest_only=True if kwargs.get('keep_fittest_only') is None else kwargs.get('keep_fittest_only'),
                                                    train_continuous_critic=False if kwargs.get('train_continuous_critic') is None else kwargs.get('train_continuous_critic'),
                                                    train_categorical_critic=False if kwargs.get('train_categorical_critic') is None else kwargs.get('train_categorical_critic'),
                                                    engineer_time_disparity=True if kwargs.get('engineer_time_disparity') is None else kwargs.get('engineer_time_disparity'),
                                                    engineer_categorical=False if kwargs.get('engineer_categorical') is None else kwargs.get('engineer_categorical'),
                                                    output_path=self.output_path,
                                                    **self.kwargs
                                                    ).ga()
        else:
            self.feature_engineer.set_predictors(exclude_original_data=False)
        if feature_selector is not None:
            _imp_features: dict = FeatureSelector(df=self.feature_engineer.get_training_data(output='df_dask'),
                                                  target=self.feature_engineer.get_target(),
                                                  features=self.feature_engineer.get_predictors(),
                                                  force_target_type=force_target_type,
                                                  aggregate_feature_imp=self.feature_engineer.get_processing()['features']['raw'],
                                                  visualize_all_scores=self.plot if kwargs.get('visualize_all_scores') is None else kwargs.get('visualize_all_scores'),
                                                  visualize_variant_scores=self.plot if kwargs.get('visualize_variant_scores') is None else kwargs.get('visualize_variant_scores'),
                                                  visualize_core_feature_scores=self.plot if kwargs.get('visualize_core_feature_scores') is None else kwargs.get('visualize_core_feature_scores'),
                                                  path=self.output_path
                                                  ).get_imp_features(meth=feature_selector,
                                                                     imp_threshold=0.001 if kwargs.get('imp_threshold') is None else kwargs.get('imp_threshold')
                                                                     )
            _ratio: float = top_features if (top_features > 0) and (top_features <= 1) else 0.5
            _top_n_features: int = round(self.feature_engineer.get_n_predictors() * _ratio)
            self.feature_engineer.set_predictors(features=_imp_features.get('imp_features')[0:_top_n_features],
                                                 exclude_original_data=False
                                                 )
            if self.output_path is not None or kwargs.get('file_path') is not None:
                DataExporter(obj=_imp_features,
                             file_path='{}feature_importance.pkl'.format(self.output_path) if kwargs.get('file_path') is None else kwargs.get('file_path'),
                             create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                             overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                             ).file()
        if optimizer == 'ga':
            _ga = GeneticAlgorithm(mode='model',
                                   df=self.feature_engineer.get_training_data(),
                                   target=self.feature_engineer.get_target(),
                                   force_target_type=force_target_type,
                                   features=self.feature_engineer.get_predictors(),
                                   stratify=stratification,
                                   labels=None if kwargs.get('labels') is None else kwargs.get('labels'),
                                   models=models,
                                   burn_in_generations=10 if kwargs.get('burn_in_generations') is None else kwargs.get('burn_in_generations'),
                                   max_generations=25 if kwargs.get('max_generations') is None else kwargs.get('max_generations'),
                                   pop_size=64 if kwargs.get('pop_size') is None else kwargs.get('pop_size'),
                                   mutation_rate=0.1 if kwargs.get('mutation_rate') is None else kwargs.get('mutation_rate'),
                                   mutation_prob=0.15 if kwargs.get('mutation_prob') is None else kwargs.get('mutation_prob'),
                                   parents_ratio=0.5 if kwargs.get('parents_ratio') is None else kwargs.get('parents_ratio'),
                                   early_stopping=0 if kwargs.get('early_stopping') is None else kwargs.get('early_stopping'),
                                   convergence=False if kwargs.get('convergence') is None else kwargs.get('convergence'),
                                   convergence_measure='median' if kwargs.get('convergence_measure') is None else kwargs.get('convergence_measure'),
                                   timer_in_seconds=43200 if kwargs.get('timer_in_seconds') is None else kwargs.get('timer_in_seconds'),
                                   plot=self.plot,
                                   output_file_path=self.output_path
                                   )
            _ga.optimize()
            if save_train_test_data:
                DataExporter(obj=_ga.data_set,
                             file_path='{}train_test_data.pkl'.format(self.output_path),
                             create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                             overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                             ).file()
            if save_ga:
                _ga.save_evolution(ga=True, model=False)
        else:
            _model_eval_plot: dict = {}
            _data_set: dict = MLSampler(df=self.feature_engineer.get_data(),
                                        target=self.feature_engineer.get_target(),
                                        features=self.feature_engineer.get_predictors(),
                                        train_size=_train_size,
                                        random_sample=random,
                                        stratification=stratification
                                        ).train_test_sampling(validation_split=0.1 if kwargs.get('validation_split') is None else kwargs.get('validation_split'))
            if save_train_test_data:
                DataExporter(obj=_data_set,
                             file_path='{}train_test_data.pkl'.format(self.output_path),
                             create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                             overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                             ).file()
            for model in models:
                if HappyLearningUtils().get_ml_type(values=self.feature_engineer.get_target_values()) == 'reg':
                    _model = ModelGeneratorReg(model_name=model, reg_params=None).generate_model()
                    _model.train(x=_data_set.get('x_train').values,
                                 y=_data_set.get('y_train').values,
                                 validation=dict(x_val=_data_set.get('x_val').values,
                                                 y_val=_data_set.get('y_val').values
                                                 )
                                 )
                    _pred: np.array = _model.predict(x=_data_set.get('x_test').values)
                    _model.eval(obs=_data_set.get('y_test').values, pred=_pred, eval_metric=[reg_eval_metric])
                    _perc_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                 obs=_data_set.get('y_test').values.tolist(),
                                                                                 aggregation='median',
                                                                                 percentiles=10
                                                                                 )
                    _min_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                obs=_data_set.get('y_test').values.tolist(),
                                                                                aggregation='min',
                                                                                percentiles=10
                                                                                )
                    _max_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                obs=_data_set.get('y_test').values.tolist(),
                                                                                aggregation='max',
                                                                                percentiles=10
                                                                                )
                    _multi: dict = {'bar_obs': dict(y=_perc_table['obs'].values,
                                                    name='obs',
                                                    error_y=dict(type='data',
                                                                 array=_max_table['obs'].values - _min_table['obs'].values
                                                                 )
                                                    ),
                                    'bar_preds': dict(y=_perc_table['preds'].values,
                                                      name='pred',
                                                      error_y=dict(type='data',
                                                                   array=_max_table['preds'].values - _min_table['preds'].values
                                                                   )
                                                      )
                                    }
                    _model_eval_df: pd.DataFrame = pd.DataFrame(data={'obs': _data_set.get('y_test').values, 'preds': _pred})
                    _model_eval_df['abs_diff'] = _model_eval_df['obs'] - _model_eval_df['preds']
                    _model_eval_df['rel_diff'] = _model_eval_df['obs'] / _model_eval_df['preds']
                    # TODO: Add train & test error to plot
                    _model_eval_plot.update({'Prediction vs. Observation (Value Based)': dict(data=_model_eval_df,
                                                                                              features=['obs', 'preds'],
                                                                                              plot_type='joint',
                                                                                              render=True,
                                                                                              file_path='{}prediction_scatter_{}.html'.format(self.output_path, model)
                                                                                              ),
                                             'Prediction vs. Observation (Range Based)': dict(data=_model_eval_df,
                                                                                              features=['obs', 'preds', 'abs_diff', 'rel_diff'],
                                                                                              plot_type='parcoords',
                                                                                              render=True,
                                                                                              file_path='{}prediction_coords_{}.html'.format(self.output_path, model)
                                                                                              ),
                                             'Prediction vs. Observation (Percentile Based)': dict(data=_perc_table,
                                                                                                    plot_type='multi',
                                                                                                    render=True,
                                                                                                    file_path='{}prediction_percentiles_{}.html'.format(self.output_path, model),
                                                                                                    kwargs=dict(layout=dict(barmode='group',
                                                                                                                            xaxis=dict(tickmode='array',
                                                                                                                                       tickvals=[p for p in range(0, 10, 1)],
                                                                                                                                       ticktext=[str(label) for label in _perc_table['obs'].values.tolist()]
                                                                                                                                       )
                                                                                                                            ),
                                                                                                                multi=_multi
                                                                                                                )
                                                                                                    )
                                             })
                else:
                    _model = ModelGeneratorClf(model_name=model, clf_params={}).generate_model()
                    _model.train(x=_data_set.get('x_train').values,
                                 y=_data_set.get('y_train').values,
                                 validation=dict(x_val=_data_set.get('x_val').values,
                                                 y_val=_data_set.get('y_val').values
                                                 )
                                 )
                    _pred: np.array = _model.predict(x=_data_set.get('x_test').values)
                    _model.eval(obs=_data_set.get('y_test').values, pred=_pred, eval_metric=[clf_eval_metric])
                    _confusion_matrix: pd.DataFrame = EvalClf(obs=_data_set.get('y_test').values.tolist(),
                                                              pred=_pred,
                                                              probability=True
                                                              ).confusion(normalize='true')
                    _model_eval_plot.update({'Confusion Matrix': dict(data=_confusion_matrix,
                                                                      plot_type='heat',
                                                                      kwargs={'layout': {'xaxis': {'title': 'Observation'},
                                                                                         'yaxis': {'title': 'Prediction'}
                                                                                         },
                                                                              'text': _confusion_matrix.values.tolist()
                                                                              }
                                                                      )
                                             })
                if self.output_path is not None:
                    DataExporter(obj=_model.model,
                                 file_path='{}model_{}'.format(self.output_path, model),
                                 create_dir=True,
                                 overwrite=False
                                 ).file()
    else:
        raise NotImplementedError('Prediction method not implemented yet')
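# --- Hedged usage sketch (illustration only) ---
# Rough example of calling supervised() on the surrounding pipeline object. The class
# name DataMiner, its constructor signature, and the model name 'xgb' are assumptions
# not confirmed by this excerpt; only the supervised() parameters are taken from above.
#
#   _miner = DataMiner(df=df, target='y', predictors=['a', 'b', 'c'], output_path='data/output/')
#   _miner.supervised(models=['xgb'],
#                     feature_selector='shapley',
#                     top_features=0.5,
#                     optimizer='ga',
#                     train=True,
#                     save_train_test_data=True
#                     )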