def save_model(self, model, sub_dir='model', name='model', seed=None):
    """Dump *model* to ``<model_dir>/<name>[_<seed>].pkl`` via joblib."""
    target_dir = self._get_dir(sub_dir)
    create_path(target_dir, merge=True)
    if seed is None:
        filename = '{}.pkl'.format(name)
    else:
        filename = '{}_{}.pkl'.format(name, seed)
    joblib.dump(model, os.path.join(target_dir, filename))
def save_predictions_as_pickle(self, predictions, sub_dir='prediction',
                               name='prediction', seed=None):
    """Persist *predictions* (cast to float32) as a pickle file.

    Args:
        predictions: array-like with an ``astype`` method (e.g. np.ndarray).
        sub_dir: sub-directory under the prediction output directory.
        name: base file name (without extension).
        seed: optional; when given the file is named ``<name>_<seed>.pickle``.
    """
    output_dir = self._get_prediction_output_dir(sub_dir)
    create_path(output_dir, merge=True)
    predict_file = os.path.join(
        output_dir,
        '{}.pickle'.format(name) if seed is None
        else '{}_{}.pickle'.format(name, seed))
    # BUG FIX: pickle.dump requires an open (binary) file object; the
    # original passed the path string, which raises TypeError at runtime.
    with open(predict_file, 'wb') as wf:
        pickle.dump(predictions.astype(np.float32), wf)
def save_json(self, data, sub_dir='json', name='json_file', seed=None):
    """Serialize *data* to ``<json_dir>/<name>[_<seed>].json``."""
    target_dir = self._get_dir(sub_dir)
    create_path(target_dir, merge=True)
    if seed is None:
        filename = '{}.json'.format(name)
    else:
        filename = '{}_{}.json'.format(name, seed)
    with open(os.path.join(target_dir, filename), 'w') as handle:
        json.dump(data, handle)
def save_predictions_as_npy(self, predictions, sub_dir='prediction',
                            name='prediction', seed=None):
    """Save *predictions* to a NumPy ``.npy`` binary file."""
    target_dir = self._get_prediction_output_dir(sub_dir)
    create_path(target_dir, merge=True)
    if seed is None:
        filename = '{}.npy'.format(name)
    else:
        filename = '{}_{}.npy'.format(name, seed)
    np.save(os.path.join(target_dir, filename), predictions)
def save_featurizer_as_dataframe(self, output_df, name='featurizer',
                                 save_type='pickle.gz'):
    """Persist a featurizer DataFrame as CSV or (compressed) pickle.

    Args:
        output_df: pandas DataFrame to save.
        name: base file name (extension is taken from *save_type*).
        save_type: 'csv', or any value starting with 'pickle'
            (e.g. 'pickle', 'pickle.gz' — pandas infers compression
            from the file extension).

    Raises:
        ValueError: for an unsupported *save_type*. (The original code
            silently wrote nothing in that case, losing data without
            any warning.)
    """
    featurizer_dir = self._get_featurizer_output_dir()
    create_path(featurizer_dir, merge=True)
    featurizer_file = os.path.join(
        featurizer_dir, '{}.{}'.format(name, save_type))
    if save_type == 'csv':
        output_df.to_csv(featurizer_file)
    elif save_type.startswith('pickle'):
        output_df.to_pickle(featurizer_file)
    else:
        raise ValueError(
            "save_type {} is unknown, possible values are 'csv' or "
            "'pickle*'".format(save_type))
def save_predictions_as_txt(self, predictions, sub_dir='prediction',
                            name='prediction', seed=None):
    """Write one prediction per line to a plain-text file."""
    target_dir = self._get_prediction_output_dir(sub_dir)
    create_path(target_dir, merge=True)
    if seed is None:
        filename = '{}.txt'.format(name)
    else:
        filename = '{}_{}.txt'.format(name, seed)
    rows = [str(value) for value in predictions]
    with open(os.path.join(target_dir, filename), 'w') as handle:
        handle.write("\n".join(rows))
def save_predictions_as_dataframe(self, predictions, subdir='prediction',
                                  name='prediction', seed=None):
    """Save predictions (optionally paired with targets) to a CSV file."""
    target_dir = self._get_prediction_output_dir(subdir)
    create_path(target_dir, merge=True)
    if seed is None:
        filename = '{}.csv'.format(name)
    else:
        filename = '{}_{}.csv'.format(name, seed)
    # ndarray rows are (target, predict) pairs; scalars are bare predictions.
    if isinstance(predictions[0], np.ndarray):
        columns = ['target', 'predict']
    else:
        columns = ['predict']
    frame = pd.DataFrame(predictions, columns=columns)
    frame.to_csv(os.path.join(target_dir, filename))
def _prepare_paths(self, output_path=None, auto_rename=False):
    """Resolve and create the task output directory.

    'tmp'/'default' map to a per-process, per-timestamp path under
    /tmp/amlearn; a None path leaves everything untouched.
    """
    now = time.time()
    pid = os.getpid()
    if output_path in ('tmp', 'default'):
        output_path = \
            '/tmp/amlearn/task_%d/output_%d' % (pid, int(now))
    self.output_path_ = output_path
    if output_path is not None:
        # Avoid clobbering an existing directory when auto_rename is on.
        if auto_rename and os.path.exists(self.output_path_):
            self.output_path_ = auto_rename_file(self.output_path_)
        create_path(self.output_path_, overwrite=self.overwrite_path,
                    merge=self.merge_path)
        self.output_path_created_ = True
def _fit_cv(self, X, y, random_state=None, scoring=None, cv_num=1,
            cv_params=None, val_size=0.3, save_model=False, save_score=True,
            save_prediction=False, prediction_types='dataframe',
            save_feature_importances=True, save_train_val_idx=False,
            **fit_params):
    """Run (cross-)validated classification and persist requested artifacts.

    When ``cv_num > 1`` the project-level ``cross_validate`` wrapper is
    used (NOTE(review): it returns ``(results, scorers)`` and exposes
    ``results['estimators']`` / ``results['indices']`` — this is not
    sklearn's ``cross_validate`` contract; confirm against the wrapper).
    Otherwise a single train/validation split is delegated to ``self._fit``.

    Args:
        X, y: training data; numpy arrays or pandas objects (both indexing
            styles are handled in the prediction-saving path).
        random_state: seed forwarded to ``np.random.seed`` for the CV path.
        scoring: scorer spec forwarded to the CV routine.
        cv_num: number of CV folds; ``cv_params['cv_num']`` wins if larger.
        cv_params: extra kwargs forwarded to ``cross_validate``.
        val_size: validation fraction for the single-split (cv_num <= 1) path.
        save_model, save_score, save_prediction, save_feature_importances,
        save_train_val_idx: toggles for persisting the corresponding
            artifacts under the backend output path.
        prediction_types: one or more backend prediction formats.
        **fit_params: forwarded to the estimator's ``fit`` (filtered via
            ``appropriate_kwargs``); may also carry ``X_val``/``y_val``.

    Returns:
        tuple: ``(results, scorers)`` as produced by the CV routine.
    """
    # If user's cv_params contains 'cv_num', use the max value between the
    # function parameter 'cv_num' and cv_params' 'cv_num'.
    if not self.imblearn:
        self.backend.logger.info('Start Cross Validation.')
        cv_start_time = time.time()
    if cv_params is None:
        cv_params = {}
    if 'cv_num' in cv_params.keys():
        cv_num = max(cv_num, cv_params['cv_num'])
        cv_params.pop('cv_num')
    if 'scoring' in cv_params.keys():
        # The explicit 'scoring' argument takes precedence over cv_params.
        cv_params.pop('scoring')
    return_train_score = cv_params.get('return_train_score', True)
    if cv_num > 1:
        np.random.seed(random_state)
        # Keep only the fit_params the classifier's fit() actually accepts.
        classifier_params = \
            appropriate_kwargs(fit_params, self.classifier.fit)
        results, scorers = \
            cross_validate(estimator=self.classifier, scoring=scoring,
                           fit_params=classifier_params, X=X, y=y,
                           cv=cv_num, **cv_params)
    else:
        results, scorers = self._fit(
            X, y, self.classifier, val_size=val_size,
            return_train_score=return_train_score,
            random_state=random_state, scoring=scoring, **fit_params)
        cv_num = 1
    # TODO: now if scoring is more than one, score_name only can be the
    # first of them.
    self.score_name = self.score_name if hasattr(self, 'score_name') \
        else list(scorers.keys())[0]
    # Keep the fold with the best test score; tag is '' for the
    # single-split case, 'cv_<i>' otherwise.
    self.best_score_, (self.best_model_, self.best_model_tag_) = \
        max(zip(results['test_{}'.format(self.score_name)],
                zip(results['estimators'],
                    [''] if cv_num == 1
                    else ["cv_{}".format(i) for i in range(cv_num)])),
            key=lambda x: x[0])
    if not self.imblearn:
        self.backend.logger.info(
            "\tCV classification finish in {:.4f} seconds.".format(
                time.time() - cv_start_time))
    if save_model or save_score or save_train_val_idx or save_prediction \
            or save_feature_importances:
        imblearn_output_path = \
            os.path.join(self.backend.output_path, self.imblearn_tag)
        create_path(imblearn_output_path)
        if save_score:
            # Fold-averaged scores; the train row is -1 when train scores
            # were not computed.
            write_file(
                os.path.join(imblearn_output_path, 'mean_scores.txt'),
                '{}\n{}\n{}'.format(
                    ','.join(['dataset'] + list(scorers.keys())),
                    ','.join(['test'] + [
                        str(np.mean(results['test_{}'.format(score_name)]))
                        for score_name in scorers.keys()]),
                    ','.join(['train'] + [
                        str(np.mean(results['train_{}'.format(
                            score_name)]))
                        for score_name in scorers.keys()])
                    if return_train_score else -1))
        check_path_while_saving(self.backend.output_path)
        # Per-fold artifacts live under <output>/<imblearn_tag>/cv_<i>/.
        for cv_idx in range(cv_num):
            sub_path = os.path.join(self.imblearn_tag,
                                    "cv_{}".format(cv_idx))
            cv_output_path = \
                os.path.join(self.backend.output_path, sub_path)
            create_path(cv_output_path)
            if save_score:
                write_file(
                    os.path.join(cv_output_path, 'scores.txt'),
                    '{}\n{}\n{}'.format(
                        ','.join(['dataset'] + list(scorers.keys())),
                        ','.join(['test'] + [
                            str(results['test_{}'.format(score_name)]
                                [cv_idx])
                            for score_name in scorers.keys()]),
                        ','.join(['train'] + [
                            str(results['train_{}'.format(score_name)]
                                [cv_idx])
                            for score_name in scorers.keys()])
                        if return_train_score else -1))
            score_model = results['estimators'][cv_idx]
            if save_model:
                self.backend.save_model(score_model, sub_path)
            if save_feature_importances:
                self.backend.save_json(
                    self.feature_importances_dict(score_model),
                    sub_path, name='feature_importances')
            if save_train_val_idx:
                # results['indices'][cv_idx] holds (train_idx, val_idx).
                train_idx = results['indices'][cv_idx][0]
                val_idx = results['indices'][cv_idx][1]
                write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                           "\n".join(list(map(str, train_idx))))
                write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                           "\n".join(list(map(str, val_idx))))
            if save_prediction:
                # Explicit X_val/y_val in fit_params take precedence over
                # the fold's validation indices.
                if 'X_val' in fit_params.keys(
                        ) and 'y_val' in fit_params.keys():
                    test_X = fit_params['X_val']
                    test_y = fit_params['y_val']
                else:
                    test_X = X[results['indices'][cv_idx][1]] \
                        if isinstance(X, np.ndarray) \
                        else X.iloc[results['indices'][cv_idx][1]]
                    test_y = y[results['indices'][cv_idx][1]] \
                        if isinstance(y, np.ndarray) \
                        else y.iloc[results['indices'][cv_idx][1]]
                # Prefer probabilities, then decision values, then labels.
                if hasattr(score_model, 'predict_proba'):
                    predictions = score_model.predict_proba(test_X)
                elif hasattr(score_model, 'decision_function'):
                    predictions = score_model.decision_function(test_X)
                else:
                    predictions = score_model.predict(test_X)
                # NOTE(review): predictions[:, 1] assumes a 2-D,
                # two-column (binary positive-class) output — a plain
                # predict() or 1-D decision_function would break here;
                # confirm.
                targets_and_predictions = \
                    np.array(list(zip(test_y, predictions[:, 1])))
                if not isinstance(prediction_types, list_like()):
                    prediction_types = [prediction_types]
                for predict_type in prediction_types:
                    if predict_type in self.backend.valid_predictions_type:
                        getattr(
                            self.backend,
                            'save_predictions_as_{}'.format(predict_type))(
                            targets_and_predictions, sub_path)
                    else:
                        raise ValueError(
                            'predict_type {} is unknown, '
                            'Possible values are {}'.format(
                                predict_type,
                                self.backend.valid_predictions_type))
    return results, scorers
def _fit_cv(self, X, y, val_size=0.3, random_state=None, scoring=None,
            cv_num=1, cv_params=None, save_train_val_idx=True,
            save_model=True, save_score=True, save_prediction=True,
            prediction_types='dataframe', save_feature_importances=True,
            **fit_params):
    """Run (cross-)validated regression and persist requested artifacts.

    When ``cv_num > 1`` the project-level ``cross_validate`` wrapper is
    used (NOTE(review): it returns ``(results, scorers)`` and exposes
    ``results['estimators']`` / ``results['indices']`` — this is not
    sklearn's ``cross_validate`` contract; confirm against the wrapper).
    Otherwise a single train/validation split is delegated to ``self._fit``.

    Args:
        X, y: training data. NOTE(review): the prediction-saving path uses
            ``.iloc``, so pandas objects appear to be assumed — confirm.
        val_size: validation fraction for the single-split (cv_num <= 1)
            path.
        random_state: seed for numpy; pass ``False`` to skip reseeding.
        scoring: scorer spec forwarded to the CV routine.
        cv_num: number of CV folds; ``cv_params['cv_num']`` wins if larger.
        cv_params: extra kwargs forwarded to ``cross_validate``.
        save_train_val_idx, save_model, save_score, save_prediction,
        save_feature_importances: toggles for persisting the corresponding
            artifacts under the backend output path.
        prediction_types: one or more backend prediction formats.
        **fit_params: forwarded to the estimator's ``fit``.

    Returns:
        self, to allow call chaining.
    """
    # If user's cv_params contains 'cv_num', use the max value between the
    # function parameter 'cv_num' and cv_params' 'cv_num'.
    self.backend.logger.info('Start Cross Validation.')
    cv_start_time = time.time()
    if cv_params is None:
        cv_params = {}
    if 'cv_num' in cv_params.keys():
        cv_num = max(cv_num, cv_params['cv_num'])
        cv_params.pop('cv_num')
    if 'scoring' in cv_params.keys():
        # The explicit 'scoring' argument takes precedence over cv_params.
        cv_params.pop('scoring')
    return_train_score = cv_params.get('return_train_score', True)
    if cv_num > 1:
        # random_state=False deliberately skips reseeding numpy.
        if random_state is False:
            pass
        else:
            np.random.seed(random_state)
        results, scorers = \
            cross_validate(estimator=self.regressor, scoring=scoring,
                           fit_params=fit_params, X=X, y=y, cv=cv_num,
                           **cv_params)
    else:
        results, scorers = self._fit(
            X, y, self.regressor, val_size=val_size,
            return_train_score=return_train_score,
            random_state=random_state, scoring=scoring, **fit_params)
        cv_num = 1
    # TODO: now if scorers list length is more than 1, score_name only can
    # be the first of them.
    self.score_name = self.score_name if hasattr(self, 'score_name') \
        else list(scorers.keys())[0]
    # Keep the fold with the best test score; tag is '' for the
    # single-split case, 'cv_<i>' otherwise.
    self.best_score_, (self.best_model_, self.best_model_tag_) = \
        max(zip(results['test_{}'.format(self.score_name)],
                zip(results['estimators'],
                    [''] if cv_num == 1
                    else ["cv_{}".format(i) for i in range(cv_num)])),
            key=lambda x: x[0])
    self.backend.logger.info(
        "\tCV regression finish in {:.4f} seconds.".format(
            time.time() - cv_start_time))
    if save_score:
        # Fold-averaged scores; the train row is -1 when train scores were
        # not computed.
        write_file(
            os.path.join(self.backend.output_path, 'mean_scores.txt'),
            '{}\n{}\n{}'.format(
                ','.join(['dataset'] + list(scorers.keys())),
                ','.join(['test'] + [str(np.mean(results['test_{}'.format(
                    score_name)])) for score_name in scorers.keys()]),
                ','.join(['train'] + [str(np.mean(results['train_{}'.format(
                    score_name)])) for score_name in scorers.keys()])
                if return_train_score else -1))
    # Per-fold artifacts live under <output>/cv_<i>/.
    for cv_idx in range(cv_num):
        cv_tag = "cv_{}".format(cv_idx)
        cv_output_path = os.path.join(self.backend.output_path, cv_tag)
        create_path(cv_output_path, merge=True)
        if save_score:
            write_file(os.path.join(cv_output_path, 'scores.txt'),
                       '{}\n{}\n{}'.format(
                           ','.join(['dataset'] + list(scorers.keys())),
                           ','.join(['test'] + [str(results[
                               'test_{}'.format(score_name)][cv_idx])
                               for score_name in scorers.keys()]),
                           ','.join(['train'] + [str(results[
                               'train_{}'.format(score_name)][cv_idx])
                               for score_name in scorers.keys()])
                           if return_train_score else -1))
        score_model = results['estimators'][cv_idx]
        if save_model:
            self.backend.save_model(score_model, cv_tag)
        if save_feature_importances:
            self.backend.save_json(
                self.feature_importances_dict(score_model),
                cv_tag, name='feature_importances')
        if save_train_val_idx:
            # results['indices'][cv_idx] holds (train_idx, val_idx).
            train_idx = results['indices'][cv_idx][0]
            val_idx = results['indices'][cv_idx][1]
            write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                       "\n".join(list(map(str, train_idx))))
            write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                       "\n".join(list(map(str, val_idx))))
        if save_prediction:
            # NOTE(review): .iloc assumes X and y are pandas objects here;
            # an ndarray input would fail — confirm against callers.
            predictions = \
                score_model.predict(X.iloc[results['indices'][cv_idx][1]])
            targets_and_predictions = \
                np.array(list(zip(y.iloc[results['indices'][cv_idx][1]],
                                  predictions)))
            if not isinstance(prediction_types, list_like()):
                prediction_types = [prediction_types]
            for predict_type in prediction_types:
                if predict_type in self.backend.valid_predictions_type:
                    instance = getattr(self.backend,
                                       'save_predictions_as_{}'.format(
                                           predict_type))
                    instance(targets_and_predictions, cv_tag)
                else:
                    raise ValueError(
                        'predict_type {} is unknown, '
                        'Possible values are {}'.format(
                            predict_type,
                            self.backend.valid_predictions_type))
    return self