import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier


class BasePredictions:
    def __init__(self, **kwargs):
        self.X = kwargs.get("X")
        self.y = kwargs.get("y")
        self.combs = kwargs.get("combs")
        self.clf = kwargs.get("clf")
        # arguments with default values
        self.top = kwargs.get("top", 20)
        self.mean_fpr = np.linspace(0, 1, 100)
        self.n_estimators = kwargs.get("n_estimators", 500)
        self.class_weight = kwargs.get("class_weight", "balanced")
        self.min_samples_split = kwargs.get("min_samples_split", 3)
        self.min_samples_leaf = kwargs.get("min_samples_leaf", 3)
        self.colsample_bytree = kwargs.get("colsample_bytree", 0.6)
        self.learning_rate = kwargs.get("learning_rate", 0.1)
        self.random_state = kwargs.get("random_state", 125)
        self.max_depth = kwargs.get("max_depth", None)
        self.objective = kwargs.get("objective", 'binary:logistic')
        self.scale_pos_weight = kwargs.get("scale_pos_weight", 1)
        self._set_classifier()
        # containers for evaluation results
        self.predicted = dict()
        self.topfeat = dict()
        self.fpr = dict()
        self.tpr = dict()
        self.tprs = dict()
        self.auc = dict()
        self.precision = dict()
        self.recall = dict()
        self.avprec = dict()

    def _set_classifier(self):
        """Replace the classifier name stored in self.clf with an instantiated model."""
        clf_name = (self.clf or "").lower()
        if clf_name == "randomforest":
            self.clf = RandomForestClassifier(
                bootstrap=True,
                class_weight=self.class_weight,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                max_features='sqrt',
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state,
                n_jobs=-1)
        elif clf_name == "xgboost":
            if self.max_depth is None:
                self.max_depth = 5
            self.clf = xgb.XGBClassifier(
                learning_rate=self.learning_rate,
                colsample_bytree=self.colsample_bytree,
                random_state=self.random_state,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                scale_pos_weight=self.scale_pos_weight,
                objective=self.objective,
                n_jobs=-1)
        else:
            raise ValueError("only randomforest and xgboost are supported")
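
# Illustrative smoke test (hypothetical helper, not part of the original API):
# shows how BasePredictions might be instantiated. The toy DataFrame below is a
# placeholder.
def _demo_base_predictions():
    import pandas as pd

    X_demo = pd.DataFrame({"feat_a": [0.1, 0.4, 0.2, 0.9],
                           "feat_b": [1.0, 0.3, 0.8, 0.5]})
    y_demo = pd.Series([0, 1, 0, 1])
    base = BasePredictions(X=X_demo, y=y_demo, combs=None,
                           clf="randomforest", n_estimators=100)
    # _set_classifier has replaced the "randomforest" string with an estimator
    print(type(base.clf).__name__)  # -> RandomForestClassifier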
def fit(self, X, y, tree="rf", recursive=True, cv=5):
    """
    Fits to the data (X) and target (y) to determine the selected features.

    Args:
        X (pandas.DataFrame): input data; note that a numpy matrix is NOT
            accepted since X.columns is used for the feature names
        y (pandas.Series or np.ndarray): outputs used for fitting the tree
            model
        tree (str or instantiated sklearn tree-based model): if a model is
            fed directly, it must have the .feature_importances_ attribute
        recursive (bool): whether to recursively reduce the features (True)
            or do it only once (False)
        cv (int or CrossValidation): sklearn cross-validation; either an int
            (number of folds) or an instantiated cross-validation splitter

    Returns:
        self, with the class attribute .selected_features set
    """
    m0 = len(X.columns)
    if isinstance(tree, str):
        if tree.lower() in ["rf", "random forest", "randomforest"]:
            if self.mode.lower() in ["classification", "classifier"]:
                tree = RandomForestClassifier(random_state=self.rs)
            else:
                tree = RandomForestRegressor(random_state=self.rs)
        elif tree.lower() in ["gb", "gbt", "gradient boosting"]:
            if self.mode.lower() in ["classification", "classifier"]:
                tree = GradientBoostingClassifier(random_state=self.rs)
            else:
                tree = GradientBoostingRegressor(random_state=self.rs)
        else:
            raise AutomatminerError(
                "Unsupported tree_type {}!".format(tree))

    cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
    all_feats = []
    for train, _ in cv.split(X, y, groups=None):
        Xtrn = X.iloc[train]
        # y may be a pandas Series or a plain ndarray (see docstring)
        ytrn = y.iloc[train] if hasattr(y, "iloc") else y[train]
        all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
    # take the union of the features selected in each fold
    self.selected_features = list(set(all_feats))
    logger.info(
        self._log_prefix +
        "Finished tree-based feature reduction of {} initial features to "
        "{}".format(m0, len(self.selected_features)))
    return self
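
# Usage sketch (hypothetical wiring, since the class that owns fit() is not
# shown in this excerpt): the owning instance must provide .mode, .rs,
# ._log_prefix and a get_reduced_features() method, plus module-level
# `logger` and `AutomatminerError`. A typical call would then look like:
#
#   reducer.fit(X_train, y_train, tree="rf", recursive=True, cv=5)
#   X_reduced = X_train[reducer.selected_features]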
import sys

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def main():
    if len(sys.argv) == 4:
        database_filepath, classifier, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        x, y, category_names = load_data(database_filepath)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

        print('Building model...')
        if classifier.lower() == 'rf':
            message = 'Model used: RandomForest\n'
            classifier = RandomForestClassifier()
            cls_params = {
                'cls__estimator__n_estimators': [50, 100, 200],
                'cls__estimator__max_depth': [None, 1, 3, 5]
            }
        elif classifier.lower() == 'ad':
            message = 'Model used: AdaBoost\n'
            classifier = AdaBoostClassifier()
            cls_params = {
                'cls__estimator__base_estimator': [
                    DecisionTreeClassifier(max_depth=1),
                    RandomForestClassifier(max_depth=1)
                ],
                'cls__estimator__n_estimators': [25, 50, 100],
            }
        elif classifier.lower() == 'dt':
            message = 'Model used: DecisionTree\n'
            classifier = DecisionTreeClassifier()
            cls_params = {'cls__estimator__max_depth': [None, 1, 3, 5]}
        else:
            raise ValueError(f'{classifier.lower()} is not a valid '
                             f'classifier choice')

        params = {
            'vect__max_features': [None, 20, 50, 100],
            'tfidf__use_idf': [True, False],
            'tfidf__norm': ['l1', 'l2'],
            **cls_params
        }
        message = message + f'Parameters for GridSearch:\n{params}'
        print(message)
        model = build_model(classifier=classifier)

        print('Training model...')
        best_estimator = optimize_model(model=model, params=params,
                                        x=x_train, y=y_train)

        print('Evaluating model...')
        evaluate_model(best_estimator, x_test, y_test)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(best_estimator, model_filepath)

        print('Trained model saved!')
    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument, the classifier to use (rf, ad or dt) as '
              'the second argument, and the filepath of the pickle file to '
              'save the model to as the third argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db rf classifier.pkl')
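
# Example invocation, assuming the usual `if __name__ == '__main__': main()`
# entry point and that load_data, build_model, optimize_model, evaluate_model
# and save_model are defined earlier in this script (the paths below are
# placeholders):
#
#   python train_classifier.py ../data/DisasterResponse.db rf classifier.pkl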