def _test_classification_report(n_classes=2):
    """Train three classifiers and run the mask report checks on a fresh sample.

    A factory with gradient boosting, random forest and AdaBoost estimators
    is fitted on one generated sample, then evaluated on a second generated
    sample. The resulting report is passed to ``_classification_mask_report``
    with a string mask, a callable mask and no mask.

    :param int n_classes: number of classes in the generated samples; for more
        than two classes a ``{class index: str(class index)}`` label mapping
        is passed to the checks, otherwise ``None``.
    """
    factory = ClassifiersFactory()
    factory.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    factory.add_classifier('rf', RandomForestClassifier())
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    # fit on one sample ...
    train_X, train_y = generate_classification_sample(1000, 5, n_classes=n_classes)
    factory.fit(train_X, train_y)

    # ... and build the report on an independently generated one
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight=None))

    threshold = numpy.mean(X['column0'])
    labels_dict = {i: str(i) for i in range(n_classes)} if n_classes > 2 else None

    masks = (
        "column0 > %f" % threshold,
        lambda x: numpy.array(x['column0']) < threshold,
        None,
    )
    for mask in masks:
        _classification_mask_report(report, mask, X, labels_dict)
def test_factory():
    """Exercise ClassifiersFactory: fitting, prediction, staging, pickling, reports.

    This is a generator test: after the inline assertions it yields
    ``(check_report_with_mask, report, mask, X)`` tuples for a string mask,
    a callable mask and no mask.
    """
    factory = ClassifiersFactory()
    try:
        # TMVA is optional: skip it silently when it cannot be imported
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    expected_features = list(X.columns)

    # fit() is expected to return the factory itself
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for estimator in factory.values():
        assert list(estimator.features) == expected_features

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')

    for name, predicted in labels.items():
        score = accuracy_score(y, predicted)
        print(name, score)
        assert score > 0.7, name

    for name, probabilities in proba.items():
        assert numpy.allclose(probabilities.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(probabilities >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, probabilities[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for name, stages in factory.staged_predict_proba(X).items():
        assert name != 'tmva', 'tmva does not support staged pp'
        for stage in stages:
            assert stage.shape == (len(X), 2)

        # the final stage has to coincide with the plain predict_proba output
        assert numpy.all(stage == proba[name])

    # picklability: a dumped and reloaded factory must predict the same values
    restored = cPickle.loads(cPickle.dumps(factory))
    assert type(factory) == type(restored)

    original_probs = factory.predict_proba(X)
    restored_probs = restored.predict_proba(X)
    for name, probabilities in original_probs.items():
        assert numpy.all(probabilities == restored_probs[name]), 'something strange was loaded'

    rf_report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    rf_report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))

    # test_on_lds is called only to check that it runs; its result is not used
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    column_mean = numpy.mean(X['column0'])
    masks = (
        "column0 > %f" % (column_mean / 2.),
        lambda x: numpy.array(x['column0']) < column_mean * 2.,
        None,
    )
    for mask in masks:
        yield check_report_with_mask, report, mask, X
def __init__(self, base_estimators=None, bagging_base=None, stacking='xgb', features_stack=None,
             bagging_stack=None, hunting=False, transform=True, transform_pred=True):
    """Store the configuration of the level-0 classifiers and the stacking classifier.

    Nothing is trained here; the arguments are only normalised and stored
    on the instance.

    Parameters
    ----------
    base_estimators : dict('clf': classifier OR keyword-parameters)
        Contains all the level-0 classifiers. The key is the name of the
        classifier and the value is either a **prefitted** classifier or
        a dictionary containing the keyword arguments to instantiate such a
        classifier.
        If no pre-trained classifier is provided, the key-value has to be
        one of the following:

        - **'xgb'** creates an XGBoost classifier
        - **'rdf'** creates a Random Forest classifier
        - **'erf'** creates a Forest consisting of Extra Randomized Trees
        - **'nn'** creates an artificial Neural Network from TheaNets
        - **'ada'** creates an AdaBoost instance with Decision Trees as basis
        - **'gb'** creates a Gradient Boosted classifier with Decision Trees
          as basis

        If None, a copy of the class default configuration is used.
    bagging_base
        Stored unchanged as ``_bagging``.
    stacking : str, dict, classifier, False or None
        Configuration of the stacking classifier, stored as ``_clf_1``:

        - a string is stored as ``{stacking: None}``
        - a dict is stored as given
        - False or None stores an empty dict (no stacking classifier)
        - anything else is treated as a classifier and stored as
          ``{'clf_stacking': stacking}``
    features_stack
        Stored unchanged as ``_features_stack``.
    bagging_stack
        Stored unchanged as ``_clf_1_bagging``.
    hunting
        Stored unchanged as ``_hunting``.
    transform
        Stored unchanged as ``_transform_data``.
    transform_pred
        Stored unchanged as ``_transform_pred``.
    """
    if base_estimators is None:
        # Copy the defaults so per-instance changes never touch the class config.
        # (The original built this copy but discarded it, leaving the attribute unset.)
        self._base_estimators = OrderedDict(self.__DEFAULT_CLF_CFG)
    else:
        self._base_estimators = base_estimators

    if isinstance(stacking, str):
        self._clf_1 = {stacking: None}
    elif isinstance(stacking, dict):
        self._clf_1 = stacking
    elif stacking in (False, None):
        # The original only rebound the local name here and never set `_clf_1`.
        # NOTE(review): an empty dict is assumed to mean "no stacking" for the
        # rest of the class -- confirm against the code that reads `_clf_1`.
        self._clf_1 = {}
    else:
        self._clf_1 = {'clf_stacking': stacking}  # stacking is a classifier

    self._transform_data = transform
    # Previously accepted but silently dropped.
    # NOTE(review): attribute name chosen by analogy with `_transform_data`.
    self._transform_pred = transform_pred
    self._bagging = bagging_base
    self._hunting = hunting
    self._clf_1_bagging = bagging_stack
    self._features_stack = features_stack

    self._clf_0 = {}
    self._factory = ClassifiersFactory()
    self._base_scaler = None
    self._pred_scaler = None
# Load the pickled reports, import the user configuration and optionally
# restrict every report to the classifiers requested on the command line.
if args.verbose:
    fprint("Start loading input")
# NOTE(review): pickle.load can execute arbitrary code; only point args.dir
# at report files produced by a trusted training run.
with open(args.dir + "/train_uniform_reports.pkl", 'rb') as infile:
    reports = pickle.load(infile)
if args.verbose:
    fprint("Finish loading input")

from mods import config_path
config_path()
# the config is imported as a module, so drop a ".py" given on the command line
args.config = args.config.replace(".py", "")
# fromlist must be a sequence of names; a bare string is iterated per character
uconfig = getattr(__import__(args.config, fromlist=["uconfig"]), "uconfig")

# check for subset of classifiers
if args.classifiers:
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only);
    # the mapping itself is not modified inside the loop, only the reports
    for rname, report in reports.items():
        est_old = report.estimators
        pred_old = report.prediction
        est_new = ClassifiersFactory()
        pred_new = OrderedDict()
        for classifier in args.classifiers:
            if classifier in est_old:
                est_new[classifier] = est_old[classifier]
                pred_new[classifier] = pred_old[classifier]
            else:
                raise ValueError("Requested classifier " + classifier + " not found in report " + rname)
        report.estimators = est_new
        report.prediction = pred_new

# to serialize plots
repplots = OrderedDict()
matplots = OrderedDict()
# Unpack the split: entry 0 and entry 1 of each split go to the X and Y names
# (presumably features and labels -- confirm against the code that builds them).
trainX, trainY = train_data[0], train_data[1]
testX, testY = test_data[0], test_data[1]

# Entries from index 2 onwards are matched to the names in W, in sorted order.
trainW, testW = {}, {}
for iw, weight in enumerate(sorted(W)):
    trainW[weight], testW[weight] = train_data[iw + 2], test_data[iw + 2]

if args.verbose:
    fprint(
        "Split data into train_size=" + str(uconfig.training.size)
        + ", test_size=" + str(uconfig.training.size)
    )

# create classifiers
classifiers = ClassifiersFactory()
weights = OrderedDict()

# standard bdt
if "bdt" in uconfig.training.algorithms:
    base_grad = GradientBoostingClassifier(
        max_depth=uconfig.hyper.max_depth,
        n_estimators=uconfig.hyper.n_estimators,
        subsample=uconfig.hyper.subsample,
        learning_rate=uconfig.hyper.learning_rate,
        min_samples_leaf=uconfig.hyper.min_samples_leaf,
    )
    classifiers["bdt"] = SklearnClassifier(base_grad, features=uconfig.features.train)
    # the algorithms mapping names the weight set to train this classifier with
    weights["bdt"] = trainW[uconfig.training.algorithms["bdt"]]