def test_predict_unfitted(self):
    rules = TrainableRules(
        base_model_name="sklearn.tree.DecisionTreeClassifier",
        prune_branches_algorithms=[], prune_attributes=False,
        confidence_threshold=0)
    with self.assertRaises(NotFittedError):
        rules.predict(self.test_x)
def test_budget(budget):
    # Nested helper: relies on `self` from the enclosing test method.
    rules = TrainableRules(
        "sklearn.tree.DecisionTreeClassifier",
        prune_branches_algorithms=["top-down-greedy"], prune_attributes=False,
        top_down_greedy_budget=(False, budget), random_state=1989)
    rules.fit(self.train_x, self.train_y)
    return rules.score(self.train_x, self.train_y)
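# Hedged sketch of an enclosing test for the helper above. Assumptions: the
# helper is nested in a TestCase method (so `self` resolves), and a larger
# top-down-greedy budget keeps more rules, so the training score should be
# non-decreasing in the budget. The test name and the budget grid are
# illustrative, not the project's actual test.
def test_top_down_greedy_budget_sweep(self):
    def test_budget(budget):
        rules = TrainableRules(
            "sklearn.tree.DecisionTreeClassifier",
            prune_branches_algorithms=["top-down-greedy"], prune_attributes=False,
            top_down_greedy_budget=(False, budget), random_state=1989)
        rules.fit(self.train_x, self.train_y)
        return rules.score(self.train_x, self.train_y)

    scores = [test_budget(budget) for budget in (0.25, 0.5, 0.75, 1.0)]
    self.assertEqual(scores, sorted(scores))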
def test_predict_winner_indices(self):
    rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                           prune_branches_algorithms=[], prune_attributes=False,
                           min_samples_leaf=26, random_state=1989)
    rules.fit(self.train_x, self.train_y)
    pred_y, winners = rules.rules.predict(self.train_x, return_winner_indices=True)
    for ycls, w in zip(pred_y, winners):
        self.assertEqual(ycls, rules.rules.rules[w].stats.cls)
def test_tree_attr_pruning(self):
    model = tree.DecisionTreeClassifier(min_samples_leaf=26, random_state=1989)
    model = model.fit(self.train_x, self.train_y)
    rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                           prune_branches_algorithms=[], prune_attributes=True,
                           min_samples_leaf=26, random_state=1989)
    rules.fit(self.train_x, self.train_y)
    tree_score = model.score(self.test_x, self.test_y)
    rules_score = rules.score(self.test_x, self.test_y)
    self.assertGreater(rules_score * 1.1, tree_score)
def test_tree_no_pruning(self):
    model = tree.DecisionTreeClassifier(min_samples_leaf=26, random_state=1989)
    model = model.fit(self.train_x, self.train_y)
    rules = TrainableRules(
        base_model_name="sklearn.tree.DecisionTreeClassifier",
        prune_branches_algorithms=[], confidence_threshold=0,
        prune_attributes=False, min_samples_leaf=26, random_state=1989)
    rules.fit(self.train_x, self.train_y)
    tree_score = model.score(self.train_x, self.train_y)
    rules_score = rules.score(self.train_x, self.train_y)
    self.assertGreater(rules_score * 1.1, tree_score)
def test_forest_no_pruning(self):
    model = ensemble.RandomForestClassifier(n_estimators=50, min_samples_leaf=26,
                                            random_state=1989)
    model = model.fit(self.train_x, self.train_y)
    rules = TrainableRules("sklearn.ensemble.RandomForestClassifier",
                           prune_branches_algorithms=[], prune_attributes=False,
                           n_estimators=50, min_samples_leaf=26, random_state=1989)
    rules.fit(self.train_x, self.train_y)
    forest_score = model.score(self.train_x, self.train_y)
    rules_score = rules.score(self.train_x, self.train_y)
    self.assertGreater(rules_score * 1.1, forest_score)
def test_rules_estimator(self):
    estimator = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                               prune_branches_algorithms=[],
                               prune_attributes=False)
    scores = model_selection.cross_val_score(estimator, self.x, self.y)
    score = sum(scores) / len(scores)
    self.assertGreater(score, .5)
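# cross_val_score clones its estimator, so the test above implicitly checks
# that TrainableRules follows the scikit-learn estimator protocol
# (get_params/set_params plus fit/predict/score). A minimal sketch of what
# that compatibility buys, assuming only that the protocol holds:
from sklearn.base import clone

estimator = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                           prune_branches_algorithms=[], prune_attributes=False)
fresh = clone(estimator)  # unfitted copy rebuilt from get_params()
assert fresh.get_params()["prune_attributes"] is False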
def setUp(self):
    (self.train_x, self.test_x, self.train_y, self.test_y), _, _ = \
        load_abalone_data()
    self.config = {
        "trainable_rules": {
            "base_model_name": "sklearn.tree.DecisionTreeClassifier",
            "prune_branches_algorithms": [],
            "prune_attributes": False,
            "min_samples_leaf": 26,
            "random_state": 1989,
        },
    }
    trainer = TrainableRules(**self.config["trainable_rules"],
                             origin_config=self.config)
    trainer.fit(self.test_x, self.test_y)
    self.rules = trainer.rules
    self.fm = FormatModel().load(
        os.path.join(os.path.dirname(__file__), "model_jquery.asdf"))
    self.maxDiff = None
def test_integration(self):
    res = self.extractor.extract_features(self.files)
    self.assertIsNotNone(res, "Failed to parse files.")
    X, y, _ = res
    train_X, test_X, train_y, test_y = \
        model_selection.train_test_split(X, y, random_state=1989)
    model = tree.DecisionTreeClassifier(min_samples_leaf=26, random_state=1989,
                                        max_depth=None, max_features="auto",
                                        min_samples_split=2)
    model.fit(train_X, train_y)
    rules = TrainableRules(
        base_model_name="sklearn.tree.DecisionTreeClassifier",
        prune_branches_algorithms=[], prune_attributes=False,
        min_samples_leaf=26, random_state=1989, max_depth=None,
        max_features="auto", min_samples_split=2, confidence_threshold=0)
    rules.fit(train_X, train_y)
    model_score_train = model.score(train_X, train_y)
    model_score_test = model.score(test_X, test_y)
    rules_score_train = rules.score(train_X, train_y)
    rules_score_test = rules.score(test_X, test_y)
    self.assertEqual(rules_score_train, model_score_train)
    self.assertEqual(rules_score_test, model_score_test)
def test_integration(self):
    X, y, _ = self.extractor.extract_features(self.files)
    train_X, test_X, train_y, test_y = \
        model_selection.train_test_split(X, y, random_state=1989)
    model = tree.DecisionTreeClassifier(min_samples_leaf=26, random_state=1989,
                                        max_depth=None, max_features="auto",
                                        min_samples_split=2)
    model.fit(train_X, train_y)
    rules = TrainableRules("sklearn.tree.DecisionTreeClassifier",
                           prune_branches_algorithms=[], prune_attributes=False,
                           min_samples_leaf=26, random_state=1989,
                           max_depth=None, max_features="auto",
                           min_samples_split=2)
    rules.fit(train_X, train_y)
    model_score_train = model.score(train_X, train_y)
    model_score_test = model.score(test_X, test_y)
    rules_score_train = rules.score(train_X, train_y)
    rules_score_test = rules.score(test_X, test_y)
    self.assertEqual(rules_score_train, model_score_train)
    self.assertEqual(rules_score_test, model_score_test)
def test_reduced_error_prune(self):
    LEAF = _tree.TREE_LEAF
    UNDEFINED = _tree.TREE_UNDEFINED

    class FakeFeature:
        def __getitem__(self, item):
            pass

        def __setitem__(self, key, value):
            pass

    class FakeTree:
        def __init__(self, *args):
            self.children_left = numpy.array(args[0])
            self.children_right = numpy.array(args[1])
            self.feature = FakeFeature()
            self.value = numpy.array([
                [[50, 100]], [[45, 50]], [[5, 40]], [[40, 10]], [[2, 20]],
                [[3, 20]], [[20, 5]], [[20, 5]], [[1, 19]], [[2, 1]], [[5, 50]],
            ])

    class FakeModel:
        def __init__(self):
            self.tree_ = FakeTree(
                [1, 2, 4, 6, 8, LEAF, LEAF, LEAF, LEAF, LEAF, LEAF],
                [10, 3, 5, 7, 9, LEAF, LEAF, LEAF, LEAF, LEAF, LEAF],
            )
            self.classes_ = [0, 1]

        def decision_path(self, X):
            return sparse.csr_matrix([
                [1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
                [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
                [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
                [1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
                [1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
            ])

        def predict(self, X):
            return numpy.array([1, 0, 1, 0, 0, 1])

    class FakeX:
        shape = [6]

    model = FakeModel()
    pruned_model = TrainableRules._prune_reduced_error(
        model, FakeX, numpy.array([1, 1, 1, 0, 0, 1]))
    self.assertEqual(list(pruned_model.tree_.children_left),
                     [1, 2, LEAF, LEAF, UNDEFINED, UNDEFINED, UNDEFINED,
                      UNDEFINED, UNDEFINED, UNDEFINED, LEAF])
    self.assertEqual(list(pruned_model.tree_.children_right),
                     [10, 3, LEAF, LEAF, UNDEFINED, UNDEFINED, UNDEFINED,
                      UNDEFINED, UNDEFINED, UNDEFINED, LEAF])
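# The fakes above drive TrainableRules._prune_reduced_error. For context,
# reduced-error pruning walks the tree bottom-up and replaces an internal node
# with a leaf whenever accuracy on a held-out pruning set does not drop.
# Below is a self-contained conceptual sketch on a real sklearn tree; it
# mirrors the idea only and is NOT the project's implementation. It mutates
# the children arrays in place, which works because sklearn exposes them as
# writable views into the tree's node record array.
import numpy
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, _tree


def prune_reduced_error_sketch(model, X_prune, y_prune):
    """Replace subtrees with leaves while held-out accuracy does not drop."""
    left = model.tree_.children_left    # views into the tree's node array
    right = model.tree_.children_right
    changed = True
    while changed:
        changed = False
        baseline = (model.predict(X_prune) == y_prune).sum()
        for node in range(model.tree_.node_count):
            # consider only internal nodes whose children are both leaves
            if left[node] == _tree.TREE_LEAF:
                continue
            if left[left[node]] != _tree.TREE_LEAF \
                    or left[right[node]] != _tree.TREE_LEAF:
                continue
            saved = left[node], right[node]
            left[node] = right[node] = _tree.TREE_LEAF  # tentatively prune
            if (model.predict(X_prune) == y_prune).sum() >= baseline:
                changed = True  # keep the prune, rescan from the top
                break
            left[node], right[node] = saved  # accuracy dropped - revert
    return model


X, y = datasets.load_iris(return_X_y=True)
X_train, X_prune, y_train, y_prune = train_test_split(X, y, random_state=1989)
pruned = prune_reduced_error_sketch(
    DecisionTreeClassifier(random_state=1989).fit(X_train, y_train),
    X_prune, y_prune)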
def train(cls, ptr: ReferencePointer, config: Dict[str, Any],
          data_request_stub: DataStub, **data) -> AnalyzerModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_request_stub: connection to the Lookout data retrieval service, not used.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    config = cls._load_train_config(config)
    cls.log.info("train %s %s %s", ptr.url, ptr.commit,
                 pformat(config, width=4096, compact=True))
    files_by_language = cls._files_by_language(data["files"])
    model = FormatModel().construct(cls, ptr)
    for language, files in files_by_language.items():
        language = language.lower()
        try:
            fe = FeatureExtractor(language=language,
                                  siblings_window=config["siblings_window"],
                                  parents_depth=config["parents_depth"])
        except ImportError:
            cls.log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            cls.log.info("training on %d %s files", len(files), language)
        # we sort to make the features reproducible
        X, y, _ = fe.extract_features(f[1] for f in sorted(files.items()))
        lower_bound_instances = config["lower_bound_instances"]
        if X.shape[0] < lower_bound_instances:
            cls.log.warning("skipped %d %s files: too few samples (%d/%d)",
                            len(files), language, X.shape[0], lower_bound_instances)
            continue
        cls.log.debug("training the rules model")
        bscv = BayesSearchCV(
            TrainableRules(
                prune_branches_algorithms=config["prune_branches_algorithms"],
                prune_attributes=config["prune_attributes"],
                top_down_greedy_budget=config["top_down_greedy_budget"],
                uncertain_attributes=config["uncertain_attributes"],
                prune_dataset_ratio=config["prune_dataset_ratio"],
                n_estimators=config["n_estimators"],
                random_state=config["random_state"]),
            {"base_model_name": Categorical(["sklearn.ensemble.RandomForestClassifier",
                                             "sklearn.tree.DecisionTreeClassifier"]),
             "max_depth": Categorical([None, 5, 10]),
             "max_features": Categorical([None, "auto"]),
             "min_samples_split": Integer(2, 20),
             "min_samples_leaf": Integer(1, 20)},
            n_jobs=-1,
            n_iter=config["n_iter"],
            random_state=config["random_state"])
        bscv.fit(X, y)
        cls.log.debug("score of the best estimator found: %.3f", bscv.best_score_)
        cls.log.debug("params of the best estimator found: %s", str(bscv.best_params_))
        cls.log.debug("training the model with complete data")
        trainable_rules = TrainableRules(prune_branches_algorithms=["reduced-error"],
                                         prune_attributes=True, random_state=42,
                                         uncertain_attributes=True,
                                         **bscv.best_params_)
        trainable_rules.fit(X, y)
        model[language] = trainable_rules.rules
    cls.log.info("trained %s", model)
    return model
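# Hedged driver sketch for the classmethod above. Assumptions: the enclosing
# class is FormatAnalyzer (suggested by the FormatModel usage), ReferencePointer
# is the Lookout (url, ref, commit) named tuple, `files` holds the File records
# of the pointed state, and _load_train_config substitutes defaults for keys
# missing from the incoming config, so an empty dict is acceptable here.
ptr = ReferencePointer(url="https://github.com/user/repo",
                       ref="refs/heads/master",
                       commit="1" * 40)
model = FormatAnalyzer.train(ptr, config={}, data_request_stub=None, files=files)
model.save("format-model.asdf")  # modelforge models serialize to ASDF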
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
          data_service: DataService, files: Iterator[File], **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :param files: iterator of File records from the data service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)["train"]
    _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
              pformat(train_config, width=4096, compact=True))
    model = FormatModel().generate(cls, ptr)
    for language, files in files_by_language(files).items():
        try:
            lang_config = train_config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        _log.info("effective train config for %s:\n%s", language,
                  pformat(lang_config, width=120, compact=True))
        random_state = lang_config["random_state"]
        files = filter_files(files, lang_config["line_length_limit"],
                             lang_config["overall_size_limit"], random_state, _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        train_files, test_files = FormatAnalyzer.split_train_test(
            files, lang_config["test_dataset_ratio"], random_state=random_state)
        # ensure that the features are reproducible
        train_files = sorted(train_files, key=lambda x: x.path)
        test_files = sorted(test_files, key=lambda x: x.path)
        X_train, y_train, _ = fe.extract_features(train_files)
        X_train, selected_features = fe.select_features(X_train, y_train)
        if test_files:
            X_test, y_test, _ = fe.extract_features(test_files)
        if lang_config["test_dataset_ratio"]:
            _log.debug("Real test ratio is %.3f",
                       X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                       if test_files else 0)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = \
            fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X_train.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X_train.shape[0], lower_bound_instances)
            continue
        _log.info("extracted %d samples to train, searching for the best hyperparameters",
                  X_train.shape[0])
        optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
        best_score, best_params = optimizer.optimize(X_train, y_train)
        if _log.isEnabledFor(logging.DEBUG):
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s", str(best_params))
            _log.debug("training the model with complete data")
        else:
            _log.info("finished hyperopt at %.6f, training the full model", -best_score)
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         random_state=random_state,
                                         origin_config=lang_config)
        trainable_rules.fit(X_train, y_train)
        importances = trainable_rules.feature_importances_
        _log.debug("feature importances from %s:\n\t%s",
                   lang_config["trainable_rules"]["base_model_name"],
                   "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                               for i in numpy.argsort(-importances)[:25]
                               if importances[i] > 1e-5))
        trainable_rules.prune_categorical_attributes(fe)
        _log.info("obtained %d rules, generating the classification report",
                  len(trainable_rules.rules))
        trainable_rules.rules.generate_classification_report(
            X_train, y_train, "train", fe.composite_class_representations)
        if test_files:
            trainable_rules.rules.generate_classification_report(
                X_test, y_test, "test", fe.composite_class_representations)
        submit_event("%s.train.%s.rules" % (cls.name, language),
                     len(trainable_rules.rules))
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipped.", language)
    _log.info("trained %s", model)
    return model
def test_predict_unfitted(self): rules = TrainableRules("sklearn.tree.DecisionTreeClassifier", prune_branches_algorithms=[], prune_attributes=False) with self.assertRaises(NotFittedError): rules.predict(self.test_x)
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
          data_service: DataService, **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    _log.info("train %s %s %s", ptr.url, ptr.commit,
              pformat(config, width=4096, compact=True))
    model = FormatModel().construct(cls, ptr)
    config = cls._load_train_config(config)
    for language, files in files_by_language(data["files"]).items():
        try:
            lang_config = config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        files = filter_files(files, lang_config["line_length_limit"], _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        # we sort to make the features reproducible
        X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
        X, selected_features = fe.select_features(X, y)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = \
            fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X.shape[0], lower_bound_instances)
            continue
        _log.debug("training the rules model")
        optimizer = Optimizer(n_jobs=lang_config["n_jobs"],
                              n_iter=lang_config["n_iter"],
                              cv=lang_config["cv"],
                              random_state=lang_config["trainable_rules"]["random_state"])
        best_score, best_params = optimizer.optimize(X, y)
        _log.debug("score of the best estimator found: %.6f", best_score)
        _log.debug("params of the best estimator found: %s", str(best_params))
        _log.debug("training the model with complete data")
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         origin_config=lang_config)
        trainable_rules.fit(X, y)
        importances = trainable_rules.feature_importances_
        _log.debug("feature importances from %s:\n\t%s",
                   lang_config["trainable_rules"]["base_model_name"],
                   "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                               for i in numpy.argsort(-importances)[:25]
                               if importances[i] > 1e-5))
        submit_event("%s.train.%s.rules" % (cls.name, language),
                     len(trainable_rules.rules))
        # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
        # throw away imprecise classes
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipping.", language)
    _log.info("trained %s", model)
    return model