def _split_into_subroutines(self, trees_ast, trees_num_leaves):
    """Group tree expressions into subroutines bounded by the leaves cutoff.

    Trees are accumulated in input order; whenever appending the next tree
    would push the running leaf count past ``self._leaves_cutoff_threshold``
    (and the current group is non-empty), the group is closed into a
    ``SubroutineExpr`` and a fresh group is started.
    """
    subroutines = []
    pending = []
    pending_leaves = 0
    for tree, num_leaves in zip(trees_ast, trees_num_leaves):
        if pending and pending_leaves + num_leaves > self._leaves_cutoff_threshold:
            # Exceeded the max leaves in the current subroutine,
            # finalize this one and start a new one.
            summed = utils.apply_op_to_expressions(
                ast.BinNumOpType.ADD, *pending)
            subroutines.append(ast.SubroutineExpr(summed))
            pending = []
            pending_leaves = 0
        pending_leaves += num_leaves
        pending.append(tree)
    if pending:
        # Flush the last, possibly under-filled group.
        summed = utils.apply_op_to_expressions(
            ast.BinNumOpType.ADD, *pending)
        subroutines.append(ast.SubroutineExpr(summed))
    return subroutines
def _assemble_multi_class_output(self):
    """Build the one-vs-one decision vector for a multi-class SVM model.

    Returns a ``VectorVal`` containing one decision expression per class
    pair (i, j) with i < j, each of the form
    ``intercept + sum(kernel_k * dual_coef_k)``.
    """
    support_vectors = self.model.support_vectors_
    coef = self.model.dual_coef_
    intercept = self.model.intercept_
    n_support = self.model.n_support_

    n_support_len = len(n_support)

    # Kernel expressions are shared by every pairwise decision, so mark
    # them reusable.
    kernel_exprs = self._apply_kernel(support_vectors, to_reuse=True)

    # Per-class [start, end) ranges into the flat support-vector array.
    support_ranges = []
    for i in range(n_support_len):
        range_start = sum(n_support[:i])
        range_end = range_start + n_support[i]
        support_ranges.append((range_start, range_end))

    # One-vs-one decisions.
    decisions = []
    for i in range(n_support_len):
        for j in range(i + 1, n_support_len):
            # NOTE(review): indexing appears to follow scikit-learn's
            # dual_coef_ layout, where row i holds weights applied to
            # class j's support vectors and row j-1 holds weights for
            # class i's support vectors — confirm against sklearn docs.
            kernel_weight_mul_ops = [
                utils.mul(kernel_exprs[k], ast.NumVal(coef[i][k]))
                for k in range(*support_ranges[j])
            ]
            kernel_weight_mul_ops.extend([
                utils.mul(kernel_exprs[k], ast.NumVal(coef[j - 1][k]))
                for k in range(*support_ranges[i])
            ])
            # len(decisions) doubles as the running pair index into
            # intercept_.
            decision = utils.apply_op_to_expressions(
                ast.BinNumOpType.ADD,
                ast.NumVal(intercept[len(decisions)]),
                *kernel_weight_mul_ops)
            decisions.append(decision)

    return ast.VectorVal(decisions)
def _linear_to_ast(coef, intercept):
    """Assemble ``intercept + sum(feature_i * coef_i)`` as an AST expression."""
    weighted_features = []
    for index, value in enumerate(coef):
        weighted_features.append(
            utils.mul(ast.FeatureRef(index), ast.NumVal(value)))
    return utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(intercept),
        *weighted_features)
def _assemble_single_output(self, trees, base_score=0):
    """Sum all assembled trees together with the base score.

    Respects ``self._tree_limit`` by truncating the tree list when set.
    """
    if self._tree_limit:
        trees = trees[:self._tree_limit]
    summands = [self._assemble_tree(tree) for tree in trees]
    total = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(base_score),
        *summands)
    return ast.SubroutineExpr(total)
def softmax(exprs):
    """Return softmax expressions ``exp(e_i) / sum_j exp(e_j)`` for each input."""
    exponents = [ast.ExpExpr(expr, to_reuse=True) for expr in exprs]
    # The denominator is shared across every output, so mark it reusable.
    denominator = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD, *exponents, to_reuse=True)
    result = []
    for exponent in exponents:
        result.append(
            ast.BinNumExpr(exponent, denominator, ast.BinNumOpType.DIV))
    return result
def _rbf_kernel(self, support_vector):
    """RBF kernel: ``exp(-gamma * sum_i (sv_i - x_i)^2)``."""
    squared_diffs = []
    for idx, elem in enumerate(support_vector):
        diff = utils.sub(ast.NumVal(elem), ast.FeatureRef(idx))
        squared_diffs.append(ast.PowExpr(diff, ast.NumVal(2)))
    squared_distance = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD, *squared_diffs)
    return ast.ExpExpr(utils.mul(self._neg_gamma_expr, squared_distance))
def _assemble_single_output(self, estimator_params, base_score=0, split_idx=0):
    """Sum assembled estimators with the base score and apply the final transform."""
    estimators = self._assemble_estimators(estimator_params, split_idx)
    raw_sum = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(base_score),
        *estimators)
    return self._final_transform(raw_sum)
def assemble(self):
    """Average the assembled estimator trees: ``sum(trees) * (1/n_estimators)``."""
    tree_exprs = [
        TreeModelAssembler(tree).assemble()
        for tree in self.model.estimators_
    ]
    total = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD, *tree_exprs)
    return utils.apply_bin_op(
        total,
        ast.NumVal(1 / self.model.n_estimators),
        ast.BinNumOpType.MUL)
def assemble(self):
    """Sum per-tree expressions, each pre-scaled by ``1/n_estimators``."""
    coef = 1.0 / self.model.n_estimators

    def scaled_tree(tree):
        # Each tree goes into its own subroutine before scaling.
        tree_expr = ast.SubroutineExpr(TreeModelAssembler(tree).assemble())
        return utils.apply_bin_op(
            tree_expr, ast.NumVal(coef), ast.BinNumOpType.MUL)

    scaled = [scaled_tree(tree) for tree in self.model.estimators_]
    return utils.apply_op_to_expressions(ast.BinNumOpType.ADD, *scaled)
def _assemble_single_output(self):
    """Assemble ``intercept + sum(kernel_k * dual_coef_k)`` for a binary SVM.

    Uses the first (and only) row of ``dual_coef_``/``intercept_``, as is
    the case for two-class models.
    """
    support_vectors = self.model.support_vectors_
    coef = self.model.dual_coef_[0]
    intercept = self.model.intercept_[0]

    kernel_exprs = self._apply_kernel(support_vectors)

    # Comprehension instead of an append loop — same construction style as
    # the multi-output assembler variant.
    kernel_weight_mul_ops = [
        utils.mul(kernel_exprs[index], ast.NumVal(value))
        for index, value in enumerate(coef)
    ]
    return utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(intercept),
        *kernel_weight_mul_ops)
def _assemble_single_output(self, idx=0):
    """Assemble ``intercept + sum(kernel_k * dual_coef_k)`` for output *idx*."""
    support_vectors = self.model.support_vectors_
    dual_coef = self._get_single_coef(idx)
    bias = self._get_single_intercept(idx)
    kernel_exprs = self._apply_kernel(support_vectors)
    weighted = []
    for pos, weight in enumerate(dual_coef):
        weighted.append(utils.mul(kernel_exprs[pos], ast.NumVal(weight)))
    return utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(bias),
        *weighted)
def _cosine_kernel(self, support_vector):
    """Cosine similarity kernel: ``<sv/||sv||, x> / ||x||`` with zero-norm guards."""
    sv_norm = np.linalg.norm(support_vector)
    if sv_norm == 0.0:
        # Avoid dividing the support vector by zero.
        sv_norm = 1.0
    squares = [
        utils.mul(ast.FeatureRef(i), ast.FeatureRef(i))
        for i in range(len(support_vector))
    ]
    feature_norm = ast.SqrtExpr(
        utils.apply_op_to_expressions(ast.BinNumOpType.ADD, *squares),
        to_reuse=True)
    # Substitute 1.0 when the feature vector norm is exactly zero so the
    # generated code never divides by zero.
    safe_feature_norm = ast.IfExpr(
        utils.eq(feature_norm, ast.NumVal(0.0)),
        ast.NumVal(1.0),
        feature_norm)
    numerator = self._linear_kernel(support_vector / sv_norm)
    return utils.div(numerator, safe_feature_norm)
def test_linear_model():
    # Default updater ("shotgun") is nondeterministic
    estimator = xgb.XGBRegressor(n_estimators=2, random_state=1,
                                 updater="coord_descent",
                                 feature_selector="shuffle",
                                 booster="gblinear")
    utils.get_regression_model_trainer()(estimator)

    assembler = XGBoostModelAssemblerSelector(estimator)
    actual = assembler.assemble()

    # One learned weight per feature, in feature order.
    coefficients = [
        -0.154567, 0.0815865, -0.0979713, 4.80472, 1.35478, 0.327222,
        0.0610654, 0.46989, -0.0674318, -0.000506212, 0.0732867,
        0.0108842, -0.140096,
    ]
    feature_weight_mul = [
        ast.BinNumExpr(ast.FeatureRef(index), ast.NumVal(weight),
                       ast.BinNumOpType.MUL)
        for index, weight in enumerate(coefficients)
    ]

    expected = ast.BinNumExpr(
        ast.NumVal(0.5),
        apply_op_to_expressions(ast.BinNumOpType.ADD,
                                ast.NumVal(11.138),
                                *feature_weight_mul),
        ast.BinNumOpType.ADD)

    assert utils.cmp_exprs(actual, expected)
def _assemble_single_output(self, trees, base_score=0):
    """Assemble trees into one summed output, splitting into multiple
    subroutines when the total leaf count exceeds the cutoff threshold.
    """
    if self._tree_limit:
        trees = trees[:self._tree_limit]

    trees_ast = [ast.SubroutineExpr(self._assemble_tree(t)) for t in trees]
    to_sum = trees_ast

    # In a large tree we need to generate multiple subroutines to avoid
    # java limitations https://github.com/BayesWitnesses/m2cgen/issues/103.
    trees_num_leaves = [self._count_leaves(t) for t in trees]
    if sum(trees_num_leaves) > self._leaves_cutoff_threshold:
        to_sum = self._split_into_subroutines(trees_ast, trees_num_leaves)

    combined = utils.apply_op_to_expressions(
        ast.BinNumOpType.ADD,
        ast.NumVal(base_score),
        *to_sum)
    return ast.SubroutineExpr(self._final_transform(combined))
def _linear_kernel(self, support_vector):
    """Dot product of the support vector with the feature vector."""
    products = []
    for idx, elem in enumerate(support_vector):
        products.append(utils.mul(ast.NumVal(elem), ast.FeatureRef(idx)))
    return utils.apply_op_to_expressions(ast.BinNumOpType.ADD, *products)