def boosting_last_predict(tree, fields, input_data, path=None):
    """Predict with a boosting tree using the "last prediction" strategy.

    Starting at `tree`, the first child whose predicate holds for
    `input_data` is followed and its rule is appended to `path`. The
    descent ends at the first node that has no children, or none whose
    predicate applies; a `Prediction` built from that node's output,
    count and children is returned.

    """
    if path is None:
        path = []

    current = tree
    while True:
        node = get_node(current)
        if node[OFFSETS["children#"]] == 0:
            children = []
        else:
            children = node[OFFSETS["children"]]
        count = node[OFFSETS["count"]]

        for candidate in children:
            operator, field, value, term, missing = get_predicate(candidate)
            field_info = fields[field]
            if apply_predicate(operator, field, value, term, missing,
                               input_data, field_info):
                path.append(predicate_to_rule(operator, field_info, value,
                                              term, missing))
                # keep descending from the matching child
                current = candidate
                break
        else:
            # no child applies (or there are none): predict from this node
            return Prediction(
                node[OFFSETS["output"]],
                path,
                None,
                distribution=None,
                count=count,
                median=None,
                distribution_unit=None,
                children=children,
                d_min=None,
                d_max=None)
def get_tree_leaves(tree, fields, path, leaves, filter_function=None):
    """Collect the leaves found under `tree` into the `leaves` list.

    `path` is extended in place with the rule of the current node (when
    its predicate is a list) and a copy of it is handed to each child.
    Every leaf is described by a dictionary with its id, confidence,
    count, distribution, impurity, output and path, plus the weighted
    distribution and weight when `offsets` declares them. A leaf is kept
    only if `filter_function` is not callable or returns a true value
    for it.

    `offsets` is not a parameter: it is read from the enclosing scope.

    Returns the same `leaves` list that was received, after appending
    the selected leaves to it.

    """
    node = get_node(tree)
    predicate = get_predicate(tree)
    if isinstance(predicate, list):
        operator, field, value, term, missing = predicate
        path.append(to_lisp_rule(operator, field, value, term, missing,
                                 fields[field]))

    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]

    if children:
        for child in children:
            # The recursive call appends to `leaves` in place and returns
            # that very same list, so its result must not be added again:
            # `leaves += leaves` would duplicate every leaf gathered so far.
            get_tree_leaves(child, fields, path[:], leaves,
                            filter_function=filter_function)
        return leaves

    leaf = {
        'id': node[offsets["id"]],
        'confidence': node[offsets["confidence"]],
        'count': node[offsets["count"]],
        'distribution': node[offsets["distribution"]],
        'impurity': gini_impurity(node[offsets["distribution"]],
                                  node[offsets["count"]]),
        'output': node[offsets["output"]],
        'path': path}
    if 'weighted_distribution' in offsets:
        leaf.update({
            "weighted_distribution": node[offsets["weighted_distribution"]],
            "weight": node[offsets["weight"]]})
    if not callable(filter_function) or filter_function(leaf):
        leaves.append(leaf)
    return leaves
def boosting_proportional_predict(tree, fields, input_data, path=None,
                                  missing_found=False):
    """Proportional prediction for a boosting tree.

    Makes a prediction based on a number of field values, considering
    all the predictions of the leaves that fall in a subtree.

    Each time a splitting field has no value assigned, both branches of
    the split are considered to be true and their predictions are merged.
    The function returns a `(g_sum, h_sum, count, path)` tuple, where
    `path` holds the rules of the last node reached by a unique path.

    """
    if path is None:
        path = []

    node = get_node(tree)
    if node[OFFSETS["children#"]] == 0:
        children = []
    else:
        children = node[OFFSETS["children"]]
    node_g_sum = node[OFFSETS["g_sum"]]
    node_h_sum = node[OFFSETS["h_sum"]]
    node_count = node[OFFSETS["count"]]

    if not children:
        return (node_g_sum, node_h_sum, node_count, path)

    single_path = (
        one_branch(children, input_data)
        or fields[children[0][FIELD_OFFSET]]["optype"] in ["text", "items"])

    if not single_path:
        # missing value found, the unique path stops: merge all branches
        total_g = 0.0
        total_h = 0.0
        population = 0
        for child in children:
            child_g, child_h, child_count, _ = boosting_proportional_predict(
                child, fields, input_data, path, True)
            total_g += child_g
            total_h += child_h
            population += child_count
        return (total_g, total_h, population, path)

    for child in children:
        operator, field, value, term, missing = get_predicate(child)
        if not apply_predicate(operator, field, value, term, missing,
                               input_data, fields[field]):
            continue
        new_rule = predicate_to_rule(operator, fields[field], value,
                                     term, missing)
        if new_rule not in path and not missing_found:
            path.append(new_rule)
        return boosting_proportional_predict(
            child, fields, input_data, path, missing_found)

    # NOTE(review): when no child predicate applies nothing is merged and
    # None is returned; confirm callers never reach this case.
    return None
def missing_prefix_code(tree, fields, field, input_map, cmv):
    """Build the missing-value prefix of a split condition.

    Used when the model has been built with missing_splits. The prefix
    reads "<data> is None or " when the predicate of `tree` accepts
    missing values and "<data> is not None and " otherwise. In the second
    case the slug of the field is also appended to `cmv`.

    """
    accepts_missing = get_predicate(tree)[MISSING_OFFSET]
    slug = fields[field]['slug']

    if accepts_missing:
        negation, connection = "", "or"
    else:
        negation, connection = " not", "and"
        cmv.append(slug)

    data_reference = map_data(slug, input_map, True)
    return "%s is%s None %s " % (data_reference, negation, connection)
def __init__(self, tree, offsets):
    """Build the node object from its compact `tree` representation.

    The predicate is kept as is when it is a boolean; otherwise it is
    turned into a `Predicate` whose operator is looked up in INVERSE_OP.
    Every attribute named in `offsets` (except the children ones) is
    copied from the node data, and `children` is set to the list of
    children, or to an empty list when the node has none.

    """
    predicate = get_predicate(tree)
    if not isinstance(predicate, bool):
        operator, field, value, term, _ = predicate
        predicate = Predicate(INVERSE_OP[operator], field, value, term)
    self.predicate = predicate

    node = get_node(tree)
    for name in offsets:
        if name in ("children#", "children"):
            continue
        setattr(self, name, node[offsets[name]])

    if node[offsets["children#"]] == 0:
        self.children = []
    else:
        self.children = node[offsets["children"]]
def generate_rules(tree, offsets, objective_id, fields,
                   depth=0, ids_path=None, subtree=True):
    """Translate a tree model into a set of IF-THEN rules.

    Each child that passes `filter_nodes` contributes an "IF <rule> AND"
    line (or "IF <rule> THEN" when the child has no children), followed
    by the rules of its own subtree one level deeper. A node without
    children contributes a "<objective slug> = <output>" line, where
    "Prediction" replaces the slug when `objective_id` is falsy.

    """
    node = get_node(tree)
    if node[offsets["children#"]] == 0:
        children = []
    else:
        children = node[offsets["children"]]
    children = filter_nodes(children, offsets, ids=ids_path,
                            subtree=subtree)

    indentation = INDENT * depth

    if not children:
        if objective_id:
            target = fields[objective_id]['slug']
        else:
            target = "Prediction"
        return "%s %s = %s\n" % (indentation, target,
                                 node[offsets["output"]])

    chunks = []
    for child in children:
        predicate = get_predicate(child)
        if isinstance(predicate, list):
            operator, field, value, term, missing = predicate
            child_node = get_node(child)
        # NOTE(review): the names above are only bound for list predicates;
        # a child with any other predicate would reuse the values of the
        # previous sibling (or fail on the first one). Confirm that child
        # predicates are always lists.
        rule = predicate_to_rule(operator, fields[field], value,
                                 term, missing, label='slug')
        if child_node[offsets["children#"]] > 0:
            connector = "AND"
        else:
            connector = "THEN"
        chunks.append("%s IF %s %s\n" % (indentation, rule, connector))
        chunks.append(generate_rules(child, offsets, objective_id, fields,
                                     depth + 1, ids_path=ids_path,
                                     subtree=subtree))
    return "".join(chunks)
def split_condition_code(tree, fields, depth, input_map,
                         pre_condition, term_analysis_fields,
                         item_analysis_fields, cmv):
    """Generate the `if` line that checks the split condition of `tree`.

    For text and items fields the condition calls `term_matches` or
    `item_matches` and the (field, term) pair is recorded in
    `term_analysis_fields` or `item_analysis_fields` respectively. For
    any other field the data is compared with the printed value; when
    that value is None the operator comes from MISSING_OPERATOR and the
    slug of the field is appended to `cmv`.

    """
    operation, field, raw_value, term, _ = get_predicate(tree)
    field_info = fields[field]
    optype = field_info['optype']
    value = value_to_print(raw_value, optype)

    if optype == 'text':
        term_analysis_fields.append((field, term))
        matching_function = "term_matches"
    elif optype == 'items':
        item_analysis_fields.append((field, term))
        matching_function = "item_matches"
    else:
        matching_function = None

    indentation = INDENT * depth
    slug = field_info['slug']

    if matching_function is not None:
        data_reference = map_data(slug, input_map, False)
        term_prefix = 'u' if isinstance(term, str) else ''
        return "%sif (%s%s(%s, \"%s\", %s%s) %s %s):\n" % (
            indentation, pre_condition, matching_function,
            data_reference, slug, term_prefix,
            value_to_print(term, 'categorical'),
            PYTHON_OPERATOR[operation], value)

    if value is None:
        operator = MISSING_OPERATOR[operation]
        cmv.append(slug)
    else:
        operator = PYTHON_OPERATOR[operation]
    return "%sif (%s%s %s %s):\n" % (
        indentation, pre_condition,
        map_data(slug, input_map, False), operator, value)
def depth_first_search(tree, path):
    """Search for the leaves' values and instances.

    Extends `path` with the predicate of the current node, records new
    terms in `model.terms`, and registers in `groups` (via
    `add_to_groups`) the leaves found. A node whose children account for
    fewer instances than its own count is also registered, with the
    difference as its number of instances. Children are visited in
    reverse order, each with its own copy of `path`.

    `model`, `groups` and `offsets` are not parameters: they are read
    from the enclosing scope.

    Returns the count of instances of the node.

    """
    node = get_node(tree)
    predicate = get_predicate(tree)
    if isinstance(predicate, list):
        operation, field, value, term, _ = predicate
        path.append(Predicate(INVERSE_OP[operation], field, value, term))
        if term:
            terms = model.terms
            if field not in terms:
                terms[field] = []
            if term not in terms[field]:
                terms[field].append(term)

    def register(instances):
        """Adds the current node to the groups with `instances` items"""
        add_to_groups(groups, node[offsets["output"]],
                      path, instances,
                      node[offsets["confidence"]],
                      gini_impurity(node[offsets["distribution"]],
                                    node[offsets["count"]]))

    if node[offsets["children#"]] == 0:
        register(node[offsets["count"]])
        return node[offsets["count"]]

    covered = sum(depth_first_search(child, path[:])
                  for child in reversed(node[offsets["children"]]))
    if covered < node[offsets["count"]]:
        # instances that stop at this node without reaching any child
        register(node[offsets["count"]] - covered)
    return node[offsets["count"]]
def plug_in_body(tree, offsets, fields, objective_id, regression,
                 depth=1, cmv=None, input_map=False,
                 ids_path=None, subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. As soon as a value is
    missing that node is returned without further evaluation.

    Returns a tuple with the generated code and the lists of
    (field, term) pairs collected for term analysis and item analysis.

    """
    # label for the confidence measure
    metric = "error" if regression else "confidence"
    if cmv is None:
        cmv = []
    term_analysis_fields = []
    item_analysis_fields = []

    node = get_node(tree)
    if node[offsets["children#"]] == 0:
        children = []
    else:
        children = node[offsets["children"]]
    children = filter_nodes(children, offsets,
                            ids=ids_path, subtree=subtree)

    if not children:
        # final node: the code just returns the prediction
        value = value_to_print(node[offsets["output"]],
                               fields[objective_id]['optype'])
        body = "%sreturn {\"prediction\":%s, \"%s\":%s}\n" % (
            INDENT * depth, value, metric, node[offsets["confidence"]])
        return body, term_analysis_fields, item_analysis_fields

    # field used in the split
    split_field = mintree_split(children)
    has_missing_branch = (missing_branch(children) or
                          none_value(children))
    # the missing is singled out as a special case only when there's
    # no missing branch in the children list
    single_branch = (not has_missing_branch or
                     fields[split_field]['optype'] in COMPOSED_FIELDS)

    pieces = []
    if single_branch and fields[split_field]['slug'] not in cmv:
        pieces.append(missing_check_code(tree, offsets, fields,
                                         objective_id, split_field, depth,
                                         input_map, cmv, metric))

    for child in children:
        _, child_field, child_value, _, _ = get_predicate(child)

        # code when missing_splits has been used
        if has_missing_branch and child_value is not None:
            pre_condition = missing_prefix_code(child, fields, child_field,
                                                input_map, cmv)
        else:
            pre_condition = ""

        # complete split condition code
        pieces.append(split_condition_code(
            child, fields, depth, input_map, pre_condition,
            term_analysis_fields, item_analysis_fields, cmv))

        # value to be determined in next node
        next_level = plug_in_body(child, offsets, fields, objective_id,
                                  regression, depth + 1, cmv=cmv[:],
                                  input_map=input_map, ids_path=ids_path,
                                  subtree=subtree)
        pieces.append(next_level[0])
        term_analysis_fields.extend(next_level[1])
        item_analysis_fields.extend(next_level[2])

    return "".join(pieces), term_analysis_fields, item_analysis_fields
def tableau_body(tree, offsets, fields, objective_id,
                 body="", conditions=None, cmv=None,
                 ids_path=None, subtree=True, attr=DFT_ATTR):
    """Translate the model into a set of "if" statements in Tableau syntax.

    Each leaf adds an "IF <conditions> THEN <value>" clause ("ELSEIF"
    once `body` is not empty), where the conditions are the ones gathered
    on the way down joined by " AND ". As soon as a value is missing that
    node is returned without further evaluation.

    `body` is the code generated so far, `conditions` the list of
    conditions that lead to `tree`, and `cmv` the names of the fields
    whose missing values have already been handled. `attr` selects the
    node attribute used as the value of each clause.

    Returns the extended body, or an empty string when a split on a text
    or items field with a non-missing value is found.

    """
    if cmv is None:
        cmv = []
    # `conditions` is needed whatever the value of `body` is
    if conditions is None:
        conditions = []
    alternate = "ELSEIF" if body else "IF"

    node = get_node(tree)
    children_number = node[offsets["children#"]]
    children = [] if children_number == 0 else node[offsets["children"]]
    children = filter_nodes(children, offsets,
                            ids=ids_path, subtree=subtree)

    if not children:
        # The value is always read from the node data, the same structure
        # that `offsets` indexes everywhere else in this function.
        if fields[objective_id]['optype'] == 'numeric':
            value = node[offsets[attr]]
        else:
            value = tableau_string(node[offsets[attr]])
        body += ("%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += " %s\n" % value
        return body

    [_, field, _, _, _] = get_predicate(children[0])
    has_missing_branch = (missing_branch(children) or
                          none_value(children))
    # the missing is singled out as a special case only when there's
    # no missing branch in the children list
    if (not has_missing_branch and
            fields[field]['name'] not in cmv):
        conditions.append("ISNULL([%s])" % fields[field]['name'])
        body += ("%s %s THEN " %
                 (alternate, " AND ".join(conditions)))
        if fields[objective_id]['optype'] == 'numeric':
            value = node[offsets[attr]]
        else:
            value = tableau_string(node[offsets[attr]])
        body += ("%s\n" % value)
        cmv.append(fields[field]['name'])
        alternate = "ELSEIF"
        del conditions[-1]

    for child in children:
        pre_condition = ""
        post_condition = ""
        [operator, field, ch_value, _, missing] = get_predicate(child)
        if has_missing_branch and ch_value is not None:
            negation = "" if missing else "NOT "
            connection = "OR" if missing else "AND"
            pre_condition = (
                "(%sISNULL([%s]) %s " % (
                    negation, fields[field]['name'], connection))
            if not missing:
                cmv.append(fields[field]['name'])
            post_condition = ")"

        optype = fields[field]['optype']
        if ch_value is None:
            # Missing-value split: the condition is built from the
            # predicate's own operator, and no operator or value is
            # printed after the field name.
            value = ""
            operator_text = ""
            pre_condition = T_MISSING_OPERATOR[operator]
            post_condition = ")"
        elif optype in ['text', 'items']:
            return ""
        else:
            value = ch_value if optype == 'numeric' else repr(ch_value)
            operator_text = PYTHON_OPERATOR[operator]

        conditions.append("%s[%s]%s%s%s" % (
            pre_condition, fields[field]['name'], operator_text, value,
            post_condition))
        body = tableau_body(child, offsets, fields, objective_id,
                            body, conditions[:], cmv=cmv[:],
                            ids_path=ids_path, subtree=subtree, attr=attr)
        del conditions[-1]

    return body