def one_branch(children, input_data):
    """Check if there's only one branch to be followed.

    The result is truthy when the field used by the split of `children`
    is a key of `input_data`, or when `missing_branch(children)` or
    `none_value(children)` is truthy.
    """
    # the splitting field has a value in the input data
    if split(children) in input_data:
        return True
    return missing_branch(children) or none_value(children)
def predict_proportional(self, input_data, path=None):
    """Makes a prediction based on a number of field values averaging
    the predictions of the leaves that fall in a subtree.

    Each time a splitting field has no value assigned, we consider
    both branches of the split to be true, merging their predictions.

    `path` accumulates the rules of the predicates that were followed
    (a new list is created when it is not given).
    """
    if path is None:
        path = []

    # leaf node: its own distribution is the prediction
    if not self.children:
        leaf_counts = {entry[0]: entry[1] for entry in self.distribution}
        return merge_distributions({}, leaf_counts)

    if split(self.children) not in input_data:
        # no value for the splitting field: merge what every child predicts
        merged = {}
        for child in self.children:
            merged = merge_distributions(
                merged, child.predict_proportional(input_data, path))
        return merged

    for child in self.children:
        if not child.predicate.apply(input_data, self.fields):
            continue
        rule = child.predicate.to_rule(self.fields)
        if rule not in path:
            path.append(rule)
        return child.predict_proportional(input_data, path)
    # the splitting field is informed but no child predicate matched
    return None
def predict(self, input_data, path=None):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id.

    `path` collects a "<field name> <operator> <value>" string for every
    predicate that is followed; a new list is created when it is not given.

    Returns a tuple (output, path, confidence, distribution, instances)
    taken from the last node reached, where `instances` is the sum of the
    second element of every entry in that node's distribution (0 when the
    distribution is empty or None).
    """
    def get_instances(distribution):
        """Returns the total number of instances in a distribution"""
        return sum(x[1] for x in distribution) if distribution else 0

    if path is None:
        path = []
    if self.children and split(self.children) in input_data:
        for child in self.children:
            predicate = child.predicate
            # call the operator directly: the `apply` builtin is deprecated
            # and no longer exists in Python 3
            if OPERATOR[predicate.operator](input_data[predicate.field],
                                            predicate.value):
                path.append(u"%s %s %s" % (
                    self.fields[predicate.field]['name'],
                    predicate.operator,
                    predicate.value))
                return child.predict(input_data, path)
    # leaf, missing splitting field or no matching child: answer from here
    return (self.output, path, self.confidence,
            self.distribution, get_instances(self.distribution))
def python_body(self, depth=1, cmv=False):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. If `cmv` (control missing
    values) is set to True then as soon as a value is missing to
    evaluate a predicate the output at that node is returned without
    further evaluation.

    Returns the generated source as a string. A node without children
    contributes a single `return` statement with its output.
    """
    def return_line(level):
        """Builds the `return` statement for the output of this node."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return "%s return %s\n" % (INDENT * level, self.output)
        return "%s return '%s'\n" % (INDENT * level, self.output)

    # leaf: the generated `if` of the parent needs a body
    if not self.children:
        return return_line(depth)

    body = ""
    if cmv:
        field = split(self.children)
        body += ("%sif (%s is None):\n " %
                 (INDENT * depth, self.fields[field]['slug']))
        body += return_line(depth + 1)
    for child in self.children:
        predicate = child.predicate
        body += ("%sif (%s %s %s):\n" %
                 (INDENT * depth,
                  self.fields[predicate.field]['slug'],
                  PYTHON_OPERATOR[predicate.operator],
                  repr(predicate.value)))
        # keep the missing values control active in the whole subtree
        body += child.python_body(depth + 1, cmv=cmv)
    # the accumulated text must be returned: callers concatenate it
    return body
def tableau_body(self, body=u"", conditions=None, cmv=None,
                 ids_path=None, subtree=True):
    """Translate the model into a set of "if" statements in Tableau syntax.

    `body` is the text generated so far and `conditions` the list of
    conditions that lead to this node. `cmv` holds the names of the fields
    whose ISNULL check has already been emitted. As soon as a value is
    missing that node is returned without further evaluation.

    Returns the extended body, or an empty string when a child splits
    on a field whose optype is 'text'.
    """
    def objective_value():
        """Output of this node as it is written in the statement."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return self.output
        return tableau_string(self.output)

    if cmv is None:
        cmv = []
    keyword = u"ELSEIF"
    if conditions is None:
        # first statement of the expression
        conditions = []
        keyword = u"IF"

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        leaf_value = objective_value()
        body += u"%s %s THEN" % (keyword, " AND ".join(conditions))
        body += u" %s\n" % leaf_value
        return body

    split_name = self.fields[split(nodes)]['name']
    if split_name not in cmv:
        # missing value for the split: this node's output is the answer
        conditions.append("ISNULL([%s])" % split_name)
        body += u"%s %s THEN " % (keyword, " AND ".join(conditions))
        body += u"%s\n" % objective_value()
        cmv.append(split_name)
        conditions.pop()

    for child in nodes:
        predicate = child.predicate
        field_info = self.fields[predicate.field]
        if field_info['optype'] == 'text':
            return u""
        if field_info['optype'] == 'numeric':
            threshold = predicate.value
        else:
            threshold = repr(predicate.value)
        conditions.append("[%s]%s%s" % (
            field_info['name'],
            PYTHON_OPERATOR[predicate.operator],
            threshold))
        # children work on copies, so siblings do not see each other's state
        body = child.tableau_body(body, conditions[:], cmv=cmv[:],
                                  ids_path=ids_path, subtree=subtree)
        conditions.pop()
    return body
def predict_proportional(self, input_data, path=None,
                         missing_found=False, median=False, parent=None):
    """Makes a prediction based on a number of field values averaging
    the predictions of the leaves that fall in a subtree.

    Each time a splitting field has no value assigned, we consider
    both branches of the split to be true, merging their predictions.
    The function returns a tuple (distribution, minimum, maximum, node,
    population, parent): the merged distribution and the last node
    reached by a unique path.

    `median` is only handed over to the recursive calls.
    """
    if path is None:
        path = []

    if not self.children:
        if self.weighted:
            leaf_distribution = self.weighted_distribution
        else:
            leaf_distribution = self.distribution
        leaf_counts = dict((item[0], item[1]) for item in leaf_distribution)
        return (merge_distributions({}, leaf_counts),
                self.min, self.max, self, self.count, parent)

    single_path = (
        one_branch(self.children, input_data) or
        self.fields[split(self.children)]["optype"] in ["text", "items"])

    if not single_path:
        # missing value found, the unique path stops
        merged = {}
        lower_bounds = []
        upper_bounds = []
        population = 0
        for child in self.children:
            (child_distribution, child_min, child_max, _, child_pop, _) = \
                child.predict_proportional(input_data, path, True, median,
                                           parent=self)
            if child_min is not None:
                lower_bounds.append(child_min)
            if child_max is not None:
                upper_bounds.append(child_max)
            population += child_pop
            merged = merge_distributions(merged, child_distribution)
        lowest = min(lower_bounds) if lower_bounds else None
        highest = max(upper_bounds) if upper_bounds else None
        return (merged, lowest, highest, self, population, self)

    for child in self.children:
        if not child.predicate.apply(input_data, self.fields):
            continue
        rule = child.predicate.to_rule(self.fields)
        # rules are only recorded while the path is still unique
        if rule not in path and not missing_found:
            path.append(rule)
        return child.predict_proportional(input_data, path, missing_found,
                                          median, parent=self)
    # a single branch was expected but no child predicate matched
    return None
def python_body(self, depth=1, cmv=None, input_map=False):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. `cmv` lists the slugs of
    the fields whose missing-value check has already been written. As soon
    as a value is missing that node is returned without further evaluation.

    Returns the generated source as a unicode string.
    """
    def map_data(field, missing=False):
        """Returns the subject of the condition in map format when
        more than MAX_ARGS_LENGTH arguments are used.
        """
        if not input_map:
            return field
        if missing:
            return "not '%s' in data or data['%s']" % (field, field)
        return "data['%s']" % field

    def output_literal():
        """Output of this node as it is written in the generated code."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return self.output
        return repr(self.output)

    if cmv is None:
        cmv = []

    # leaf: just return the output
    if not self.children:
        return u"%sreturn %s\n" % (INDENT * depth, output_literal())

    code = u""
    split_slug = self.fields[split(self.children)]['slug']
    if split_slug not in cmv:
        code += u"%sif (%s is None):\n" % (INDENT * depth,
                                           map_data(split_slug, True))
        code += u"%sreturn %s\n" % (INDENT * (depth + 1), output_literal())
        cmv.append(split_slug)

    for child in self.children:
        predicate = child.predicate
        field_info = self.fields[predicate.field]
        if field_info['optype'] == 'numeric':
            threshold = predicate.value
        else:
            threshold = repr(predicate.value)
        code += u"%sif (%s %s %s):\n" % (
            INDENT * depth,
            map_data(field_info['slug'], False),
            PYTHON_OPERATOR[predicate.operator],
            threshold)
        # every child gets its own copy of the checked fields
        code += child.python_body(depth + 1, cmv=cmv[:],
                                  input_map=input_map)
    return code
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both.
            The algorithm goes on until the final leaves are reached
            and all their predictions are used to decide the final
            prediction.

    Returns a tuple (prediction, path, confidence, distribution,
    instances).
    """
    if path is None:
        path = []

    if missing_strategy != PROPORTIONAL:
        # walk down while the splitting field is informed
        if self.children and split(self.children) in input_data:
            for child in self.children:
                if not child.predicate.apply(input_data, self.fields):
                    continue
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path)
        return (self.output, path, self.confidence,
                self.distribution, get_instances(self.distribution))

    merged = self.predict_proportional(input_data, path=path)

    if self.regression:
        # sort elements by their mean
        bins = [list(pair)
                for pair in sorted(merged.items(), key=lambda x: x[0])]
        bins = merge_bins(bins, BINS_LIMIT)
        prediction = mean(bins)
        total_instances = sum(instances for _, instances in bins)
        confidence = regression_error(
            unbiased_sample_variance(bins, prediction), total_instances)
        return (prediction, path, confidence, bins, total_instances)

    # most frequent category first, ties broken by category value
    ranking = [list(pair)
               for pair in sorted(merged.items(),
                                  key=lambda x: (-x[1], x[0]))]
    winner = ranking[0][0]
    return (winner, path, ws_confidence(winner, merged),
            ranking, get_instances(ranking))
def predict(self, input_data, path=None, missing_strategy=LAST_PREDICTION):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id. There are two possible
    strategies to predict when the value for the splitting field
    is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
            in the tree that stem from this split, we consider both.
            The algorithm goes on until the final leaves are reached
            and all their predictions are used to decide the final
            prediction.

    Returns a tuple (prediction, path, confidence, distribution,
    instances).
    """
    if path is None:
        path = []

    if missing_strategy != PROPORTIONAL:
        # walk down while the splitting field is informed
        if self.children and split(self.children) in input_data:
            for child in self.children:
                if not child.predicate.apply(input_data, self.fields):
                    continue
                path.append(child.predicate.to_rule(self.fields))
                return child.predict(input_data, path)
        return (self.output, path, self.confidence,
                self.distribution, get_instances(self.distribution))

    merged = self.predict_proportional(input_data, path=path)

    if self.regression:
        # sort elements by their mean
        bins = [list(pair)
                for pair in sorted(merged.items(), key=lambda x: x[0])]
        bins = merge_bins(bins, BINS_LIMIT)
        prediction = mean(bins)
        total_instances = sum(instances for _, instances in bins)
        confidence = regression_error(
            unbiased_sample_variance(bins, prediction), total_instances)
        return (prediction, path, confidence, bins, total_instances)

    # most frequent category first, ties broken by category value
    ranking = [list(pair)
               for pair in sorted(merged.items(),
                                  key=lambda x: (-x[1], x[0]))]
    winner = ranking[0][0]
    return (winner, path, ws_confidence(winner, merged),
            ranking, get_instances(ranking))
def predict(self, input_data, path=None):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id.

    `path` collects a "<field name> <operator> <value>" string for every
    predicate that is followed. When it is not given a new list is
    created on each call, so predictions never share their path.

    Returns a tuple (output, path) taken from the last node reached.
    """
    if path is None:
        path = []
    if self.children and split(self.children) in input_data:
        for child in self.children:
            predicate = child.predicate
            # call the operator directly: the `apply` builtin is deprecated
            # and no longer exists in Python 3
            if OPERATOR[predicate.operator](input_data[predicate.field],
                                            predicate.value):
                path.append("%s %s %s" % (
                    self.fields[predicate.field]['name'],
                    predicate.operator,
                    predicate.value))
                return child.predict(input_data, path)
    # leaf, missing splitting field or no matching child: answer from here
    # instead of silently returning None
    return self.output, path
def predict_proportional(self, input_data, path=None, missing_found=False):
    """Makes a prediction based on a number of field values considering
    all the predictions of the leaves that fall in a subtree.

    Each time a splitting field has no value assigned, we consider
    both branches of the split to be true, merging their predictions.
    The function returns a tuple (g_sum, h_sum, count, path) with the
    accumulated sums and the rules followed through a unique path.
    """
    if path is None:
        path = []

    # leaf: its own sums are the answer
    if not self.children:
        return (self.g_sum, self.h_sum, self.count, path)

    single_path = (
        one_branch(self.children, input_data) or
        self.fields[split(self.children)]["optype"] in ["text", "items"])

    if not single_path:
        # missing value found, the unique path stops
        total_g = 0.0
        total_h = 0.0
        total_count = 0
        for child in self.children:
            child_g, child_h, child_count, _ = \
                child.predict_proportional(input_data, path, True)
            total_g += child_g
            total_h += child_h
            total_count += child_count
        return (total_g, total_h, total_count, path)

    for child in self.children:
        if not child.predicate.apply(input_data, self.fields):
            continue
        rule = child.predicate.to_rule(self.fields)
        # rules are only recorded while the path is still unique
        if not missing_found and rule not in path:
            path.append(rule)
        return child.predict_proportional(input_data, path, missing_found)
    # a single branch was expected but no child predicate matched
    return None
def predict(self, input_data, path=None):
    """Makes a prediction based on a number of field values.

    The input fields must be keyed by Id.

    Returns a tuple (output, path, confidence, distribution, instances)
    taken from the last node reached; `path` holds the rules of the
    predicates that were followed.
    """
    if path is None:
        path = []

    if self.children and split(self.children) in input_data:
        for child in self.children:
            if not child.predicate.apply(input_data, self.fields):
                continue
            path.append(child.predicate.to_rule(self.fields))
            return child.predict(input_data, path)

    # total number of instances in the distribution of this node
    return (self.output,
            path,
            self.confidence,
            self.distribution,
            (sum(entry[1] for entry in self.distribution)
             if self.distribution else 0))
def python_body(self, depth=1, cmv=None, input_map=False, ids_path=None, subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. As soon as a value is
    missing that node is returned without further evaluation.

    `cmv` is the list of field slugs for which a missing-value check has
    already been written (or is implied by a followed predicate);
    `input_map` selects `data[...]` access instead of bare variable names;
    `ids_path` and `subtree` are handed to `filter_nodes` to select the
    children that are translated.

    Returns a tuple (body, term_analysis_fields): the generated source and
    the list of (field, term) pairs used in `term_matches` conditions.
    """
    def map_data(field, missing=False):
        """Returns the subject of the condition in map format when
        more than MAX_ARGS_LENGTH arguments are used.
        """
        if input_map:
            if missing:
                return "data.get('%s')" % field
            else:
                return "data['%s']" % field
        return field
    if cmv is None:
        cmv = []
    body = u""
    term_analysis_fields = []
    children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if children:
        field = split(children)
        has_missing_branch = (missing_branch(children) or none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and not self.fields[field]['slug'] in cmv):
            body += (u"%sif (%s is None):\n" % (INDENT * depth, map_data(self.fields[field]['slug'], True)))
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = repr(self.output)
            body += (u"%sreturn %s\n" % (INDENT * (depth + 1), value))
            cmv.append(self.fields[field]['slug'])
        for child in children:
            field = child.predicate.field
            pre_condition = u""
            # the predicate carries its own handling of missing values:
            # prepend an explicit None test to the condition
            if has_missing_branch and child.predicate.value is not None:
                negation = u"" if child.predicate.missing else u" not"
                connection = u"or" if child.predicate.missing else u"and"
                pre_condition = (u"%s is%s None %s " % (map_data(self.fields[field]['slug'], True), negation, connection))
                if not child.predicate.missing:
                    # NOTE: `cmv` is shared by the following siblings, which
                    # receive a copy that already contains this slug
                    cmv.append(self.fields[field]['slug'])
            optype = self.fields[field]['optype']
            if (optype == 'numeric' or optype == 'text' or child.predicate.value is None):
                value = child.predicate.value
            else:
                value = repr(child.predicate.value)
            if optype == 'text':
                # text fields are compared through the number of matches of
                # a term, computed by `term_matches` in the generated code
                body += (
                    u"%sif (%sterm_matches(%s, \"%s\", %s\"%s\") %s %s):"
                    u"\n" % (INDENT * depth, pre_condition,
                             map_data(self.fields[field]['slug'], False),
                             self.fields[field]['slug'],
                             ('u' if isinstance(child.predicate.term, unicode) else ''),
                             child.predicate.term.replace("\"", "\\\""),
                             PYTHON_OPERATOR[child.predicate.operator],
                             value))
                term_analysis_fields.append((field, child.predicate.term))
            else:
                # predicates whose value is None use the operators in
                # MISSING_OPERATOR instead of the regular ones
                operator = (MISSING_OPERATOR[child.predicate.operator] if child.predicate.value is None else PYTHON_OPERATOR[child.predicate.operator])
                if child.predicate.value is None:
                    cmv.append(self.fields[field]['slug'])
                body += (u"%sif (%s%s %s %s):\n" % (INDENT * depth, pre_condition, map_data(self.fields[field]['slug'], False), operator, value))
            # the child works on a copy of `cmv`
            next_level = child.python_body(depth + 1, cmv=cmv[:], input_map=input_map, ids_path=ids_path, subtree=subtree)
            body += next_level[0]
            term_analysis_fields.extend(next_level[1])
    else:
        # no children to translate: return the output of this node
        if self.fields[self.objective_id]['optype'] == 'numeric':
            value = self.output
        else:
            value = repr(self.output)
        body = u"%sreturn %s\n" % (INDENT * depth, value)
    return body, term_analysis_fields
def tableau_body(self, body=u"", conditions=None, cmv=None,
                 ids_path=None, subtree=True):
    """Translate the model into a set of "if" statements in Tableau syntax.

    `body` is the text generated so far: the statement starts with IF
    when it is empty and with ELSEIF otherwise. `conditions` is the list
    of conditions that lead to this node and `cmv` the names of the fields
    whose ISNULL check has already been emitted. `ids_path` and `subtree`
    are handed to `filter_nodes` to select the children to translate.
    As soon as a value is missing that node is returned without further
    evaluation.

    Returns the extended body, or an empty string when a child splits
    on a 'text' field with a non-None value.
    """
    if cmv is None:
        cmv = []
    # `conditions` must be a list whatever `body` holds: it used to stay
    # None (and crash below) when only a non-empty body was given
    if conditions is None:
        conditions = []
    alternate = u"ELSEIF" if body else u"IF"

    children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if children:
        field = split(children)
        has_missing_branch = (missing_branch(children) or
                              none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and
                not self.fields[field]['name'] in cmv):
            conditions.append("ISNULL([%s])" % self.fields[field]['name'])
            body += (u"%s %s THEN " %
                     (alternate, " AND ".join(conditions)))
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = tableau_string(self.output)
            body += (u"%s\n" % value)
            cmv.append(self.fields[field]['name'])
            alternate = u"ELSEIF"
            # the ISNULL condition only applies to the statement above
            del conditions[-1]

        for child in children:
            pre_condition = u""
            post_condition = u""
            if has_missing_branch and child.predicate.value is not None:
                negation = u"" if child.predicate.missing else u"NOT "
                connection = u"OR" if child.predicate.missing else u"AND"
                pre_condition = (
                    u"(%sISNULL([%s]) %s " % (
                        negation, self.fields[field]['name'], connection))
                if not child.predicate.missing:
                    cmv.append(self.fields[field]['name'])
                post_condition = u")"
            optype = self.fields[child.predicate.field]['optype']
            if child.predicate.value is None:
                value = ""
            elif optype == 'text':
                return u""
            elif optype == 'numeric':
                value = child.predicate.value
            else:
                value = repr(child.predicate.value)

            operator = ("" if child.predicate.value is None else
                        PYTHON_OPERATOR[child.predicate.operator])
            if child.predicate.value is None:
                # predicates on None are written with the prefix found
                # in T_MISSING_OPERATOR and closed with a parenthesis
                pre_condition = (
                    T_MISSING_OPERATOR[child.predicate.operator])
                post_condition = ")"

            conditions.append("%s[%s]%s%s%s" % (
                pre_condition,
                self.fields[child.predicate.field]['name'],
                operator,
                value,
                post_condition))
            # children work on copies, so siblings do not see their state
            body = child.tableau_body(body, conditions[:], cmv=cmv[:],
                                      ids_path=ids_path, subtree=subtree)
            del conditions[-1]
    else:
        if self.fields[self.objective_id]['optype'] == 'numeric':
            value = self.output
        else:
            value = tableau_string(self.output)
        body += (
            u"%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += u" %s\n" % value

    return body
def tableau_body(self, body=u"", conditions=None, cmv=None,
                 ids_path=None, subtree=True):
    """Translate the model into a set of "if" statements in Tableau syntax.

    `body` is the text generated so far: the statement starts with IF
    when it is empty and with ELSEIF otherwise. `conditions` is the list
    of conditions that lead to this node and `cmv` the names of the fields
    whose ISNULL check has already been emitted. `ids_path` and `subtree`
    are handed to `filter_nodes` to select the children to translate.
    As soon as a value is missing that node is returned without further
    evaluation.

    Returns the extended body, or an empty string when a child splits
    on a 'text' field with a non-None value.
    """
    if cmv is None:
        cmv = []
    # `conditions` must be a list whatever `body` holds: it used to stay
    # None (and crash below) when only a non-empty body was given
    if conditions is None:
        conditions = []
    alternate = u"ELSEIF" if body else u"IF"

    children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if children:
        field = split(children)
        has_missing_branch = (missing_branch(children) or
                              none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and
                not self.fields[field]['name'] in cmv):
            conditions.append("ISNULL([%s])" % self.fields[field]['name'])
            body += (u"%s %s THEN " %
                     (alternate, " AND ".join(conditions)))
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = tableau_string(self.output)
            body += (u"%s\n" % value)
            cmv.append(self.fields[field]['name'])
            alternate = u"ELSEIF"
            # the ISNULL condition only applies to the statement above
            del conditions[-1]

        for child in children:
            pre_condition = u""
            post_condition = u""
            if has_missing_branch and child.predicate.value is not None:
                negation = u"" if child.predicate.missing else u"NOT "
                connection = u"OR" if child.predicate.missing else u"AND"
                pre_condition = (
                    u"(%sISNULL([%s]) %s " % (
                        negation, self.fields[field]['name'], connection))
                if not child.predicate.missing:
                    cmv.append(self.fields[field]['name'])
                post_condition = u")"
            optype = self.fields[child.predicate.field]['optype']
            if child.predicate.value is None:
                value = ""
            elif optype == 'text':
                return u""
            elif optype == 'numeric':
                value = child.predicate.value
            else:
                value = repr(child.predicate.value)

            operator = ("" if child.predicate.value is None else
                        PYTHON_OPERATOR[child.predicate.operator])
            if child.predicate.value is None:
                # predicates on None are written with the prefix found
                # in T_MISSING_OPERATOR and closed with a parenthesis
                pre_condition = (
                    T_MISSING_OPERATOR[child.predicate.operator])
                post_condition = ")"

            conditions.append("%s[%s]%s%s%s" % (
                pre_condition,
                self.fields[child.predicate.field]['name'],
                operator,
                value,
                post_condition))
            # children work on copies, so siblings do not see their state
            body = child.tableau_body(body, conditions[:], cmv=cmv[:],
                                      ids_path=ids_path, subtree=subtree)
            del conditions[-1]
    else:
        if self.fields[self.objective_id]['optype'] == 'numeric':
            value = self.output
        else:
            value = tableau_string(self.output)
        body += (
            u"%s %s THEN" % (alternate, " AND ".join(conditions)))
        body += u" %s\n" % value

    return body
def plug_in_body(self, depth=1, cmv=None, ids_path=None, subtree=True):
    """Translate the model into a set of "if" javascript statements.

    `depth` controls the size of indentation. As soon as a value is
    missing to evaluate a predicate the output at that node is returned
    without further evaluation.

    Returns a tuple (body, term_analysis_fields, item_analysis_fields).
    """
    metric = "confidence"
    if self.regression:
        metric = "error"
    if cmv is None:
        cmv = []
    term_analysis_fields = []
    item_analysis_fields = []
    objective_info = self.fields[self.objective_id]
    # fields are read from the `data` object when there are too many
    prefix = u"data." if len(self.fields) > MAX_ARGS_LENGTH else u""

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        value = value_to_print(self.output, objective_info['optype'])
        body = u"%sreturn {prediction: %s, %s: %s};\n" % (
            INDENT * depth, value, metric, self.confidence)
        return body, term_analysis_fields, item_analysis_fields

    body = u""
    # field used in the split
    field = split(nodes)
    has_missing_branch = missing_branch(nodes)
    # the missing is singled out as a special case only when there's
    # no missing branch in the children list
    check_missing = (not has_missing_branch or
                     self.fields[field]['optype'] in COMPOSED_FIELDS)
    if check_missing and self.fields[field]['camelCase'] not in cmv:
        body += self.missing_check_code(field, depth, prefix, cmv, metric)

    for child in nodes:
        field = child.predicate.field
        # code when missing_splits has been used
        if has_missing_branch and child.predicate.value is not None:
            pre_condition = self.missing_prefix_code(child, field,
                                                     prefix, cmv)
        else:
            pre_condition = u""
        # complete split condition code
        body += child.split_condition_code(
            field, depth, prefix, pre_condition,
            term_analysis_fields, item_analysis_fields)
        # value to be determined in next node
        subtree_code = child.plug_in_body(depth + 1, cmv=cmv[:],
                                          ids_path=ids_path,
                                          subtree=subtree)
        body += subtree_code[0]
        body += u"%s}\n" % (INDENT * depth)
        term_analysis_fields.extend(subtree_code[1])
        item_analysis_fields.extend(subtree_code[2])
    return body, term_analysis_fields, item_analysis_fields
def python_body(self, depth=1, cmv=None, input_map=False, ids_path=None, subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. As soon as a value is
    missing that node is returned without further evaluation.

    `cmv` is the list of field slugs for which a missing-value check has
    already been written (or is implied by a followed predicate);
    `input_map` selects `data[...]` access instead of bare variable names;
    `ids_path` and `subtree` are handed to `filter_nodes` to select the
    children that are translated.

    Returns a tuple (body, term_analysis_fields): the generated source and
    the list of (field, term) pairs used in `term_matches` conditions.
    """
    def map_data(field, missing=False):
        """Returns the subject of the condition in map format when
        more than MAX_ARGS_LENGTH arguments are used.
        """
        if input_map:
            if missing:
                return "data.get('%s')" % field
            else:
                return "data['%s']" % field
        return field
    if cmv is None:
        cmv = []
    body = u""
    term_analysis_fields = []
    children = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if children:
        field = split(children)
        has_missing_branch = (missing_branch(children) or none_value(children))
        # the missing is singled out as a special case only when there's
        # no missing branch in the children list
        if (not has_missing_branch and not self.fields[field]['slug'] in cmv):
            body += (u"%sif (%s is None):\n" % (INDENT * depth, map_data(self.fields[field]['slug'], True)))
            if self.fields[self.objective_id]['optype'] == 'numeric':
                value = self.output
            else:
                value = repr(self.output)
            body += (u"%sreturn %s\n" % (INDENT * (depth + 1), value))
            cmv.append(self.fields[field]['slug'])
        for child in children:
            field = child.predicate.field
            pre_condition = u""
            # the predicate carries its own handling of missing values:
            # prepend an explicit None test to the condition
            if has_missing_branch and child.predicate.value is not None:
                negation = u"" if child.predicate.missing else u" not"
                connection = u"or" if child.predicate.missing else u"and"
                pre_condition = (
                    u"%s is%s None %s " % (
                        map_data(self.fields[field]['slug'], True), negation, connection))
                if not child.predicate.missing:
                    # NOTE: `cmv` is shared by the following siblings, which
                    # receive a copy that already contains this slug
                    cmv.append(self.fields[field]['slug'])
            optype = self.fields[field]['optype']
            if (optype == 'numeric' or optype == 'text' or child.predicate.value is None):
                value = child.predicate.value
            else:
                value = repr(child.predicate.value)
            if optype == 'text':
                # text fields are compared through the number of matches of
                # a term, computed by `term_matches` in the generated code
                body += (
                    u"%sif (%sterm_matches(%s, \"%s\", %s\"%s\") %s %s):"
                    u"\n" % (INDENT * depth, pre_condition,
                             map_data(self.fields[field]['slug'], False),
                             self.fields[field]['slug'],
                             ('u' if isinstance(child.predicate.term, unicode) else ''),
                             child.predicate.term.replace("\"", "\\\""),
                             PYTHON_OPERATOR[child.predicate.operator],
                             value))
                term_analysis_fields.append((field, child.predicate.term))
            else:
                # predicates whose value is None use the operators in
                # MISSING_OPERATOR instead of the regular ones
                operator = (MISSING_OPERATOR[child.predicate.operator] if child.predicate.value is None else PYTHON_OPERATOR[child.predicate.operator])
                if child.predicate.value is None:
                    cmv.append(self.fields[field]['slug'])
                body += (
                    u"%sif (%s%s %s %s):\n" % (INDENT * depth, pre_condition, map_data(self.fields[field]['slug'], False), operator, value))
            # the child works on a copy of `cmv`
            next_level = child.python_body(depth + 1, cmv=cmv[:], input_map=input_map, ids_path=ids_path, subtree=subtree)
            body += next_level[0]
            term_analysis_fields.extend(next_level[1])
    else:
        # no children to translate: return the output of this node
        if self.fields[self.objective_id]['optype'] == 'numeric':
            value = self.output
        else:
            value = repr(self.output)
        body = u"%sreturn %s\n" % (INDENT * depth, value)
    return body, term_analysis_fields
def plug_in_body(self, depth=1, cmv=None, ids_path=None, subtree=True):
    """Translate the model into a set of "if" javascript statements.

    `depth` controls the size of indentation. As soon as a value is
    missing to evaluate a predicate the output at that node is returned
    without further evaluation.

    Returns a tuple (body, term_analysis_fields, item_analysis_fields).
    """
    metric = "confidence"
    if self.regression:
        metric = "error"
    if cmv is None:
        cmv = []
    term_analysis_fields = []
    item_analysis_fields = []
    objective_info = self.fields[self.objective_id]
    # fields are read from the `data` object when there are too many
    prefix = u"data." if len(self.fields) > MAX_ARGS_LENGTH else u""

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        value = value_to_print(self.output, objective_info['optype'])
        body = u"%sreturn {prediction: %s, %s: %s};\n" % (
            INDENT * depth, value, metric, self.confidence)
        return body, term_analysis_fields, item_analysis_fields

    body = u""
    # field used in the split
    field = split(nodes)
    has_missing_branch = missing_branch(nodes)
    # the missing is singled out as a special case only when there's
    # no missing branch in the children list
    check_missing = (not has_missing_branch or
                     self.fields[field]['optype'] in COMPOSED_FIELDS)
    if check_missing and self.fields[field]['camelCase'] not in cmv:
        body += self.missing_check_code(field, depth, prefix, cmv, metric)

    for child in nodes:
        field = child.predicate.field
        # code when missing_splits has been used
        if has_missing_branch and child.predicate.value is not None:
            pre_condition = self.missing_prefix_code(child, field,
                                                     prefix, cmv)
        else:
            pre_condition = u""
        # complete split condition code
        body += child.split_condition_code(
            field, depth, prefix, pre_condition,
            term_analysis_fields, item_analysis_fields)
        # value to be determined in next node
        subtree_code = child.plug_in_body(depth + 1, cmv=cmv[:],
                                          ids_path=ids_path,
                                          subtree=subtree)
        body += subtree_code[0]
        body += u"%s}\n" % (INDENT * depth)
        term_analysis_fields.extend(subtree_code[1])
        item_analysis_fields.extend(subtree_code[2])
    return body, term_analysis_fields, item_analysis_fields
def python_body(self, depth=1, cmv=None, input_map=False,
                ids_path=None, subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. `cmv` lists the slugs of
    the fields whose missing-value check has already been written. As soon
    as a value is missing that node is returned without further evaluation.

    Returns a tuple (body, term_analysis_fields): the generated source and
    the list of (field, term) pairs used in `term_matches` conditions.
    """
    def map_data(field, missing=False):
        """Returns the subject of the condition in map format when
        more than MAX_ARGS_LENGTH arguments are used.
        """
        if not input_map:
            return field
        if missing:
            return "not '%s' in data or data['%s']" % (field, field)
        return "data['%s']" % field

    def output_literal():
        """Output of this node as it is written in the generated code."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return self.output
        return repr(self.output)

    if cmv is None:
        cmv = []
    term_analysis_fields = []

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        code = u"%sreturn %s\n" % (INDENT * depth, output_literal())
        return code, term_analysis_fields

    code = u""
    split_slug = self.fields[split(nodes)]['slug']
    if split_slug not in cmv:
        code += u"%sif (%s is None):\n" % (INDENT * depth,
                                           map_data(split_slug, True))
        code += u"%sreturn %s\n" % (INDENT * (depth + 1), output_literal())
        cmv.append(split_slug)

    for child in nodes:
        predicate = child.predicate
        field_info = self.fields[predicate.field]
        optype = field_info['optype']
        if optype in ('numeric', 'text'):
            threshold = predicate.value
        else:
            threshold = repr(predicate.value)

        if optype == 'text':
            # text fields are compared through the matches of a term
            unicode_flag = 'u' if isinstance(predicate.term, unicode) else ''
            code += (
                u"%sif (term_matches(%s, \"%s\", %s\"%s\") %s %s):\n" %
                (INDENT * depth,
                 map_data(field_info['slug'], False),
                 field_info['slug'],
                 unicode_flag,
                 predicate.term.replace("\"", "\\\""),
                 PYTHON_OPERATOR[predicate.operator],
                 threshold))
            term_analysis_fields.append((predicate.field, predicate.term))
        else:
            code += (
                u"%sif (%s %s %s):\n" %
                (INDENT * depth,
                 map_data(field_info['slug'], False),
                 PYTHON_OPERATOR[predicate.operator],
                 threshold))

        # every child gets its own copy of the checked fields
        subtree_code = child.python_body(depth + 1, cmv=cmv[:],
                                         input_map=input_map,
                                         ids_path=ids_path,
                                         subtree=subtree)
        code += subtree_code[0]
        term_analysis_fields.extend(subtree_code[1])
    return code, term_analysis_fields
def tableau_body(self, body=u"", conditions=None, cmv=None,
                 ids_path=None, subtree=True):
    """Translate the model into a set of "if" statements in Tableau syntax.

    `body` is the text generated so far and `conditions` the list of
    conditions that lead to this node. `cmv` holds the names of the fields
    whose ISNULL check has already been emitted. As soon as a value is
    missing that node is returned without further evaluation.

    Returns the extended body, or an empty string when a child splits
    on a field whose optype is 'text'.
    """
    def objective_value():
        """Output of this node as it is written in the statement."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return self.output
        return tableau_string(self.output)

    if cmv is None:
        cmv = []
    keyword = u"ELSEIF"
    if conditions is None:
        # first statement of the expression
        conditions = []
        keyword = u"IF"

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        leaf_value = objective_value()
        body += u"%s %s THEN" % (keyword, " AND ".join(conditions))
        body += u" %s\n" % leaf_value
        return body

    split_name = self.fields[split(nodes)]['name']
    if split_name not in cmv:
        # missing value for the split: this node's output is the answer
        conditions.append("ISNULL([%s])" % split_name)
        body += u"%s %s THEN " % (keyword, " AND ".join(conditions))
        body += u"%s\n" % objective_value()
        cmv.append(split_name)
        conditions.pop()

    for child in nodes:
        predicate = child.predicate
        field_info = self.fields[predicate.field]
        if field_info['optype'] == 'text':
            return u""
        if field_info['optype'] == 'numeric':
            threshold = predicate.value
        else:
            threshold = repr(predicate.value)
        conditions.append("[%s]%s%s" % (
            field_info['name'],
            PYTHON_OPERATOR[predicate.operator],
            threshold))
        # children work on copies, so siblings do not see each other's state
        body = child.tableau_body(body, conditions[:], cmv=cmv[:],
                                  ids_path=ids_path, subtree=subtree)
        conditions.pop()
    return body
def python_body(self, depth=1, cmv=None, input_map=False,
                ids_path=None, subtree=True):
    """Translate the model into a set of "if" python statements.

    `depth` controls the size of indentation. `cmv` lists the slugs of
    the fields whose missing-value check has already been written. As soon
    as a value is missing that node is returned without further evaluation.

    Returns a tuple (body, term_analysis_fields): the generated source and
    the list of (field, term) pairs used in `term_matches` conditions.
    """
    def map_data(field, missing=False):
        """Returns the subject of the condition in map format when
        more than MAX_ARGS_LENGTH arguments are used.
        """
        if not input_map:
            return field
        if missing:
            return "not '%s' in data or data['%s']" % (field, field)
        return "data['%s']" % field

    def output_literal():
        """Output of this node as it is written in the generated code."""
        if self.fields[self.objective_field]['optype'] == 'numeric':
            return self.output
        return repr(self.output)

    if cmv is None:
        cmv = []
    term_analysis_fields = []

    nodes = filter_nodes(self.children, ids=ids_path, subtree=subtree)
    if not nodes:
        code = u"%sreturn %s\n" % (INDENT * depth, output_literal())
        return code, term_analysis_fields

    code = u""
    split_slug = self.fields[split(nodes)]['slug']
    if split_slug not in cmv:
        code += u"%sif (%s is None):\n" % (INDENT * depth,
                                           map_data(split_slug, True))
        code += u"%sreturn %s\n" % (INDENT * (depth + 1), output_literal())
        cmv.append(split_slug)

    for child in nodes:
        predicate = child.predicate
        field_info = self.fields[predicate.field]
        optype = field_info['optype']
        if optype in ('numeric', 'text'):
            threshold = predicate.value
        else:
            threshold = repr(predicate.value)

        if optype == 'text':
            # text fields are compared through the matches of a term
            unicode_flag = 'u' if isinstance(predicate.term, unicode) else ''
            code += (
                u"%sif (term_matches(%s, \"%s\", %s\"%s\") %s %s):\n" %
                (INDENT * depth,
                 map_data(field_info['slug'], False),
                 field_info['slug'],
                 unicode_flag,
                 predicate.term.replace("\"", "\\\""),
                 PYTHON_OPERATOR[predicate.operator],
                 threshold))
            term_analysis_fields.append((predicate.field, predicate.term))
        else:
            code += (
                u"%sif (%s %s %s):\n" %
                (INDENT * depth,
                 map_data(field_info['slug'], False),
                 PYTHON_OPERATOR[predicate.operator],
                 threshold))

        # every child gets its own copy of the checked fields
        subtree_code = child.python_body(depth + 1, cmv=cmv[:],
                                         input_map=input_map,
                                         ids_path=ids_path,
                                         subtree=subtree)
        code += subtree_code[0]
        term_analysis_fields.extend(subtree_code[1])
    return code, term_analysis_fields