def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    Entries whose value is None are ignored. The dictionary received in
    `input_data` is never modified: filtering and type casting are
    applied to an internal copy.

    When `print_path` is True the path followed and the prediction are
    written to `out`, which is then flushed.

    Returns the prediction or, if `with_confidence` is True, the list
    [prediction, confidence, distribution, instances].

    Raises an Exception when a value cannot be cast to the type expected
    by its field.

    """
    # Work on a filtered copy so that the caller's dictionary keeps its
    # empty (None) entries and its original values.
    input_data = dict((key, value) for (key, value) in input_data.items()
                      if value is not None)

    if by_name:
        wrong_keys = [key for key in input_data
                      if key not in self.all_inverted_fields]
        if wrong_keys:
            LOGGER.error("Wrong field names in input data: %s",
                         ", ".join(wrong_keys))
        # Translate field names into field ids, silently dropping the
        # names that cannot be translated.
        input_data = dict((self.inverted_fields[key], value)
                          for (key, value) in input_data.items()
                          if key in self.inverted_fields)

    fields = self.tree.fields
    for (key, value) in list(input_data.items()):
        is_numeric = fields[key]['optype'] == 'numeric'
        is_string = isinstance(value, basestring)
        # A cast is needed only when the value's type disagrees with the
        # field's optype: a string for a numeric field, or a non-string
        # for any other kind of field.
        if is_numeric == is_string:
            try:
                input_data[key] = map_type(fields[key]['optype'])(value)
            except Exception:
                raise Exception(u"Mismatch input data type in field "
                                u"\"%s\" for value %s." %
                                (fields[key]['name'], value))

    prediction_info = self.tree.predict(input_data)
    prediction, path, confidence, distribution, instances = prediction_info

    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
        out.flush()
    if with_confidence:
        return [prediction, confidence, distribution, instances]
    return prediction
def pair(self, row, headers=None,
         objective_field=None, objective_field_present=None):
    """Pairs a list of values with their respective field ids.

    `objective_field` is the column number of the objective field. A
    string is also accepted and is resolved through
    `self.field_column_number` (or kept as the name when `headers` are
    used). When omitted, the highest known column number is used.

    `objective_field_present` must be True if the objective_field column
    is present in the row. When omitted it is guessed: with `headers`,
    from whether the objective field name is one of them; without
    `headers`, from whether the row has `self.len()` values.

    When `headers` is given, each value is matched to its field through
    the header at the same position and the result is keyed by header.
    Otherwise values are matched positionally against the sorted column
    numbers and the result is keyed by field id.

    Values found in `self.missing_tokens` and the objective field column
    are left out of the result. Note that `row` is updated in place with
    the values returned by `self.strip_affixes`.

    Raises an Exception when a value cannot be cast to the type expected
    by its field.

    """
    column_numbers = sorted(self.fields_by_column_number.keys())
    if objective_field is None:
        objective_field = column_numbers[-1]
    fields_names = [self.fields[self.field_id(column)]['name']
                    for column in column_numbers
                    if column != objective_field]

    def cast(field, value):
        """Casts `value` to the type expected by `field`."""
        try:
            return map_type(field['optype'])(value)
        except Exception:
            message = (u"Mismatch input data type in field "
                       u"\"%s\" for value %s. The expected "
                       u"fields are: \n%s" %
                       (field['name'], value,
                        ",".join(fields_names))).encode("utf-8")
            raise Exception(message)

    pairs = {}

    if headers:
        if not isinstance(objective_field, basestring):
            objective_field = self.field_name(objective_field)
        if objective_field_present is None:
            objective_field_present = objective_field in headers
        for index, value in enumerate(row):
            if value in self.missing_tokens:
                continue
            header = headers[index]
            if objective_field_present and header == objective_field:
                continue
            field = self.fields[self.fields_by_name[header]]
            row[index] = self.strip_affixes(value, field)
            pairs[header] = cast(field, row[index])
    else:
        if isinstance(objective_field, basestring):
            objective_field = self.field_column_number(objective_field)
        if objective_field_present is None:
            objective_field_present = len(row) == self.len()
        index = 0
        for column_number in column_numbers:
            if index >= len(row):
                # No values left in the row: nothing else can be paired.
                break
            if column_number == objective_field:
                # The objective column is handled before looking at the
                # value: it only consumes a position of the row when it
                # is present there. Checking the missing tokens first
                # would shift the following columns whenever the
                # objective column is absent and the current value is a
                # missing token.
                if objective_field_present:
                    index += 1
                continue
            value = row[index]
            if value not in self.missing_tokens:
                field = self.fields[self.field_id(column_number)]
                row[index] = self.strip_affixes(value, field)
                pairs[self.field_id(column_number)] = cast(field,
                                                           row[index])
            index += 1
    return pairs