Пример #1
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        Entries whose value is None are ignored. The dictionary passed by the
        caller is never modified: filtering, key translation and type
        casting are all applied to an internal copy.

        Arguments:
            input_data: dict of input values keyed by field name (or by
                        field id when `by_name` is False).
            by_name: when True, keys are checked against
                     `self.all_inverted_fields` (unknown ones are logged) and
                     translated through `self.inverted_fields`; keys that
                     are not in `self.inverted_fields` are dropped.
            print_path: when True, the prediction path and the prediction
                        are written to `out`.
            out: writable stream used when `print_path` is True.
            with_confidence: when True, a list
                             `[prediction, confidence, distribution,
                             instances]` is returned instead of the bare
                             prediction.

        Raises an Exception when a value cannot be cast to the type expected
        by its field.

        """
        # Work on a filtered copy so that the caller's dictionary is left
        # untouched (None values are treated as "not provided").
        input_data = dict((key, value)
                          for (key, value) in input_data.items()
                          if value is not None)

        if by_name:
            wrong_keys = [key for key in input_data
                          if key not in self.all_inverted_fields]
            if wrong_keys:
                LOGGER.error("Wrong field names in input data: %s" %
                             ", ".join(wrong_keys))
            # Translate field names into field ids, silently dropping the
            # names that cannot be translated.
            input_data = dict(
                (self.inverted_fields[key], value)
                for (key, value) in input_data.items()
                if key in self.inverted_fields)

        fields = self.tree.fields
        # Iterate over a snapshot because values are replaced inside the loop.
        for (key, value) in list(input_data.items()):
            optype = fields[key]['optype']
            is_numeric = optype == 'numeric'
            is_string = isinstance(value, basestring)
            # A cast is needed when a numeric field receives a string or a
            # non-numeric field receives something that is not a string.
            if is_numeric == is_string:
                try:
                    input_data[key] = map_type(optype)(value)
                except Exception:
                    raise Exception(u"Mismatch input data type in field "
                                    u"\"%s\" for value %s." %
                                    (fields[key]['name'],
                                     value))

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Пример #2
0
    def pair(self, row, headers=None,
             objective_field=None, objective_field_present=None):
        """Pairs a list of values with their respective fields.

           `row` is the sequence of values to pair. It is not modified.

           `headers`, when given, is the sequence of field names matching the
           positions in `row`; the resulting dict is then keyed by header.
           Without headers the values are matched positionally against the
           sorted column numbers and the resulting dict is keyed by field id.

           `objective_field` is the column number (or the name) of the
           objective field. It defaults to the highest column number.

           `objective_field_present` must be True if the objective_field
           column is present in the row. When it is None it is guessed: from
           the headers if they are given, otherwise by comparing the length
           of the row with `self.len()`.

           Values found in `self.missing_tokens` and the objective field
           value are left out of the result. An Exception is raised when a
           value cannot be cast to the type expected by its field.

        """
        column_numbers = sorted(self.fields_by_column_number.keys())

        if objective_field is None:
            objective_field = column_numbers[-1]

        # Only used to build the error message for type mismatches.
        fields_names = [self.fields[self.field_id(i)]['name']
                        for i in column_numbers
                        if i != objective_field]

        def convert(field, value):
            """Strips the field affixes from value and casts it to the
               type expected by the field.

            """
            value = self.strip_affixes(value, field)
            try:
                return map_type(field['optype'])(value)
            except Exception:
                message = (u"Mismatch input data type in field "
                           u"\"%s\" for value %s. The expected "
                           u"fields are: \n%s" %
                           (field['name'],
                            value,
                            ",".join(fields_names))).encode("utf-8")
                raise Exception(message)

        paired = {}

        if headers:
            if not isinstance(objective_field, basestring):
                objective_field = self.field_name(objective_field)
            if objective_field_present is None:
                objective_field_present = objective_field in headers
            # zip stops at the shortest sequence, so values without a
            # matching header are ignored instead of raising IndexError.
            for header, value in zip(headers, row):
                if value in self.missing_tokens:
                    continue
                if objective_field_present and header == objective_field:
                    continue
                field = self.fields[self.fields_by_name[header]]
                paired[header] = convert(field, value)
        else:
            if isinstance(objective_field, basestring):
                objective_field = self.field_column_number(objective_field)
            if objective_field_present is None:
                objective_field_present = len(row) == self.len()
            index = 0
            for column_number in column_numbers:
                # The objective column must be handled before looking at
                # the value: it only takes up a position in the row when it
                # is present, whatever the value at that position is.
                if column_number == objective_field:
                    if objective_field_present:
                        index += 1
                    continue
                if index < len(row) and row[index] not in self.missing_tokens:
                    field = self.fields[self.field_id(column_number)]
                    paired[self.field_id(column_number)] = convert(
                        field, row[index])
                index += 1

        return paired