Пример #1
0
  def test_parse_array_unknown(self):
    """parse_array must raise for an Array whose type attribute is not recognised."""
    document = '''
      <Array type="colors">🔴 🟢 🔵</Array>
    '''
    element = eTree.fromstring(document)

    with self.assertRaises(Exception) as context:
      parse_array(element)

    # The exact message is part of the contract being checked.
    message = str(context.exception)
    assert message == "Unknown array type encountered."
Пример #2
0
 def test_parse_sparse_array_int(self):
   """An INT-SparseArray with n=4 and three entries parses to a dense list of four ints."""
   document = '''
     <INT-SparseArray n="4">
       <Indices>1 2 3</Indices>
       <INT-Entries>3 1 4</INT-Entries>
     </INT-SparseArray>
   '''
   parsed = parse_array(eTree.fromstring(document))

   # The position without an explicit entry is expected to come back as 0.
   expected = [3, 1, 4, 0]
   assert parsed == expected
Пример #3
0
def get_vectors(vector_dictionary, s):
    """
    Return the values of one vector instance, parsed as a numpy array.

    Parameters
    ----------
    vector_dictionary : eTree.Element
        Element that is searched for a `VectorInstance` child with id `s`.

    s : str
        Identifier of the vector instance to look up.

    Returns
    -------
    numpy.ndarray
        Result of `parse_array` on the instance's array element, wrapped in
        `np.array`.

    Raises
    ------
    Exception
        If no `VectorInstance` with the given id exists, or if it contains
        none of the supported (sparse) array elements.

    """
    instance = vector_dictionary.find(f"VectorInstance[@id='{s}']")
    if instance is None:
        raise Exception(
            f'PMML model is broken, vector instance (id = {s}) not found.')

    # Candidate element names, tried in order; the first one present wins.
    for tag in ('Array', 'REAL-Array', 'SparseArray', 'REAL-SparseArray'):
        array = instance.find(tag)
        if array is not None:
            return np.array(parse_array(array))

    raise Exception(
        f'PMML model is broken, vector instance (id = {s}) does not contain (Sparse)Array element.'
    )
Пример #4
0
def construct_tree(node, classes, field_mapping, i=0, rescale_factor=1):
    """
    Generate nodes and values used for constructing Cython Tree class.

    The tree is flattened recursively: the entry for `node` comes first,
    followed by all entries of its left subtree, then all entries of its right
    subtree. Only the first two `Node` children of a node are used.

    Parameters
    ----------
    node : eTree.Element
        XML Node element representing the current node.

    classes : list, None
        List of possible target classes. Is `None` for regression trees.

    field_mapping: { str: (int, callable) }
        Dictionary mapping column names to tuples with 1) index of the column
        and 2) type of the column.

    i : int
        Index of the node in the result list.

    rescale_factor : float
        Factor to scale the output of every node with. Required for gradient
        boosting trees. Optional, and 1 by default.

    Returns
    -------
    (nodes, values) : tuple

        nodes : [()]
            List of nodes represented by: left child (int), right child (int),
            feature (int), value (np.float64 threshold for a SimplePredicate,
            8 packed bytes holding a uint64 category bitmask for a
            SimpleSetPredicate), impurity (always 0), sample count (int) and
            weighted sample count (float).

        values : [[]]
            List with training sample distributions at this node in the tree.

    Raises
    ------
    Exception
        If the first child node has neither a SimplePredicate nor a
        SimpleSetPredicate.

    """
    child_nodes = node.findall('Node')
    impurity = 0  # TODO: impurity doesnt affect predictions, but is nice to have
    i += 1

    def votes_for(field):
        """Return the ScoreDistribution element for class `field`, or None."""
        exact = node.find(f"ScoreDistribution[@value='{field}']")

        # Compare against None explicitly: an Element without children is
        # falsy, so `find(...) or find(...)` would throw away a valid match.
        if exact is not None:
            return exact

        # Deal with case where target field is a double, but ScoreDistribution value is an integer.
        if isinstance(field, float) and field.is_integer():
            return node.find(f"ScoreDistribution[@value='{int(field)}']")

        return None

    if not child_nodes:
        record_count = node.get('recordCount')

        if record_count is not None and classes is not None:
            node_count_weighted = float(record_count)
            node_count = int(node_count_weighted)
            # Look every class up once; classes without a distribution get 0 votes.
            score_distributions = [votes_for(c) for c in classes]
            votes = [[[
                float(d.get('recordCount')) if d is not None else 0.0
                for d in score_distributions
            ]]]
        else:
            score = node.get('score')
            node_count, node_count_weighted = (0, 0.0)

            if classes is None:
                # FIXME: unsure about `10 x rescale_factor`, but seems required, at least for r2pmml generated models
                votes = [[[float(score) * 10 * rescale_factor]]]
            else:
                # Without record counts, put a single vote on the predicted class.
                votes = [[[1.0 if str(c) == score else 0.0 for c in classes]]]

        return [(TREE_LEAF, TREE_LEAF, TREE_UNDEFINED, SPLIT_UNDEFINED,
                 impurity, node_count, node_count_weighted)], votes

    # The split is described by the predicate on the first child.
    predicate = child_nodes[0].find('SimplePredicate')
    set_predicate = child_nodes[0].find('SimpleSetPredicate')

    # Convert SimplePredicate with equals operator on category to set predicate
    if predicate is not None:
        is_categorical = isinstance(field_mapping[predicate.get('field')][1],
                                    Category)
        operator = predicate.get('operator')

        if is_categorical and operator in ('equal', 'notEqual'):
            set_predicate = _equality_as_set_predicate(
                predicate, 'isIn' if operator == 'equal' else 'isNotIn')
            predicate = None

    # For `>` / `>=` the first child is the "greater" branch; swap so that the
    # "less or equal" branch ends up on the left.
    if predicate is not None and predicate.get('operator') in [
            'greaterThan', 'greaterOrEqual'
    ]:
        child_nodes.reverse()

    left_node, left_value = construct_tree(child_nodes[0], classes,
                                           field_mapping, i, rescale_factor)
    offset = len(left_node)
    right_node, right_value = construct_tree(child_nodes[1], classes,
                                             field_mapping, i + offset,
                                             rescale_factor)

    children = left_node + right_node
    distributions = left_value + right_value

    if predicate is not None:
        column, _ = field_mapping[predicate.get('field')]

        # We do not use field_mapping type as the Cython tree only supports floats
        value = np.float64(predicate.get('value'))

        # Account for `>=` != `>` and `<` != `<=`. scikit-learn only uses `<=`.
        # Step toward -inf rather than `value - 1`, which equals `value` for
        # large magnitudes and would leave the threshold unchanged.
        if predicate.get('operator') in ('greaterOrEqual', 'lessThan'):
            value = np.nextafter(value, -np.inf)
    elif set_predicate is not None:
        column, field_type = field_mapping[set_predicate.get('field')]

        array = set_predicate.find('Array')
        categories = parse_array(array)

        # Bit k of the mask is set when category number k is part of the set.
        mask = 0

        for category in categories:
            try:
                index = field_type.categories.index(category)
                mask |= 1 << index
            except ValueError:
                warn(
                    'Categorical values are missing in the PMML document, '
                    'attempting to infer from decision tree splits.')
                field_type.categories.append(category)
                mask |= 1 << (len(field_type.categories) - 1)

        value = struct.pack(
            'Q', np.uint64(mask))  # Q = unsigned long long = uint64

        if set_predicate.get('booleanOperator') == 'isNotIn':
            value = struct.pack('Q', ~np.uint64(mask))
    else:
        raise Exception(
            'Unsupported tree format: unknown predicate structure in Node {}'
            .format(child_nodes[0].get('id')))

    if classes is None:
        distribution = [[0]]
        sample_count_weighted = 0
        sample_count = 0
    else:
        # distributions[0] belongs to the left child and distributions[offset]
        # to the right child; this node's distribution is their element-wise sum.
        distribution = [
            list(map(add, distributions[0][0], distributions[offset][0]))
        ]
        sample_count_weighted = sum(distribution[0])
        sample_count = int(sample_count_weighted)

    return [(i, i + offset, column, value, impurity, sample_count, sample_count_weighted)] + children, \
           [distribution] + distributions


def _equality_as_set_predicate(predicate, boolean_operator):
    """Build a one-value SimpleSetPredicate element equivalent to `predicate`.

    The element is assembled through the Element API instead of parsing an
    interpolated XML string, so field names and values containing characters
    such as `&`, `<` or `"` cannot produce malformed XML.
    """
    set_predicate = eTree.Element('SimpleSetPredicate', {
        'field': predicate.get('field'),
        'booleanOperator': boolean_operator,
    })
    array = eTree.SubElement(set_predicate, 'Array', {'type': 'string'})
    value = predicate.get('value')
    # Wrap the value in double quotes, matching the text the previous
    # string-template (`&quot;value&quot;`) produced.
    array.text = f'"{value}"'
    return set_predicate
Пример #5
0
 def test_parse_array_real(self):
   """An Array of type "real" parses to a list of floats."""
   document = '''
     <Array type="real">1.2 1.3 2.8</Array>
   '''
   parsed = parse_array(eTree.fromstring(document))

   expected = [1.2, 1.3, 2.8]
   assert parsed == expected
Пример #6
0
 def test_parse_array_int(self):
   """An Array of type "int" parses to a list of ints."""
   document = '''
     <Array type="int">3 1 4</Array>
   '''
   parsed = parse_array(eTree.fromstring(document))

   expected = [3, 1, 4]
   assert parsed == expected
Пример #7
0
 def test_parse_array_num(self):
   """A NUM-Array element (no type attribute) parses to a list of floats."""
   document = '''
     <NUM-Array>1.2 1.3 2.8</NUM-Array>
   '''
   parsed = parse_array(eTree.fromstring(document))

   expected = [1.2, 1.3, 2.8]
   assert parsed == expected
Пример #8
0
 def test_parse_array_string(self):
   """A quoted phrase in a string Array stays one item; unquoted words are split on spaces."""
   document = '''
     <Array type="string">&quot;test and stuff&quot; more tests</Array>
   '''
   parsed = parse_array(eTree.fromstring(document))

   expected = ['test and stuff', 'more', 'tests']
   assert parsed == expected