示例#1
0
  def test_prepare_data_exception_mismatch_columns_numpy(self):
    """`_prepare_data` rejects a plain array whose column count does not fit the PMML.

    The document declares the data fields `Class` (marked as target in the
    mining schema) and `test1`, while the array handed in carries two columns.
    The test pins the exact error message raised for that situation.
    """
    document = StringIO("""
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="Class" optype="categorical" dataType="string">
          <Value value="setosa"/>
          <Value value="versicolor"/>
          <Value value="virginica"/>
        </DataField>
        <DataField name="test1" optype="continuous" dataType="double"/>
      </DataDictionary>
      <MiningSchema>
        <MiningField name="Class" usageType="target"/>
      </MiningSchema>
    </PMML>
    """)
    estimator = PMMLBaseEstimator(pmml=document)

    frame = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]], columns=["test1", "test2"])

    expected_message = (
      "The number of features in provided data does not match expected number of features "
      "in the PMML. Provide pandas.Dataframe, or provide data matching the DataFields in "
      "the PMML document."
    )

    with self.assertRaises(Exception) as caught:
      # Converting to an array drops the column labels before the call.
      estimator._prepare_data(np.asanyarray(frame))

    assert str(caught.exception) == expected_message
示例#2
0
  def test_get_type_exception(self):
    """`get_type` raises a descriptive error for an unknown dataType or optype.

    Both scenarios share one single-field PMML template; only the `optype`
    and `dataType` attribute values and the expected message differ.
    """
    template = """
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="test" optype="{}" dataType="{}"/>
      </DataDictionary>
    </PMML>"""

    # (optype, dataType, expected error message)
    scenarios = (
      # Invalid data type
      ("continuous", "does_not_exist", "Unsupported data type."),
      # Invalid operation type
      ("does_not_exist", "string", "Unsupported operation type."),
    )

    for optype, pmml_type, expected_message in scenarios:
      estimator = PMMLBaseEstimator(StringIO(template.format(optype, pmml_type)))
      field = estimator.root.find("DataDictionary").find("DataField")

      with self.assertRaises(Exception) as caught:
        get_type(field)
      assert str(caught.exception) == expected_message
示例#3
0
  def test_parse_type_value_continuous(self):
    """`parse_type` converts assorted inputs to the Python type of a continuous field.

    Every value in `values` is parsed against every continuous PMML data type
    in `type_specs`, and the result must be an instance of the matching
    Python type.
    """
    template = """
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="test" optype="{}" dataType="{}"/>
      </DataDictionary>
    </PMML>"""

    values = ["1234", 1234, 12.34, True]
    # (optype, PMML dataType, expected Python type)
    type_specs = [
      ('continuous', 'integer', int),
      ('continuous', 'float', float),
      ('continuous', 'double', float),
    ]

    for value in values:
      # Unpack in the loop header so no local shadows the builtin `type`.
      for optype, pmml_type, expected_type in type_specs:
        clf = PMMLBaseEstimator(pmml=StringIO(template.format(optype, pmml_type)))

        data_dictionary = clf.find(clf.root, "DataDictionary")
        data_field = clf.find(data_dictionary, "DataField")
        result = clf.parse_type(value, data_field)

        # Name the failing combination; a bare assert in a loop does not say which one broke.
        assert isinstance(result, expected_type), \
          "parse_type({!r}) for dataType {!r} returned {!r}".format(value, pmml_type, result)
示例#4
0
  def test_parse_type_value_continuous_boolean(self):
    """`parse_type` maps truthy/falsy spellings onto the boolean PMML data type.

    Each input is parsed against a `continuous`/`boolean` data field; the
    result must be a `Boolean` instance equal to the expected truth value.
    """
    template = """
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="test" optype="{}" dataType="{}"/>
      </DataDictionary>
    </PMML>"""

    # A list of pairs rather than a dict: `1` and `True` compare and hash
    # equal, so as dict keys they collapse into a single entry and the
    # `True` input would silently never be exercised.
    cases = [
      ("1", True),
      ("True", True),
      ("YES", True),
      (1, True),
      (True, True),
      ("0", False),
      ("False", False),
      (0, False),
    ]

    for value, expected in cases:
      clf = PMMLBaseEstimator(pmml=StringIO(template.format('continuous', 'boolean')))

      data_dictionary = clf.find(clf.root, "DataDictionary")
      data_field = clf.find(data_dictionary, "DataField")
      result = clf.parse_type(value, data_field)

      assert isinstance(result, Boolean)
      assert result == expected
示例#5
0
  def test_fit_exception(self):
    """Calling `fit` on the base estimator raises with the message "Not supported."."""
    empty_document = '<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3"/>'
    estimator = PMMLBaseEstimator(StringIO(empty_document))

    # NOTE(review): X and y are not defined in this method; presumably they are
    # module-level fixtures - confirm against the top of the file.
    with self.assertRaises(Exception) as caught:
      estimator.fit(X, y)

    message = str(caught.exception)
    assert message == "Not supported."
示例#6
0
  def test_prepare_data_removes_unused_columns(self):
    """`_prepare_data` returns only the columns named in the PMML, leaving the input intact.

    The document declares `Class` (the target) and `test1`; the frame passed
    in additionally has a `test2` column, which must be absent from the
    result while still present on the original frame.
    """
    document = StringIO("""
      <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
        <DataDictionary>
          <DataField name="Class" optype="categorical" dataType="string">
            <Value value="setosa"/>
            <Value value="versicolor"/>
            <Value value="virginica"/>
          </DataField>
          <DataField name="test1" optype="continuous" dataType="double"/>
        </DataDictionary>
        <MiningSchema>
          <MiningField name="Class" usageType="target"/>
        </MiningSchema>
      </PMML>
      """)
    estimator = PMMLBaseEstimator(pmml=document)

    frame = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]], columns=["test1", "test2"])
    prepared = estimator._prepare_data(frame)

    # The caller's frame keeps both columns ...
    assert list(frame.columns) == ["test1", "test2"]
    # ... while the prepared data only keeps the declared feature.
    assert list(prepared.columns) == ["test1"]
示例#7
0
  def test_target_field(self):
    """`target_field` is None without a mining schema and otherwise exposes the target's data field.

    The second document marks `Class` as the target; the returned element must
    carry the `name`, `optype` and `dataType` attributes of that data field.
    """
    empty_document = StringIO("""
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3"/>
    """)
    estimator = PMMLBaseEstimator(pmml=empty_document)
    assert estimator.target_field is None

    document_with_target = StringIO("""
      <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
        <DataDictionary>
          <DataField name="Class" optype="categorical" dataType="string">
            <Value value="setosa"/>
            <Value value="versicolor"/>
            <Value value="virginica"/>
          </DataField>
        </DataDictionary>
        <MiningSchema>
          <MiningField name="Class" usageType="target"/>
        </MiningSchema>
      </PMML>
      """)
    estimator = PMMLBaseEstimator(pmml=document_with_target)

    # (attribute name, expected value), checked in this order
    expected_attributes = (
      ('name', 'Class'),
      ('optype', 'categorical'),
      ('dataType', 'string'),
    )
    for attribute, expected in expected_attributes:
      assert estimator.target_field.get(attribute) == expected
示例#8
0
  def test_prepare_data_exception_mismatch_columns_pandas(self):
    """`_prepare_data` rejects a DataFrame whose column names differ from the PMML fields.

    The document declares `test1` and `test2` next to the `Class` target, but
    the frame's columns are named `Test_1` and `Test_2`. The test pins the
    exact error message raised for that mismatch.
    """
    document = StringIO("""
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="Class" optype="categorical" dataType="string">
          <Value value="setosa"/>
          <Value value="versicolor"/>
          <Value value="virginica"/>
        </DataField>
        <DataField name="test1" optype="continuous" dataType="double"/>
        <DataField name="test2" optype="continuous" dataType="double"/>
      </DataDictionary>
      <MiningSchema>
        <MiningField name="Class" usageType="target"/>
      </MiningSchema>
    </PMML>
    """)
    estimator = PMMLBaseEstimator(pmml=document)

    mislabelled = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]], columns=["Test_1", "Test_2"])
    expected_message = "The features in the input data do not match features expected by the PMML model."

    with self.assertRaises(Exception) as caught:
      estimator._prepare_data(mislabelled)

    assert str(caught.exception) == expected_message
示例#9
0
  def test_parse_type_value_categorical(self):
    """`parse_type` accepts declared category values and rejects anything else.

    The categorical `Class` field lists three values; each must be returned
    unchanged, while an unlisted value must raise with a fixed message.
    """
    template = """
      <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
        <DataDictionary>
          <DataField name="Class" optype="categorical" dataType="string">
            <Value value="setosa"/>
            <Value value="versicolor"/>
            <Value value="virginica"/>
          </DataField>
        </DataDictionary>
      </PMML>"""

    estimator = PMMLBaseEstimator(pmml=StringIO(template))
    dictionary = estimator.find(estimator.root, "DataDictionary")
    field = estimator.find(dictionary, "DataField")

    # A value outside the declared categories is rejected first ...
    with self.assertRaises(Exception) as caught:
      estimator.parse_type("not_in_category", field)
    assert str(caught.exception) == "Value does not match any category."

    # ... and every declared category is passed through as-is.
    for category in ("setosa", "versicolor", "virginica"):
      assert estimator.parse_type(category, field) == category
示例#10
0
  def test_evaluate_feature_mapping(self):
    clf = PMMLBaseEstimator(pmml=StringIO("""
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="Class" optype="categorical" dataType="string">
          <Value value="setosa"/>
          <Value value="versicolor"/>
          <Value value="virginica"/>
        </DataField>
        <DataField name="sepal length (cm)" optype="continuous" dataType="float"/>
        <DataField name="sepal width (cm)" optype="continuous" dataType="float"/>
      </DataDictionary>
      <TransformationDictionary>
        <DerivedField name="integer(sepal length (cm))" optype="continuous" dataType="integer">
          <FieldRef field="sepal length (cm)"/>
        </DerivedField>
        <DerivedField name="double(sepal width (cm))" optype="continuous" dataType="double">
          <FieldRef field="sepal width (cm)"/>
        </DerivedField>
      </TransformationDictionary>
      <MiningSchema>
			  <MiningField name="Class" usageType="target"/>
      </MiningSchema>
    </PMML>
    """))

    Result = namedtuple('Result', 'column type')
    tests = {
      'sepal length (cm)':         Result(column=0, type=float),
      'sepal width (cm)':          Result(column=1, type=float),
      'integer(sepal length (cm))':Result(column=0, type=int),
      'double(sepal width (cm))':  Result(column=1, type=float)
    }

    for i in range(0, len(df)):
      for feature, result in tests.items():
        column, mapping = clf.field_mapping[feature]
        assert column == result.column
        mapped_value = mapping(df.iloc[i][column])
        assert type(mapped_value) == result.type

        if result.type == Category:
          assert mapped_value.value == df.iloc[i][column]
          assert mapped_value.categories == ["setosa", "versicolor", "virginica"]
        else:
          assert mapped_value == result.type(df.iloc[i][column])
示例#11
0
  def test_parse_type_interval(self):
    """`parse_type` places a value in the matching `Interval` of an ordinal float field.

    The field declares five adjacent intervals with differing closures. Values
    inside an interval, or on a closed edge, must come back as the matching
    `Interval`; the value 1 lies on an open edge of both neighbouring
    intervals and is expected to raise.
    """
    template = """
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="test" optype="ordinal" dataType="float">
          <Interval closure="openOpen" rightMargin="1"/>
          <Interval closure="openClosed" leftMargin="1" rightMargin="1.5"/>
          <Interval closure="openOpen" leftMargin="1.5" rightMargin="2.5"/>
          <Interval closure="closedOpen" leftMargin="2.5" rightMargin="3.5"/>
          <Interval closure="closedClosed" leftMargin="3.5" />
        </DataField>
      </DataDictionary>
    </PMML>"""

    estimator = PMMLBaseEstimator(pmml=StringIO(template))
    dictionary = estimator.find(estimator.root, "DataDictionary")
    field = estimator.find(dictionary, "DataField")

    # Below the first right margin: unbounded on the left.
    parsed = estimator.parse_type(-1, field)
    assert parsed == Interval(-1, rightMargin=1, closure='openOpen')

    # Exactly 1 is excluded by both surrounding intervals.
    with self.assertRaises(Exception):
      estimator.parse_type(1, field)

    # Strictly inside an open interval.
    parsed = estimator.parse_type(2, field)
    assert parsed == Interval(2, leftMargin=1.5, rightMargin=2.5, closure='openOpen')

    # On a closed left edge.
    parsed = estimator.parse_type(2.5, field)
    assert parsed == Interval(2.5, leftMargin=2.5, rightMargin=3.5, closure='closedOpen')

    # On the closed left edge of the last, right-unbounded interval.
    parsed = estimator.parse_type(3.5, field)
    assert parsed == Interval(3.5, leftMargin=3.5, closure='closedClosed')
示例#12
0
  def test_evaluate_feature_mapping(self):
    """`field_mapping` yields the expected column index and data type per field.

    Data fields and derived fields must map to the column of the underlying
    input feature together with their declared type; the target field `Class`
    is expected to have no column (None) and a categorical type.
    """
    clf = PMMLBaseEstimator(pmml=StringIO("""
    <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
      <DataDictionary>
        <DataField name="Class" optype="categorical" dataType="string">
          <Value value="setosa"/>
          <Value value="versicolor"/>
          <Value value="virginica"/>
        </DataField>
        <DataField name="sepal length (cm)" optype="continuous" dataType="float"/>
        <DataField name="sepal width (cm)" optype="continuous" dataType="float"/>
      </DataDictionary>
      <TransformationDictionary>
        <DerivedField name="integer(sepal length (cm))" optype="continuous" dataType="integer">
          <FieldRef field="sepal length (cm)"/>
        </DerivedField>
        <DerivedField name="double(sepal width (cm))" optype="continuous" dataType="double">
          <FieldRef field="sepal width (cm)"/>
        </DerivedField>
      </TransformationDictionary>
      <MiningSchema>
        <MiningField name="Class" usageType="target"/>
      </MiningSchema>
    </PMML>
    """))

    Result = namedtuple('Result', 'column type')
    category = Category(str,
                        categories=['setosa', 'versicolor', 'virginica'],
                        ordered=False)
    tests = {
      'sepal length (cm)':          Result(column=0, type=float),
      'sepal width (cm)':           Result(column=1, type=float),
      'integer(sepal length (cm))': Result(column=0, type=int),
      'double(sepal width (cm))':   Result(column=1, type=float),
      'Class':                      Result(column=None, type=category),
    }

    # The mapping does not depend on any input row, so it is checked once
    # rather than once per row of an external data set. That also keeps the
    # assertions from being skipped entirely if that data set were empty.
    for feature, result in tests.items():
      column, data_type = clf.field_mapping[feature]
      assert column == result.column
      assert data_type == result.type
示例#13
0
  def test_get_type_ordinal(self):
    """`get_type` turns an ordinal string field into an ordered category type.

    The categories must appear in the order the `Value` elements are declared.
    """
    template = """
      <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
        <DataDictionary>
          <DataField name="Volume" optype="ordinal" dataType="string">
            <Value value="loud"/>
            <Value value="louder"/>
            <Value value="loudest"/>
          </DataField>
        </DataDictionary>
      </PMML>"""

    estimator = PMMLBaseEstimator(pmml=StringIO(template))
    field = estimator.root.find("DataDictionary").find("DataField")
    volume_type: Category = get_type(field)

    expected_categories = ['loud', 'louder', 'loudest']
    assert volume_type.categories == expected_categories
    assert volume_type.ordered
示例#14
0
  def test_get_type_categorical(self):
    """`get_type` turns a categorical string field into an unordered category type.

    The categories must list the declared `Value` elements in document order.
    """
    template = """
      <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
        <DataDictionary>
          <DataField name="Class" optype="categorical" dataType="string">
            <Value value="setosa"/>
            <Value value="versicolor"/>
            <Value value="virginica"/>
          </DataField>
        </DataDictionary>
      </PMML>"""

    estimator = PMMLBaseEstimator(pmml=StringIO(template))
    field = estimator.root.find("DataDictionary").find("DataField")
    class_type: Category = get_type(field)

    expected_categories = ['setosa', 'versicolor', 'virginica']
    assert class_type.categories == expected_categories
    assert not class_type.ordered