示例#1
0
 def load_data_dictionary(self, X: pd.DataFrame) -> None:
     """
     Build the data dictionary from a feature dataframe, and store it.

     One ``models.DataField`` is created per column of ``X``, named after
     the column label converted with ``str``. The column dtype selects the
     data type and operational type:

     * integer dtypes          -> ``DataType.integer`` / ``OpType.ordinal``
     * ``np.double`` dtypes    -> ``DataType.double`` / ``OpType.continuous``
     * other floating dtypes   -> ``DataType.float`` / ``OpType.continuous``
     * boolean dtypes          -> ``DataType.boolean`` / ``OpType.categorical``
     * every other dtype       -> ``DataType.string`` / ``OpType.categorical``

     Dtypes that are not NumPy dtypes (for example pandas extension dtypes
     such as ``category``) use the last row instead of raising.

     :param X: feature dataframe whose column dtypes are inspected
     :return: nothing; the resulting ``models.DataDictionary`` is stored
         in ``self._data_dictionary``
     """
     data_fields = []
     for column, dtype in X.dtypes.items():
         # Fallback for every dtype that none of the checks below matches.
         data_type = models.DataType.string
         op_type = models.OpType.categorical
         # np.issubdtype raises TypeError for dtypes NumPy cannot interpret
         # (pandas extension dtypes), so only NumPy dtypes are inspected;
         # anything else keeps the string/categorical fallback.
         if isinstance(dtype, np.dtype):
             if np.issubdtype(dtype, np.integer):
                 data_type = models.DataType.integer
                 op_type = models.OpType.ordinal
             elif np.issubdtype(dtype, np.double):
                 # Checked before np.floating, which would also match it.
                 data_type = models.DataType.double
                 op_type = models.OpType.continuous
             elif np.issubdtype(dtype, np.floating):
                 data_type = models.DataType.float
                 op_type = models.OpType.continuous
             elif np.issubdtype(dtype, np.bool_):
                 data_type = models.DataType.boolean
                 op_type = models.OpType.categorical
         data_fields.append(
             models.DataField(name=str(column),
                              optype=op_type,
                              dataType=data_type))
     self._data_dictionary = models.DataDictionary(data_fields)
    def test_serialize_mining_schema(self):
        """Check the MiningSchema element produced by the serializer.

        A model with one active and one target mining field is serialized,
        and the parsed ``PMML/RuleSetModel/MiningSchema`` node is compared
        with the parsed expected XML.
        """
        # arrange
        nyoka_serializer = serializer.NyokaSerializer()
        active_field = models.MiningField(
            name='toto0',
            usageType=models.MiningFieldUsageType.active)
        target_field = models.MiningField(
            name='toto1',
            usageType=models.MiningFieldUsageType.target)
        schema = models.MiningSchema(miningFields=[active_field, target_field])
        empty_dictionary = models.DataDictionary(dataFields=None)  # noqa
        rule_set_model = models.RuleSetModel(miningSchema=schema,
                                             ruleSet=None)  # noqa
        pmml_model = models.SimplePMMLRuleSetModel(
            dataDictionary=empty_dictionary,
            ruleSetModel=rule_set_model)

        # when
        pmml_xml = nyoka_serializer.serialize(pmml_model)
        parsed_pmml = xmltodict.parse(xml_input=pmml_xml)
        actual_schema = parsed_pmml['PMML']['RuleSetModel']['MiningSchema']

        # assert
        expected_xml = '''
            <MiningSchema>
                <MiningField name="toto0" usageType="active"/>
                <MiningField name="toto1" usageType="target"/>
            </MiningSchema>
            '''
        expected_schema = xmltodict.parse(
            xml_input=expected_xml)['MiningSchema']
        self.assertEqual(actual_schema, expected_schema)
    def test_serialize_rule_set(self):
        """Check the RuleSet element, with and without optional attributes.

        Two models are serialized: one whose rule set sets ``recordCount``,
        ``nbCorrect``, ``defaultScore`` and ``defaultConfidence``, and one
        that only sets the rule selection method. Each parsed
        ``PMML/RuleSetModel/RuleSet`` node is compared with its expected XML.
        """
        # arrange
        nyoka_serializer = serializer.NyokaSerializer()

        # model with every optional rule-set attribute filled in
        detailed_dictionary = models.DataDictionary(dataFields=None)  # noqa
        detailed_rule_set = models.RuleSet(
            ruleSelectionMethod=[models.RuleSelectionMethod.firstHit],
            rules=None,  # noqa
            recordCount=5,
            nbCorrect=3,
            defaultScore="toto",
            defaultConfidence=0.5)
        detailed_model = models.SimplePMMLRuleSetModel(
            dataDictionary=detailed_dictionary,
            ruleSetModel=models.RuleSetModel(
                miningSchema=None,  # noqa
                ruleSet=detailed_rule_set))

        # model that leaves the optional rule-set attributes unset
        minimal_dictionary = models.DataDictionary(dataFields=None)  # noqa
        minimal_rule_set = models.RuleSet(
            ruleSelectionMethod=[models.RuleSelectionMethod.firstHit],
            rules=None)  # noqa
        minimal_model = models.SimplePMMLRuleSetModel(
            dataDictionary=minimal_dictionary,
            ruleSetModel=models.RuleSetModel(
                miningSchema=None,  # noqa
                ruleSet=minimal_rule_set))

        # when
        detailed_xml = nyoka_serializer.serialize(detailed_model)
        minimal_xml = nyoka_serializer.serialize(minimal_model)
        detailed_parsed = xmltodict.parse(xml_input=detailed_xml)
        actual_detailed = detailed_parsed['PMML']['RuleSetModel']['RuleSet']
        minimal_parsed = xmltodict.parse(xml_input=minimal_xml)
        actual_minimal = minimal_parsed['PMML']['RuleSetModel']['RuleSet']

        # assert
        expected_detailed_xml = '''
            <RuleSet recordCount="5" nbCorrect="3" defaultScore="toto" defaultConfidence="0.5">
                <RuleSelectionMethod criterion="firstHit"/>
            </RuleSet>
            '''
        expected_minimal_xml = '''
                <RuleSet>
                    <RuleSelectionMethod criterion="firstHit"/>
                </RuleSet>
                '''
        expected_detailed = xmltodict.parse(
            xml_input=expected_detailed_xml)['RuleSet']
        self.assertEqual(actual_detailed, expected_detailed)
        expected_minimal = xmltodict.parse(
            xml_input=expected_minimal_xml)['RuleSet']
        self.assertEqual(actual_minimal, expected_minimal)
    def test_serialize_predicate(self):
        """Check how simple and compound predicates are serialized.

        The rule set holds one rule with a single ``SimplePredicate`` and one
        rule with an ``and`` ``CompoundPredicate`` plus the optional rule
        attributes. The parsed ``PMML/RuleSetModel/RuleSet`` node is compared
        with the parsed expected XML.
        """
        # arrange
        nyoka_serializer = serializer.NyokaSerializer()
        empty_dictionary = models.DataDictionary(dataFields=None)  # noqa

        # first rule: a single comparison
        threshold_predicate = models.SimplePredicate(
            field='toto1',
            operator=models.Operator.greaterOrEqual,
            value='128')
        simple_rule = models.SimpleRule(score='test1',
                                        predicate=threshold_predicate)

        # second rule: two comparisons joined by "and", all attributes set
        below_limit = models.SimplePredicate(
            field='toto2',
            operator=models.Operator.lessThan,
            value='20.5')
        is_good = models.SimplePredicate(
            field='toto2',
            operator=models.Operator.equal,
            value='good')
        both_conditions = models.CompoundPredicate(
            booleanOperator=models.BooleanOperator.and_,
            simplePredicates=[below_limit, is_good])
        compound_rule = models.SimpleRule(
            score='test2',
            predicate=both_conditions,
            id='test-id',
            recordCount=5,
            nbCorrect=3,
            confidence=0.86,
            weight=0.6)

        rule_set = models.RuleSet(
            ruleSelectionMethod=None,  # noqa
            rules=[simple_rule, compound_rule])
        pmml_model = models.SimplePMMLRuleSetModel(
            dataDictionary=empty_dictionary,
            ruleSetModel=models.RuleSetModel(
                miningSchema=None,  # noqa
                ruleSet=rule_set))

        # when
        pmml_xml = nyoka_serializer.serialize(pmml_model)
        parsed_pmml = xmltodict.parse(xml_input=pmml_xml)
        actual_rule_set = parsed_pmml['PMML']['RuleSetModel']['RuleSet']

        # assert
        expected_xml = '''
            <RuleSet>
                <SimpleRule score="test1">
                    <SimplePredicate field="toto1" operator="greaterOrEqual" value="128"/>
                </SimpleRule>
                <SimpleRule id="test-id" score="test2" recordCount="5" nbCorrect="3" confidence="0.86" weight="0.6">
                    <CompoundPredicate booleanOperator="and">
                        <SimplePredicate field="toto2" operator="lessThan" value="20.5"/>
                        <SimplePredicate field="toto2" operator="equal" value="good"/>
                    </CompoundPredicate>
                </SimpleRule>
            </RuleSet>
            '''
        expected_rule_set = xmltodict.parse(xml_input=expected_xml)['RuleSet']
        self.assertEqual(actual_rule_set, expected_rule_set)
    def test_serialize_data_dictionary(self):
        """Check the DataDictionary element produced by the serializer.

        A model holding four data fields and no rule set model is serialized,
        and the parsed ``PMML/DataDictionary`` node is compared with the
        parsed expected XML.
        """
        # arrange
        nyoka_serializer = serializer.NyokaSerializer()
        # (name, operational type, data type) of each field, in output order
        field_specs = [
            ('toto0', models.OpType.continuous, models.DataType.float),
            ('toto1', models.OpType.ordinal, models.DataType.string),
            ('toto2', models.OpType.categorical, models.DataType.boolean),
            ('toto3', models.OpType.categorical, models.DataType.integer),
        ]
        fields = [
            models.DataField(name=field_name,
                             optype=op_type,
                             dataType=data_type)
            for field_name, op_type, data_type in field_specs
        ]
        dictionary = models.DataDictionary(dataFields=fields)
        pmml_model = models.SimplePMMLRuleSetModel(dataDictionary=dictionary,
                                                   ruleSetModel=None)  # noqa

        # when
        pmml_xml = nyoka_serializer.serialize(pmml_model)
        parsed_pmml = xmltodict.parse(xml_input=pmml_xml)
        actual_dictionary = parsed_pmml['PMML']['DataDictionary']

        # assert
        expected_xml = '''
        <DataDictionary numberOfFields="4">
            <DataField name="toto0" optype="continuous" dataType="float"/>
            <DataField name="toto1" optype="ordinal" dataType="string"/>
            <DataField name="toto2" optype="categorical" dataType="boolean"/>
            <DataField name="toto3" optype="categorical" dataType="integer"/>
        </DataDictionary>
        '''
        expected_dictionary = xmltodict.parse(
            xml_input=expected_xml)['DataDictionary']
        self.assertEqual(actual_dictionary, expected_dictionary)