Example #1
def test_aggregate_utils_should_work():
    # Given
    lhs_metrics = {
        "intent1": {
            "exact_parsings": 2,
            "intent": {
                "false_positive": 4,
                "true_positive": 6,
                "false_negative": 9
            },
            "slots": {
                "slot1": {
                    "false_positive": 1,
                    "true_positive": 2,
                    "false_negative": 3
                },
            },
        },
        "intent2": {
            "exact_parsings": 1,
            "intent": {
                "false_positive": 3,
                "true_positive": 2,
                "false_negative": 5
            },
            "slots": {
                "slot2": {
                    "false_positive": 4,
                    "true_positive": 2,
                    "false_negative": 2
                },
            },
        },
    }

    rhs_metrics = {
        "intent1": {
            "exact_parsings": 3,
            "intent": {
                "false_positive": 3,
                "true_positive": 3,
                "false_negative": 3
            },
            "slots": {
                "slot1": {
                    "false_positive": 2,
                    "true_positive": 3,
                    "false_negative": 1
                },
            },
        },
        "intent2": {
            "exact_parsings": 5,
            "intent": {
                "false_positive": 4,
                "true_positive": 5,
                "false_negative": 6
            },
            "slots": {},
        },
        "intent3": {
            "exact_parsings": 0,
            "intent": {
                "false_positive": 1,
                "true_positive": 7,
                "false_negative": 2
            },
            "slots": {},
        },
    }

    # When
    aggregated_metrics = aggregate_metrics(lhs_metrics, rhs_metrics, True)

    # Then
    expected_metrics = {
        "intent1": {
            "exact_parsings": 5,
            "intent": {
                "false_positive": 7,
                "true_positive": 9,
                "false_negative": 12,
            },
            "slots": {
                "slot1": {
                    "false_positive": 3,
                    "true_positive": 5,
                    "false_negative": 4
                },
            },
        },
        "intent2": {
            "exact_parsings": 6,
            "intent": {
                "false_positive": 7,
                "true_positive": 7,
                "false_negative": 11,
            },
            "slots": {
                "slot2": {
                    "false_positive": 4,
                    "true_positive": 2,
                    "false_negative": 2
                },
            },
        },
        "intent3": {
            "exact_parsings": 0,
            "intent": {
                "false_positive": 1,
                "true_positive": 7,
                "false_negative": 2
            },
            "slots": {},
        },
    }

    assert expected_metrics == aggregated_metrics
Example #2
def compute_cross_val_metrics(dataset,
                              engine_class,
                              nb_folds=5,
                              train_size_ratio=1.0,
                              drop_entities=False,
                              include_slot_metrics=True,
                              slot_matching_lambda=None,
                              progression_handler=None):
    """Compute end-to-end metrics on the dataset using cross validation

    Args:
        dataset (dict or str): Dataset or path to dataset
        engine_class: Python class to use for training and inference, this
            class must inherit from `Engine`
        nb_folds (int, optional): Number of folds to use for cross validation
        train_size_ratio (float, optional): Ratio of intent utterances to use
            for training (default=1.0)
        drop_entities (bool, optional): Specify whether or not all entity
            values should be removed from the training data (default=False)
        include_slot_metrics (bool, optional): If false, the slot metrics and
            the slot parsing errors will not be reported (default=True)
        slot_matching_lambda (lambda, optional):
            lambda expected_slot, actual_slot -> bool,
            if defined, this function will be used to match slots when computing
            metrics, otherwise exact match will be used.
            `expected_slot` corresponds to the slot as defined in the dataset,
            and `actual_slot` corresponds to the slot as returned by the NLU
        progression_handler (lambda, optional): handler called at each
            progression (%) step

    Returns:
        dict: Metrics results containing the following data

            - "metrics": the computed metrics
            - "parsing_errors": the list of parsing errors

    """

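    # Accept either a dataset dict or a path to a JSON dataset file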
    if isinstance(dataset, basestring):
        with io.open(dataset, encoding="utf8") as f:
            dataset = json.load(f)

    try:
        splits = create_shuffle_stratified_splits(dataset, nb_folds,
                                                  train_size_ratio,
                                                  drop_entities)
    except NotEnoughDataError as e:
        print("Skipping metrics computation because of: %s" % e.message)
        return {METRICS: None, PARSING_ERRORS: []}

    intent_list = sorted(list(dataset["intents"]))
    global_metrics = dict()
    global_confusion_matrix = None
    global_errors = []
    total_splits = len(splits)
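    # Train and evaluate an engine on each split, accumulating metrics,
    # parsing errors and the confusion matrix across splits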
    for split_index, (train_dataset, test_utterances) in enumerate(splits):
        engine = engine_class()
        engine.fit(train_dataset)
        split_metrics, errors, confusion_matrix = compute_engine_metrics(
            engine, test_utterances, intent_list, include_slot_metrics,
            slot_matching_lambda)
        global_metrics = aggregate_metrics(global_metrics, split_metrics,
                                           include_slot_metrics)
        global_confusion_matrix = aggregate_matrices(global_confusion_matrix,
                                                     confusion_matrix)
        global_errors += errors
        if progression_handler is not None:
            progression_handler(float(split_index + 1) / float(total_splits))

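    # Derive precision, recall and f1 values from the aggregated counts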
    global_metrics = compute_precision_recall_f1(global_metrics)

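    # Attach the number of dataset utterances available for each intent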
    nb_utterances = {
        intent: len(data[UTTERANCES])
        for intent, data in dataset[INTENTS].items()
    }
    for intent, metrics in global_metrics.items():
        metrics[INTENT_UTTERANCES] = nb_utterances.get(intent, 0)

    return {
        METRICS: global_metrics,
        PARSING_ERRORS: global_errors,
        CONFUSION_MATRIX: global_confusion_matrix
    }
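
A hedged usage sketch based on the docstring above. The import path, the
MyEngine subclass, the dataset path and the "confusion_matrix" result key are
illustrative assumptions; only the "metrics" and "parsing_errors" keys are
documented explicitly.

from snips_nlu_metrics import Engine, compute_cross_val_metrics


class MyEngine(Engine):
    def fit(self, dataset):
        # Train the underlying NLU engine on the training split
        ...

    def parse(self, text):
        # Return the parsed intent and slots for a single utterance
        ...


results = compute_cross_val_metrics(
    dataset="path/to/dataset.json",  # or an already-loaded dataset dict
    engine_class=MyEngine,
    nb_folds=5,
    train_size_ratio=1.0,
    include_slot_metrics=True,
    progression_handler=lambda ratio: print("%.0f%% done" % (100 * ratio)),
)
metrics = results["metrics"]
parsing_errors = results["parsing_errors"]
confusion_matrix = results["confusion_matrix"]  # key name assumed
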
Example #3
    def test_aggregate_utils_should_work(self):
        # Given
        lhs_metrics = {
            "intent1": {
                "intent": {
                    "false_positive": 4,
                    "true_positive": 6,
                    "false_negative": 9
                },
                "slots": {
                    "slot1": {
                        "false_positive": 1,
                        "true_positive": 2,
                        "false_negative": 3
                    },
                },
            },
            "intent2": {
                "intent": {
                    "false_positive": 3,
                    "true_positive": 2,
                    "false_negative": 5
                },
                "slots": {
                    "slot2": {
                        "false_positive": 4,
                        "true_positive": 2,
                        "false_negative": 2
                    },
                }
            },
        }

        rhs_metrics = {
            "intent1": {
                "intent": {
                    "false_positive": 3,
                    "true_positive": 3,
                    "false_negative": 3
                },
                "slots": {
                    "slot1": {
                        "false_positive": 2,
                        "true_positive": 3,
                        "false_negative": 1
                    },
                }
            },
            "intent2": {
                "intent": {
                    "false_positive": 4,
                    "true_positive": 5,
                    "false_negative": 6
                },
                "slots": {}
            },
            "intent3": {
                "intent": {
                    "false_positive": 1,
                    "true_positive": 7,
                    "false_negative": 2
                },
                "slots": {}
            },
        }

        # When
        aggregated_metrics = aggregate_metrics(lhs_metrics, rhs_metrics, True)

        # Then
        expected_metrics = {
            "intent1": {
                "intent": {
                    "false_positive": 7,
                    "true_positive": 9,
                    "false_negative": 12,
                },
                "slots": {
                    "slot1": {
                        "false_positive": 3,
                        "true_positive": 5,
                        "false_negative": 4
                    },
                }
            },
            "intent2": {
                "intent": {
                    "false_positive": 7,
                    "true_positive": 7,
                    "false_negative": 11,
                },
                "slots": {
                    "slot2": {
                        "false_positive": 4,
                        "true_positive": 2,
                        "false_negative": 2
                    },
                }
            },
            "intent3": {
                "intent": {
                    "false_positive": 1,
                    "true_positive": 7,
                    "false_negative": 2
                },
                "slots": {}
            },
        }

        self.assertDictEqual(expected_metrics, aggregated_metrics)
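
The two aggregate_metrics tests in this listing exercise the same additive
behavior: the two metrics dicts are merged intent by intent, the
true_positive / false_positive / false_negative counters (and
"exact_parsings" where present) are summed, and any intent or slot that
appears on only one side is carried over unchanged. Below is a minimal sketch
of that behavior, assuming aggregate_metrics does nothing more than this
(naive_aggregate is a hypothetical name, not the library function):

import copy


def naive_aggregate(lhs, rhs, include_slot_metrics=True):
    # Illustrative re-implementation of the additive merge the tests expect;
    # not the snips_nlu_metrics implementation.
    aggregated = copy.deepcopy(lhs)
    for intent, metrics in rhs.items():
        if intent not in aggregated:
            # Intents seen on only one side are carried over as-is
            aggregated[intent] = copy.deepcopy(metrics)
            continue
        agg = aggregated[intent]
        if "exact_parsings" in metrics:
            agg["exact_parsings"] = (agg.get("exact_parsings", 0)
                                     + metrics["exact_parsings"])
        for counter, value in metrics["intent"].items():
            # Sum the intent-level classification counters
            agg["intent"][counter] = agg["intent"].get(counter, 0) + value
        if include_slot_metrics:
            for slot, counters in metrics.get("slots", {}).items():
                agg_slot = agg["slots"].setdefault(slot, {})
                for counter, value in counters.items():
                    agg_slot[counter] = agg_slot.get(counter, 0) + value
    return aggregated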