Example #1
def generate_folds(
        n: int,
        td: TrainingData) -> Iterator[Tuple[TrainingData, TrainingData]]:
    """Generates n cross validation folds for training data td."""

    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(n_splits=n, shuffle=True)
    x = td.intent_examples
    y = [example.get("intent") for example in x]
    for i_fold, (train_index, test_index) in enumerate(skf.split(x, y)):
        logger.debug("Fold: {}".format(i_fold))
        train = [x[i] for i in train_index]
        test = [x[i] for i in test_index]
        yield (
            TrainingData(
                training_examples=train,
                entity_synonyms=td.entity_synonyms,
                regex_features=td.regex_features,
            ),
            TrainingData(
                training_examples=test,
                entity_synonyms=td.entity_synonyms,
                regex_features=td.regex_features,
            ),
        )
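
A minimal usage sketch for the fold generator above, assuming td is a Rasa-style TrainingData instance loaded elsewhere; the fold count and the print statement are purely illustrative:

# Illustrative only: iterate over five stratified folds and report split sizes.
for fold_train, fold_test in generate_folds(5, td):
    print("train: {}, test: {}".format(
        len(fold_train.intent_examples), len(fold_test.intent_examples)))
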
Example #2
def drop_intents_below_freq(td: TrainingData, cutoff: int = 5) -> TrainingData:
    """Remove intent groups with less than cutoff instances."""

    logger.debug("Raw data intent examples: {}".format(len(
        td.intent_examples)))
    keep_examples = [
        ex for ex in td.intent_examples
        if td.examples_per_intent[ex.get("intent")] >= cutoff
    ]

    return TrainingData(keep_examples, td.entity_synonyms, td.regex_features)
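
A hedged sketch of how this filter could feed the fold generator from Example #1, so that StratifiedKFold always sees enough examples per intent; the cutoff and fold count are illustrative:

# Illustrative only: drop rare intents, then cross-validate on the rest.
filtered = drop_intents_below_freq(td, cutoff=5)
for fold_train, fold_test in generate_folds(3, filtered):
    pass  # train and evaluate a model on each fold here
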
Example #3
def compare_nlu(
    configs: List[Text],
    data: TrainingData,
    exclusion_percentages: List[int],
    f_score_results: Dict[Text, Any],
    model_names: List[Text],
    output: Text,
    runs: int,
) -> List[int]:
    """
    Trains and compares multiple NLU models.
    For each run and exclusion percentage, one model is trained per config
    file, using only the training data that remains after excluding the
    current percentage. Each model is then evaluated on the complete test
    data of that run, and all results are stored in the provided output
    directory.

    Args:
        configs: config files needed for training
        data: training data
        exclusion_percentages: percentages of training data to exclude during comparison
        f_score_results: dictionary of model name to f-score results per run
        model_names: names of the models to train
        output: the output directory
        runs: number of comparison runs

    Returns: training examples per run
    """

    training_examples_per_run = []

    for run in range(runs):

        logger.info("Beginning comparison run {}/{}".format(run + 1, runs))

        run_path = os.path.join(output, "run_{}".format(run + 1))
        create_path(run_path)

        test_path = os.path.join(run_path, TEST_DATA_FILE)
        create_path(test_path)

        # Split off a fresh test set for this run; train is the pool that
        # the exclusion percentages are applied to below.
        train, test = data.train_test_split()
        write_to_file(test_path, test.as_markdown())

        # Reset for each run; the returned list reflects the last run's split sizes.
        training_examples_per_run = []

        for percentage in exclusion_percentages:
            percent_string = "{}%_exclusion".format(percentage)

            # Drop the given percentage from the current training pool and
            # keep the remainder as the new, smaller training set.
            _, train = train.train_test_split(percentage / 100)
            training_examples_per_run.append(len(train.training_examples))

            model_output_path = os.path.join(run_path, percent_string)
            train_split_path = os.path.join(model_output_path, TRAIN_DATA_FILE)
            create_path(train_split_path)
            write_to_file(train_split_path, train.as_markdown())

            for nlu_config, model_name in zip(configs, model_names):
                logger.info(
                    "Evaluating configuration '{}' with {} training data.".
                    format(model_name, percent_string))

                try:
                    model_path = train_nlu(
                        nlu_config,
                        train_split_path,
                        model_output_path,
                        fixed_model_name=model_name,
                    )
                except Exception as e:
                    logger.warning(
                        "Training model '{}' failed. Error: {}".format(
                            model_name, str(e)))
                    f_score_results[model_name][run].append(0.0)
                    continue

                model_path = os.path.join(get_model(model_path), "nlu")

                report_path = os.path.join(model_output_path,
                                           "{}_report".format(model_name))
                errors_path = os.path.join(report_path, "errors.json")
                result = run_evaluation(test_path,
                                        model_path,
                                        report=report_path,
                                        errors=errors_path)

                f1 = result["intent_evaluation"]["f1_score"]
                f_score_results[model_name][run].append(f1)

    return training_examples_per_run
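
A hedged example of invoking compare_nlu; the config paths, model names, run count, and percentages are illustrative, and f_score_results is pre-filled with one empty list per run, which is the shape the function appends into:

# Illustrative only: compare two configurations over three runs.
configs = ["config_light.yml", "config_heavy.yml"]
model_names = ["light", "heavy"]
runs = 3
exclusion_percentages = [0, 25, 50, 75]

# One f-score list per run for every model; compare_nlu appends into these.
f_score_results = {name: [[] for _ in range(runs)] for name in model_names}

examples_per_run = compare_nlu(
    configs,
    data,  # a TrainingData instance loaded elsewhere
    exclusion_percentages,
    f_score_results,
    model_names,
    output="nlu_comparison_results",
    runs=runs,
)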