Example #1
0
def test_fingerprint_is_same_when_loading_data_again():
    """Loading identical training data twice yields identical fingerprints."""
    from rasa.shared.importers.utils import training_data_from_paths

    paths = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    first_load = training_data_from_paths(paths, language="en")
    second_load = training_data_from_paths(paths, language="en")

    assert first_load.fingerprint() == second_load.fingerprint()
Example #2
0
def test_demo_data(files: List[Text]):
    """Check the parsed demo training data against its known contents."""
    from rasa.shared.importers.utils import training_data_from_paths

    td = training_data_from_paths(files, language="en")

    assert td.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert td.entities == {"location", "cuisine"}
    assert set(td.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }

    # Hand-counted totals for the demo files.
    assert len(td.training_examples) == 46
    assert len(td.intent_examples) == 46
    assert len(td.response_examples) == 4
    assert len(td.entity_examples) == 11
    assert len(td.responses) == 2

    # Synonyms map spelling variants onto a canonical value.
    assert td.entity_synonyms == {
        "Chines": "chinese",
        "Chinese": "chinese",
        "chines": "chinese",
        "vegg": "vegetarian",
        "veggie": "vegetarian",
    }

    assert td.regex_features == [
        {"name": "greet", "pattern": r"hey[^\s]*"},
        {"name": "zipcode", "pattern": r"[0-9]{5}"},
    ]
Example #3
0
def test_train_test_split(filepaths: List[Text]):
    """Splitting preserves totals, split sizes, and class coverage.

    For a range of train fractions, verify that no example is lost or
    duplicated, that the train split size matches the expected floor (within
    one example), and that every intent and response class is represented in
    both splits.
    """
    from rasa.shared.importers.utils import training_data_from_paths

    training_data = training_data_from_paths(filepaths, language="en")

    assert training_data.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert training_data.entities == {"location", "cuisine"}
    assert set(training_data.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }

    NUM_TRAIN_EXAMPLES = 46
    NUM_RESPONSE_EXAMPLES = 4

    assert len(training_data.training_examples) == NUM_TRAIN_EXAMPLES
    assert len(training_data.intent_examples) == NUM_TRAIN_EXAMPLES
    assert len(training_data.response_examples) == NUM_RESPONSE_EXAMPLES

    for train_percent in range(50, 95, 5):
        train_frac = train_percent / 100.0
        train_split, test_split = training_data.train_test_split(train_frac)

        # No examples are lost or duplicated by the split.
        assert (len(test_split.training_examples) +
                len(train_split.training_examples) == NUM_TRAIN_EXAMPLES)

        # Number of stratification classes: plain intents minus retrieval
        # intents (those are counted via their responses) plus responses.
        num_classes = (
            len(training_data.number_of_examples_per_intent.keys())
            - len(training_data.retrieval_intents)
            + len(training_data.number_of_examples_per_response))

        expected_num_train_examples_floor = int(train_frac *
                                                NUM_TRAIN_EXAMPLES)
        # If the remainder cannot cover one example per class, the split
        # keeps fewer training examples than the plain floor.
        if NUM_TRAIN_EXAMPLES - expected_num_train_examples_floor < num_classes:
            expected_num_train_examples_floor = NUM_TRAIN_EXAMPLES - num_classes - 1

        assert len(
            train_split.training_examples) >= expected_num_train_examples_floor
        assert (len(train_split.training_examples) <=
                expected_num_train_examples_floor + 1)

        # Every intent class appears in both splits.
        assert len(training_data.number_of_examples_per_intent.keys()) == len(
            test_split.number_of_examples_per_intent.keys())
        assert len(training_data.number_of_examples_per_intent.keys()) == len(
            train_split.number_of_examples_per_intent.keys())
        # Every response class appears in both splits. (Fixed: the original
        # checked train_split twice and never covered test_split.)
        assert len(
            training_data.number_of_examples_per_response.keys()) == len(
                test_split.number_of_examples_per_response.keys())
        assert len(
            training_data.number_of_examples_per_response.keys()) == len(
                train_split.number_of_examples_per_response.keys())
Example #4
0
def test_fingerprint_is_different_when_lookup_table_has_changed(
    monkeypatch: MonkeyPatch,
):
    """A modified lookup table must be reflected in the data fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    lookup_files = [
        "data/test/lookup_tables/lookup_table.json",
    ]

    original = training_data_from_paths(lookup_files, language="en")
    original_fingerprint = original.fingerprint()

    # Pretend the lookup table on disk now contains different elements.
    monkeypatch.setattr(
        TrainingData,
        "_load_lookup_table",
        Mock(return_value={"name": "plates", "elements": "tacos\nbeef"}),
    )
    patched = training_data_from_paths(lookup_files, language="en")

    assert original_fingerprint != patched.fingerprint()
Example #5
0
def test_training_data_fingerprint_incorporates_tokens(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Tokenizing the training data must change its fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    data_files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    td = training_data_from_paths(data_files, language="en")
    fingerprint_before = td.fingerprint()

    whitespace_tokenizer.process_training_data(td)

    # The added tokens are part of the fingerprinted state.
    assert td.fingerprint() != fingerprint_before
Example #6
0
def test_train_test_split_with_random_seed(filepaths):
    """Identical random seeds must produce identical train/test splits."""
    from rasa.shared.importers.utils import training_data_from_paths

    data = training_data_from_paths(filepaths, language="en")

    first_train, first_test = data.train_test_split(train_frac=0.8, random_seed=1)
    second_train, second_test = data.train_test_split(train_frac=0.8, random_seed=1)

    def texts(examples):
        # Compare splits by their message texts, in order.
        return [message.get(TEXT) for message in examples]

    assert texts(first_train.intent_examples) == texts(second_train.intent_examples)
    assert texts(first_test.intent_examples) == texts(second_test.intent_examples)
Example #7
0
def test_training_data_fingerprint_incorporates_features():
    """Adding features to an example must change the data fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    data_files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    td = training_data_from_paths(data_files, language="en")
    fingerprint_before = td.fingerprint()

    dense_features = Features(
        np.random.random((128, 128)), FEATURE_TYPE_SENTENCE, TEXT, "RegexFeaturizer"
    )
    td.training_examples[0].add_features(dense_features)

    # The attached features are part of the fingerprinted state.
    assert td.fingerprint() != fingerprint_before
Example #8
0
def test_demo_data_filter_out_retrieval_intents(files):
    """Filtering by retrieval-intent key partitions examples without mutation."""
    from rasa.shared.importers.utils import training_data_from_paths

    td = training_data_from_paths(files, language="en")
    assert len(td.training_examples) == 46

    without_retrieval = td.filter_training_examples(
        lambda example: example.get(INTENT_RESPONSE_KEY) is None
    )
    assert len(without_retrieval.training_examples) == 42

    only_retrieval = td.filter_training_examples(
        lambda example: example.get(INTENT_RESPONSE_KEY) is not None
    )
    assert len(only_retrieval.training_examples) == 4

    # Filtering returns new objects; the source data is untouched.
    assert len(td.training_examples) == 46
Example #9
0
 async def get_nlu_data(self, languages=True) -> Dict[Text, TrainingData]:
     """Load NLU training data per language.

     Args:
         languages: a single language code (returns that language's data
             directly), a list of codes, or any non-list value (all
             languages from ``self.nlu_config`` are loaded).

     Returns:
         A dict mapping language code to ``TrainingData``, or a single
         ``TrainingData`` when ``languages`` was passed as a string.
     """
     requested_language = None
     if isinstance(languages, str):
         requested_language = languages
         languages = [requested_language]
     if not isinstance(languages, list):
         languages = self.nlu_config.keys()

     data_by_language = {}
     for lang in languages:
         try:
             data_by_language[lang] = utils.training_data_from_paths(
                 self.path_for_nlu_lang(lang),
                 lang,
             )
         except ValueError as err:
             # Unknown-format files yield empty training data.
             # NOTE(review): a ValueError with any other message is silently
             # swallowed and the language is omitted from the result —
             # confirm this is intended.
             if str(err).startswith("Unknown data format"):
                 data_by_language[lang] = TrainingData()

     if requested_language:
         return data_by_language.get(requested_language, TrainingData())
     return data_by_language
Example #10
0
def test_train_test_split(filepaths: List[Text]):
    """An 80/20 split keeps all examples and covers every class in each half."""
    from rasa.shared.importers.utils import training_data_from_paths

    td = training_data_from_paths(filepaths, language="en")

    assert td.intents == {
        "affirm",
        "greet",
        "restaurant_search",
        "goodbye",
        "chitchat",
    }
    assert td.entities == {"location", "cuisine"}
    assert set(td.responses.keys()) == {
        "utter_chitchat/ask_name",
        "utter_chitchat/ask_weather",
    }

    # Hand-counted totals for the demo files.
    assert len(td.training_examples) == 46
    assert len(td.intent_examples) == 46
    assert len(td.response_examples) == 4

    train, test = td.train_test_split(train_frac=0.8)

    # No examples are lost or duplicated by the split.
    assert len(test.training_examples) + len(train.training_examples) == 46
    assert len(train.training_examples) == 34
    assert len(test.training_examples) == 12

    # Every intent and response class is represented in both halves.
    intent_class_count = len(td.number_of_examples_per_intent.keys())
    response_class_count = len(td.number_of_examples_per_response.keys())
    assert intent_class_count == len(test.number_of_examples_per_intent.keys())
    assert intent_class_count == len(train.number_of_examples_per_intent.keys())
    assert response_class_count == len(test.number_of_examples_per_response.keys())
    assert response_class_count == len(train.number_of_examples_per_response.keys())
Example #11
0
File: rasa.py  Project: ChenHuaYou/rasa
 def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
     """Retrieves NLU training data (see parent class for full docstring)."""
     # Delegate loading of the configured NLU files to the shared helper.
     nlu_data = utils.training_data_from_paths(self._nlu_files, language)
     return nlu_data
Example #12
0
File: rasa.py  Project: tomasmadeira/Rasa-x
 async def get_nlu_data(self,
                        language: Optional[Text] = "en") -> TrainingData:
     """Load the configured NLU training files for the given language."""
     training_data = utils.training_data_from_paths(self._nlu_files, language)
     return training_data