示例#1
0
 def __init__(self, config, **shared):
     """Resolve the unit configuration and pick up the shared components

     Args:
         config: one of ``None`` (the value returned by
             ``self.default_config()`` is used), a
             :class:`ProcessingUnitConfig` instance (kept as is) or a
             dict (converted through ``self.config_type.from_dict``)
         **shared: optional shared objects, read from the
             BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, RESOURCES and
             RANDOM_STATE keys; a missing key is read as ``None``

     Raises:
         ValueError: when *config* is none of the supported types
     """
     if config is None:
         unit_config = self.default_config()
     elif isinstance(config, ProcessingUnitConfig):
         unit_config = config
     elif isinstance(config, dict):
         unit_config = self.config_type.from_dict(config)
     else:
         raise ValueError("Unexpected config type: %s" % type(config))
     self.config = unit_config

     # The resolved config is tagged with the name of the owning unit
     if self.config is not None:
         self.config.set_unit_name(self.unit_name)

     fetch = shared.get
     self.builtin_entity_parser = fetch(BUILTIN_ENTITY_PARSER)
     self.custom_entity_parser = fetch(CUSTOM_ENTITY_PARSER)
     self.resources = fetch(RESOURCES)
     self.random_state = check_random_state(fetch(RANDOM_STATE))
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        The dataset is validated and formatted first, then resources and
        entity parsers are loaded/fitted if needed before the training data
        is built.

        Training stops early, leaving the classifier untouched, in two
        cases: when the built intent list holds at most one entry, and when
        fitting the featurizer raises ``_EmptyDatasetUtterancesError`` (the
        featurizer is then reset to ``None``).

        Args:
            dataset (dict): the dataset to train on

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]
        rng = check_random_state(self.config.random_seed)

        training_data = build_training_data(
            dataset, language, self.config.data_augmentation_config,
            self.resources, rng)
        utterances, classes, intent_list = training_data

        self.intent_list = intent_list
        if len(self.intent_list) < 2:
            return self

        featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources
        )
        self.featurizer = featurizer
        featurizer.language = language

        # The highest class value is handed to the featurizer as the
        # "none" class
        none_class = max(classes)
        try:
            self.featurizer = self.featurizer.fit(
                dataset, utterances, classes, none_class)
        except _EmptyDatasetUtterancesError:
            self.featurizer = None
            return self

        features_matrix = self.featurizer.transform(utterances)
        regularization = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(
            random_state=rng, alpha=regularization, **LOG_REG_ARGS)
        self.classifier.fit(features_matrix, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
    def compute_features(self, tokens, drop_out=False):
        """Computes one feature mapping per provided token

        Every feature of ``self.features`` is computed at each token
        position; a feature whose computed value is ``None`` is left out of
        the mapping for that token.

        When *drop_out* is truthy, a feature is skipped at a given position
        if a random draw falls below the feature's own drop out ratio. This
        should only be used during training.

        Args:
            tokens: the tokens to compute features on
            drop_out (bool): whether to activate drop out

        Returns:
            list: one ``UnupdatableDict`` per token, keyed by feature name
        """
        cache = [{TOKEN_NAME: tok} for tok in tokens]
        per_token_features = []
        # NOTE(review): a random state is rebuilt from the configured seed
        # on every call; with a fixed seed this presumably replays the same
        # draws on each call -- confirm this is intended
        rng = check_random_state(self.config.random_seed)
        for position in range(len(tokens)):
            computed = UnupdatableDict()
            for feature in self.features:
                ratio = feature.drop_out
                # The random draw only happens when drop out is requested
                dropped = drop_out and rng.rand() < ratio
                if dropped:
                    continue
                feature_value = feature.compute(position, cache)
                if feature_value is None:
                    continue
                computed[feature.name] = feature_value
            per_token_features.append(computed)
        return per_token_features
示例#4
0
def train(dataset_path,
          output_path,
          config_path=None,
          verbose=False,
          random_seed=None):
    """Train an NLU engine on the provided dataset

    Args:
        dataset_path: path of the JSON dataset file, read as UTF-8
        output_path: location handed to ``engine.persist``
        config_path: optional path of a JSON config file, read as UTF-8;
            when omitted the engine is created with a ``None`` config
        verbose: a value equal to 1 sets the NLU logger to INFO, a value
            of 2 or more sets it to DEBUG, anything else leaves the logger
            untouched
        random_seed: value passed to ``check_random_state`` to build the
            random state given to the engine
    """
    import json
    import logging
    from pathlib import Path

    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import check_random_state

    def _read_json(path):
        # Both the dataset and the config are UTF-8 encoded JSON files
        with Path(path).open("r", encoding="utf8") as stream:
            return json.load(stream)

    log_level = None
    if verbose == 1:
        log_level = logging.INFO
    elif verbose >= 2:
        log_level = logging.DEBUG
    if log_level is not None:
        set_nlu_logger(log_level)

    dataset = _read_json(dataset_path)
    config = None if config_path is None else _read_json(config_path)

    random_state = check_random_state(random_seed)

    print("Create and train the engine...")
    untrained_engine = SnipsNLUEngine(config, random_state=random_state)
    engine = untrained_engine.fit(dataset)

    print("Persisting the engine...")
    engine.persist(output_path)

    print("Saved the trained engine to %s" % output_path)
    def fit(self, dataset, intent):
        """Fits the slot filler

        The dataset is validated and formatted, resources and entity
        parsers are loaded/fitted if needed and then shared with every
        feature factory. If the intent has no slot name mapping, the
        method returns right away without training the CRF.

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): The specific intent of the dataset to train
                the slot filler on

        Returns:
            :class:`CRFSlotFiller`: The same instance, trained
        """
        logger.debug("Fitting %s slot filler...", intent)
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        # Hand the shared components over to the feature factories
        for feature_factory in self.features_factories:
            feature_factory.custom_entity_parser = self.custom_entity_parser
            feature_factory.builtin_entity_parser = self.builtin_entity_parser
            feature_factory.resources = self.resources

        self.language = dataset[LANGUAGE]
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)

        # No need to train the CRF if the intent has no slots
        if not self.slot_name_mapping:
            return self

        rng = check_random_state(self.config.random_seed)
        augmentation_kwargs = self.config.data_augmentation_config.to_dict()
        augmented_utterances = augment_utterances(
            dataset,
            self.intent,
            language=self.language,
            resources=self.resources,
            random_state=rng,
            **augmentation_kwargs)

        samples = []
        for utterance in augmented_utterances:
            samples.append(utterance_to_sample(
                utterance[DATA], self.config.tagging_scheme, self.language))

        for feature_factory in self.features_factories:
            feature_factory.fit(dataset, intent)

        # Features are computed with drop out active since this is training
        features_per_sample = [
            self.compute_features(sample[TOKENS], drop_out=True)
            for sample in samples
        ]
        tags_per_sample = [list(sample[TAGS]) for sample in samples]

        # Ensure that the inputs are safe and that the OUTSIDE label is
        # learnt to avoid segfault at inference time
        features_per_sample, tags_per_sample = _ensure_safe(
            features_per_sample, tags_per_sample)

        # ensure ascii tags
        tags_per_sample = [
            [_encode_tag(tag) for tag in sample_tags]
            for sample_tags in tags_per_sample
        ]

        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(features_per_sample, tags_per_sample)

        logger.debug("Most relevant features for %s:\n%s", self.intent,
                     DifferedLoggingMessage(self.log_weights))
        return self
示例#6
0
 def __init__(self, factory_config, **shared):
     """Store the factory configuration and the shared components

     Args:
         factory_config: the configuration of this factory, stored as is
         **shared: optional shared objects, read from the RESOURCES,
             BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER and RANDOM_STATE
             keys; a missing key is read as ``None``
     """
     fetch = shared.get
     self.factory_config = factory_config
     self.resources = fetch(RESOURCES)
     self.builtin_entity_parser = fetch(BUILTIN_ENTITY_PARSER)
     self.custom_entity_parser = fetch(CUSTOM_ENTITY_PARSER)
     # The shared random state entry goes through check_random_state
     self.random_state = check_random_state(fetch(RANDOM_STATE))