def prepare_preprocessor(
    X, y,
    model_config: ModelConfig,
    features: Optional[List[List[List[str]]]] = None
):
    """Create a preprocessor for ``model_config`` and fit it on the data.

    Args:
        X: sequences of tokens to fit the text preprocessor on.
        y: corresponding label sequences.
        model_config: configuration used to select the preprocessor;
            note ``features_indices`` and ``features_map_to_index`` may be
            updated in-place after fitting the feature preprocessor.
        features: optional token-level feature values.

    Returns:
        The fitted preprocessor.
    """
    preprocessor = get_preprocessor(model_config, features=features)
    batch_text_list_iterable = iter_batch_text_list(
        X, features,
        additional_token_feature_indices=model_config.additional_token_feature_indices,
        text_feature_indices=model_config.text_feature_indices
    )
    if isinstance(preprocessor, WordPreprocessor):
        LOGGER.info('fitting preprocessor (faster)')
        faster_preprocessor_fit(preprocessor, batch_text_list_iterable, y)
    else:
        LOGGER.info('fitting preprocessor (default)')
        preprocessor.fit(batch_text_list_iterable, y)
    if model_config.use_features and features is not None:
        LOGGER.info('fitting features preprocessor')
        preprocessor.fit_features(features)
        fitted_features_indices = preprocessor.feature_preprocessor.features_indices
        if model_config.features_indices != fitted_features_indices:
            # fix: log the revised value; previously the stale
            # model_config.features_indices (about to be overwritten)
            # was logged, making the "revised" message misleading
            LOGGER.info('revised features_indices: %s', fitted_features_indices)
            model_config.features_indices = fitted_features_indices
        model_config.features_map_to_index = preprocessor.feature_preprocessor.features_map_to_index
    LOGGER.info('done fitting preprocessor')
    return preprocessor
 def test_should_set_props_for_BidLSTM_CRF_FEATURES(
         self, model_config: ModelConfig):
     """Implicit config update should enable feature props for this model."""
     model_config.model_type = BidLSTM_CRF_FEATURES.name
     # start with both flags off so the implicit update has to flip them
     model_config.use_features = False
     model_config.use_features_indices_input = False
     updated_implicit_model_config_props(model_config)
     assert model_config.use_features
     assert model_config.use_features_indices_input
Exemplo n.º 3
0
    def test_should_save_and_load_delft_preprocessor_from_json_or_pickle(
            self, temp_dir: Path):
        """Loading must work from either the json or the pickle file alone."""
        model_config = ModelConfig()
        preprocessor = DelftWordPreprocessor()
        preprocessor.fit(SAMPLE_X, SAMPLE_Y)
        dummy_model = DummyModel(model_config)
        saver = ModelSaver(preprocessor=preprocessor,
                           model_config=model_config)
        loader = ModelLoader()

        saver.save_to(str(temp_dir), dummy_model)

        json_path = temp_dir / saver.preprocessor_json_file
        pickle_path = temp_dir / saver.preprocessor_pickle_file
        assert json_path.exists()
        assert pickle_path.exists()

        expected_vars = get_normalized_vars_with_type(preprocessor)

        def _load_and_check():
            # loading must reproduce the fitted preprocessor state
            loaded = loader.load_preprocessor_from_directory(str(temp_dir))
            assert get_normalized_vars_with_type(loaded) == expected_vars

        # hide the json file so the loader has to fall back to the pickle
        json_path.rename(temp_dir / 'preprocessor_hidden.json')
        _load_and_check()

        # restore the json file and hide the pickle instead
        (temp_dir / 'preprocessor_hidden.json').rename(json_path)
        pickle_path.rename(temp_dir / 'preprocessor_hidden.pickle')
        _load_and_check()
 def test_should_use_default_preprocessor_if_not_using_features(self):
     """With use_features=False a plain delft preprocessor is created."""
     model_config = ModelConfig(use_features=False)
     created = get_preprocessor(
         model_config, features=[[TOKEN_FEATURES_1]])
     # a delft preprocessor, but not the feature-aware subclass
     assert isinstance(created, DelftWordPreprocessor)
     assert not isinstance(created, ScienceBeamPreprocessor)
     assert created.feature_preprocessor is None
 def test_should_create_preprocessor_with_feature_preprocessor(self):
     """use_features without indices input should add a feature preprocessor."""
     model_config = ModelConfig(
         use_features=True, use_features_indices_input=False)
     created = get_preprocessor(
         model_config, features=[[TOKEN_FEATURES_1]])
     assert isinstance(created, DelftWordPreprocessor)
     assert not isinstance(created, ScienceBeamPreprocessor)
     # the attached feature preprocessor is the sciencebeam implementation
     assert created.feature_preprocessor is not None
     assert isinstance(
         created.feature_preprocessor, ScienceBeamFeaturesPreprocessor)
Exemplo n.º 6
0
 def test_should_tag_tokenized_texts_with_exact_batch_size_if_stateful(
         self, model_mock: MagicMock, model_config: ModelConfig,
         preprocessor: WordPreprocessor):
     """A stateful tagger with an exact batch size should tag both texts."""
     model_config.stateful = True
     model_config.batch_size = 2
     tagger = Tagger(
         model=model_mock,
         model_config=model_config,
         preprocessor=preprocessor,
         max_sequence_length=2,
         input_window_stride=2)
     # have the mocked model predict tags via the token lookup table
     model_mock.predict_on_batch.side_effect = get_predict_on_batch_by_token_fn(
         DEFAULT_TAG_BY_TOKEN_MAP,
         preprocessor=preprocessor,
         batch_size=model_config.batch_size)
     actual_tag_result = tagger.tag(
         [[TOKEN_1, TOKEN_2, TOKEN_3], [TOKEN_2, TOKEN_3]],
         output_format=None)
     LOGGER.debug('tag_result: %s', actual_tag_result)
     expected_tag_result = [
         [(TOKEN_1, TAG_1), (TOKEN_2, TAG_2), (TOKEN_3, TAG_3)],
         [(TOKEN_2, TAG_2), (TOKEN_3, TAG_3)]
     ]
     assert actual_tag_result == expected_tag_result
def _model_config():
    """Return a small ModelConfig suitable for fast model-building tests."""
    config = ModelConfig(
        word_embedding_size=11,
        max_char_length=12,
        max_feature_size=15,
        dropout=0.5,
        recurrent_dropout=0.0
    )
    # these attributes are assigned after construction, as in the original
    for attr_name, attr_value in (
            ('char_vocab_size', 13),
            ('char_embedding_size', 14),
            ('num_word_lstm_units', 5),
            ('num_char_lstm_units', 6)):
        setattr(config, attr_name, attr_value)
    return config
Exemplo n.º 8
0
    def test_should_save_and_load_sciencebeam_preprocessor_from_json_or_pickle(
            self, temp_dir: Path):
        """Loading must work from either the json or the pickle file alone."""
        model_config = ModelConfig()
        feature_preprocessor = ScienceBeamFeaturesPreprocessor([0])
        preprocessor = ScienceBeamPreprocessor(
            feature_preprocessor=feature_preprocessor)
        preprocessor.fit(SAMPLE_X, SAMPLE_Y)
        preprocessor.fit_features(SAMPLE_X)
        dummy_model = DummyModel(model_config)
        saver = ModelSaver(preprocessor=preprocessor,
                           model_config=model_config)
        loader = ModelLoader()

        saver.save_to(str(temp_dir), dummy_model)

        json_path = temp_dir / saver.preprocessor_json_file
        pickle_path = temp_dir / saver.preprocessor_pickle_file
        assert json_path.exists()
        assert pickle_path.exists()

        LOGGER.debug('preprocessor_json: %s', json_path.read_text())

        expected_vars = get_normalized_vars_with_type(preprocessor)

        def _load_and_check():
            # loading must reproduce the fitted preprocessor state
            loaded = loader.load_preprocessor_from_directory(str(temp_dir))
            assert get_normalized_vars_with_type(loaded) == expected_vars

        # hide the json file so the loader has to fall back to the pickle
        json_path.rename(temp_dir / 'preprocessor_hidden.json')
        _load_and_check()

        # restore the json file and hide the pickle instead
        (temp_dir / 'preprocessor_hidden.json').rename(json_path)
        pickle_path.rename(temp_dir / 'preprocessor_hidden.pickle')
        _load_and_check()
 def test_should_be_able_to_build_without_word_embeddings(
         self, model_config: ModelConfig):
     """Building the model must succeed with word embeddings disabled."""
     model_config.use_word_embeddings = False
     # constructing the model without raising is the assertion here
     CustomBidLSTM_CRF(model_config, ntags=5)
 def test_should_be_able_to_build_model(self, model_config: ModelConfig):
     """Building BidLSTM_CRF_FEATURES with explicit feature props succeeds."""
     for attr_name, attr_value in (
             ('features_indices', [1, 2, 3]),
             ('features_vocabulary_size', 11),
             ('features_embedding_size', 12),
             ('features_lstm_units', 13)):
         setattr(model_config, attr_name, attr_value)
     # constructing the model without raising is the assertion here
     BidLSTM_CRF_FEATURES(model_config, ntags=5)
 def test_should_be_able_to_build_with_feature_embedding(
         self, model_config: ModelConfig):
     """Building with feature embeddings enabled must succeed."""
     model_config.use_features = True
     model_config.features_embedding_size = 11
     # constructing the model without raising is the assertion here
     CustomBidLSTM_CRF(model_config, ntags=5)
 def test_should_be_able_to_build_stateful_lstms(self,
                                                 model_config: ModelConfig):
     """Building with stateful LSTMs plus feature embeddings must succeed."""
     for attr_name, attr_value in (
             ('use_features', True),
             ('features_embedding_size', 11),
             ('stateful', True)):
         setattr(model_config, attr_name, attr_value)
     # constructing the model without raising is the assertion here
     CustomBidLSTM_CRF(model_config, ntags=5)
Exemplo n.º 13
0
 def _save_model_config(self, model_config: ModelConfig, filepath: str):
     """Serialise ``model_config`` to ``filepath``."""
     LOGGER.debug('model_config: %s', model_config)
     # open_file manages the underlying stream (supports remote paths)
     with open_file(filepath, 'w') as config_fp:
         model_config.save(config_fp)
     LOGGER.info('model config file saved to %s', filepath)
 def test_should_be_able_to_build_with_features(self,
                                                model_config: ModelConfig):
     """Building with features enabled must succeed."""
     model_config.use_features = True
     # constructing the model without raising is the assertion here
     CustomBidLSTM_CRF(model_config, ntags=5)
 def test_should_be_able_to_pass_in_feature_indices(self):
     """feature_indices should be readable via singular and plural names."""
     model_config = ModelConfig(feature_indices=FEATURE_INDICES_1)
     for attr_name in ('feature_indices', 'features_indices'):
         assert getattr(model_config, attr_name) == FEATURE_INDICES_1
 def test_should_be_able_to_pass_in_features_embedding_size(self):
     """features_embedding_size should be readable via both alias names."""
     model_config = ModelConfig(
         features_embedding_size=FEATURES_EMBEDDING_SIZE_1)
     for attr_name in ('feature_embedding_size', 'features_embedding_size'):
         assert getattr(model_config, attr_name) == FEATURES_EMBEDDING_SIZE_1
Exemplo n.º 17
0
 def load_model_config_from_file(self, filepath: str):
     """Load and return a ModelConfig from ``filepath``."""
     LOGGER.info('loading model config from %s', filepath)
     # open_file manages the underlying stream (supports remote paths)
     with open_file(filepath, 'r') as config_fp:
         loaded_config = ModelConfig.load(config_fp)
     return loaded_config
Exemplo n.º 18
0
def _model_config():
    """Return a minimal ModelConfig for tests."""
    test_config = ModelConfig(model_name='test_model', batch_size=1)
    return test_config
 def __init__(
         self, *args,
         use_features: bool = False,
         features_indices: List[int] = None,
         features_embedding_size: int = None,
         multiprocessing: bool = False,
         embedding_registry_path: str = None,
         embedding_manager: EmbeddingManager = None,
         config_props: dict = None,
         training_props: dict = None,
         max_sequence_length: int = None,
         input_window_stride: int = None,
         eval_max_sequence_length: int = None,
         eval_input_window_stride: int = None,
         batch_size: int = None,
         eval_batch_size: int = None,
         stateful: bool = None,
         transfer_learning_config: TransferLearningConfig = None,
         tag_transformed: bool = False,
         **kwargs):
     """Initialise the sequence model wrapper.

     Sets up the embedding manager, resolves defaults for batch size,
     sequence length, window stride and statefulness, delegates to the
     parent constructor, then rebuilds ``model_config`` and
     ``training_config`` with ``config_props`` / ``training_props``
     layered on top of the parent-created configs.

     NOTE(review): ``None`` keyword defaults are resolved to
     project-level defaults below; the statement order (defaults ->
     super().__init__ -> config rebuild) is load-bearing because the
     parent constructor creates the initial ``model_config`` and
     ``training_config`` that are re-read afterwards.
     """
     # initialise logging if not already initialised
     logging.basicConfig(level='INFO')
     LOGGER.debug('Sequence, args=%s, kwargs=%s', args, kwargs)
     self.embedding_registry_path = embedding_registry_path or DEFAULT_EMBEDDINGS_PATH
     # create an embedding manager if the caller did not supply one
     if embedding_manager is None:
         embedding_manager = EmbeddingManager(
             path=self.embedding_registry_path,
             download_manager=DownloadManager()
         )
     self.download_manager = embedding_manager.download_manager
     self.embedding_manager = embedding_manager
     self.embeddings: Optional[Embeddings] = None
     # fall back to project defaults for unset sizing parameters
     # (note: ``not`` also treats 0 as unset, matching the original logic)
     if not batch_size:
         batch_size = get_default_batch_size()
     if not max_sequence_length:
         max_sequence_length = get_default_max_sequence_length()
     self.max_sequence_length = max_sequence_length
     if not input_window_stride:
         input_window_stride = get_default_input_window_stride()
     self.input_window_stride = input_window_stride
     self.eval_max_sequence_length = eval_max_sequence_length
     self.eval_input_window_stride = eval_input_window_stride
     self.eval_batch_size = eval_batch_size
     self.model_path: Optional[str] = None
     if stateful is None:
         # use a stateful model, if supported
         stateful = get_default_stateful()
     self.stateful = stateful
     self.transfer_learning_config = transfer_learning_config
     self.dataset_transformer_factory = DummyDatasetTransformer
     self.tag_transformed = tag_transformed
     # the parent constructor creates the initial model/training configs
     super().__init__(
         *args,
         max_sequence_length=max_sequence_length,
         batch_size=batch_size,
         **kwargs
     )
     LOGGER.debug('use_features=%s', use_features)
     # rebuild model_config, layering config_props and the explicit
     # feature-related parameters over the parent-created config
     self.model_config: ModelConfig = ModelConfig(
         **{  # type: ignore
             **vars(self.model_config),
             **(config_props or {}),
             'features_indices': features_indices,
             'features_embedding_size': features_embedding_size
         },
         use_features=use_features
     )
     self.update_model_config_word_embedding_size()
     updated_implicit_model_config_props(self.model_config)
     self.update_dataset_transformer_factor()
     # rebuild training_config with any extra training properties merged in
     self.training_config: TrainingConfig = TrainingConfig(
         **vars(cast(DelftTrainingConfig, self.training_config)),
         **(training_props or {})
     )
     LOGGER.info('training_config: %s', vars(self.training_config))
     self.multiprocessing = multiprocessing
     self.tag_debug_reporter = get_tag_debug_reporter_if_enabled()
     self._load_exception = None
     self.p: Optional[WordPreprocessor] = None
     self.model: Optional[BaseModel] = None
     self.models: List[BaseModel] = []