def test_init_with_processor(self):
    """An embedding built on an externally prepared processor embeds at the requested length."""
    x_data, y_data = SMP2018ECDTCorpus.load_data('valid')

    shared_processor = ClassificationProcessor()
    shared_processor.analyze_corpus(x_data, y_data)

    embed = self.embedding_class(sequence_length=20,
                                 processor=shared_processor,
                                 **self.config)
    embed.analyze_corpus(x_data, y_data)

    sample_tokens = ['我', '想', '看']
    assert embed.embed_one(sample_tokens).shape == (20, self.embedding_size)
def __init__(self, component_config=None, model=None):
    """Build the intent classifier component from its configuration.

    Reads hyper-parameters out of ``component_config`` and wires up a BERT
    embedding whose processor matches the (multi-)label setting.

    Args:
        component_config: component settings; keys read here are
            ``bert_model_path``, ``sequence_length``, ``layer_nums``,
            ``trainable``, ``use_cudnn_cell``, ``multi_label``,
            ``split_symbol`` and ``classifier_model``.
        model: optional pre-built classifier model to reuse.
    """
    # Py3 zero-argument super() — same MRO behaviour as the legacy two-arg form.
    super().__init__(component_config)

    cfg = self.component_config
    bert_model_path = cfg.get('bert_model_path')
    sequence_length = cfg.get('sequence_length')
    layer_nums = cfg.get('layer_nums')
    trainable = cfg.get('trainable')
    use_cudnn_cell = cfg.get('use_cudnn_cell')

    self.multi_label = cfg.get('multi_label')
    self.split_symbol = cfg.get('split_symbol')

    # Global toggle — set before any kashgari layers are instantiated.
    kashgari.config.use_cudnn_cell = use_cudnn_cell

    processor = ClassificationProcessor(multi_label=self.multi_label)
    self.classifier_model = cfg.get('classifier_model')
    self.bert_embedding = BERTEmbedding(bert_model_path,
                                        task=kashgari.CLASSIFICATION,
                                        layer_nums=layer_nums,
                                        trainable=trainable,
                                        processor=processor,
                                        sequence_length=sequence_length)
    self.tokenizer = self.bert_embedding.tokenizer
    self.model = model
def __init__(self,
             task: str = None,
             sequence_length: Union[int, str] = 'auto',
             embedding_size: int = 100,
             processor: Optional[BaseProcessor] = None,
             from_saved_model: bool = False):
    """Set up the embedding, choosing a default processor from ``task`` when none is given.

    Raises:
        ValueError: if no ``processor`` is supplied and ``task`` is not one of
            the known task constants.
    """
    self.task = task
    self.embedding_size = embedding_size

    if processor is not None:
        self.processor = processor
    else:
        # Default processor per task; unknown tasks are rejected below.
        default_processors = {
            kashgari.CLASSIFICATION: ClassificationProcessor,
            kashgari.LABELING: LabelingProcessor,
            kashgari.SCORING: ScoringProcessor,
        }
        processor_cls = default_processors.get(task)
        if processor_cls is None:
            raise ValueError(
                'Need to set the processor param, value: {labeling, classification, scoring}'
            )
        self.processor = processor_cls()

    self.sequence_length: Union[int, str] = sequence_length
    self.embed_model: Optional[keras.Model] = None
    self._tokenizer = None
def test_init_with_processor(self):
    """Embedding with an external processor honours per-class sizing quirks."""
    x_data, y_data = SMP2018ECDTCorpus.load_data('valid')

    shared_processor = ClassificationProcessor()
    shared_processor.analyze_corpus(x_data, y_data)

    # BareEmbedding is the only class whose output width is set via config here.
    if self.embedding_class is BareEmbedding:
        self.config['embedding_size'] = 55

    embed = self.embedding_class(sequence_length=20,
                                 processor=shared_processor,
                                 **self.config)

    # NOTE(review): BERT fixtures apparently clamp the sequence length to 16
    # regardless of the requested 20 — mirrors the original expectation.
    expected_len = 16 if self.embedding_class is BERTEmbedding else 20
    sample_tokens = ['我', '想', '看']
    assert embed.embed_one(sample_tokens).shape == (expected_len, embed.embedding_size)
def __init__(self, hyper_parameters):
    """Create a classification BERT embedding from a hyper-parameter dict.

    Args:
        hyper_parameters: nested dict; the ``'model'`` section must provide
            ``bert_model_path``, ``layer_nums`` and ``trainable``.
    """
    # NOTE(review): CuDNN cells are force-disabled here — presumably for
    # portability of the saved model; confirm before changing.
    kashgari.config.use_cudnn_cell = False

    model_cfg = hyper_parameters['model']
    processor = ClassificationProcessor(multi_label=False)
    self.bert_embedding = BERTEmbedding(
        model_cfg['bert_model_path'],
        task=kashgari.CLASSIFICATION,
        layer_nums=model_cfg['layer_nums'],
        trainable=model_cfg['trainable'],
        processor=processor,
        sequence_length='auto')
    # Removed a leftover debug print that reached into the private
    # ``_tokenizer._token_dict_inv`` attribute.
    self.tokenizer = self.bert_embedding.tokenizer
def test_multi_label_processor(self):
    """A multi-label processor builds a three-entry label vocabulary."""
    proc = ClassificationProcessor(multi_label=True)
    proc.analyze_corpus(sample_train_x, sample_train_y)
    assert len(proc.label2idx) == 3

    # Printed for visual inspection of the encoded datasets.
    print(proc.process_x_dataset(sample_train_x))
    print(proc.process_y_dataset(sample_train_y))
def test_multi_label(self):
    """End-to-end multi-label fit, evaluate and predict."""
    proc = ClassificationProcessor(multi_label=True)
    bare_embedding = BareEmbedding(task='classification', processor=proc)
    model = self.model_class(bare_embedding)

    model.fit(sample_train_x, sample_train_y, epochs=1)
    assert len(proc.label2idx) == 3

    model.evaluate(sample_eval_x, sample_eval_y)

    # Multi-label predictions come back as a tuple of labels per sample.
    first_prediction = model.predict(sample_eval_x)[0]
    assert isinstance(first_prediction, tuple)

    report = model.evaluate(sample_eval_x, sample_eval_y, output_dict=True)
    assert isinstance(report, dict)
def test_processor(self):
    """Label transforms survive a to_dict / load_data_object round trip."""
    x_set, y_set = TestMacros.load_classification_corpus()

    proc = ClassificationProcessor()
    proc.build_vocab(x_set, y_set)

    subset = y_set[20:40]
    encoded = proc.transform(subset)

    restored: ClassificationProcessor = load_data_object(proc.to_dict())
    assert (encoded == restored.transform(subset)).all()
    assert subset == restored.inverse_transform(encoded)
def test_multi_label(self):
    """Multi-label fit/evaluate/predict, then save and reload the model."""
    proc = ClassificationProcessor(multi_label=True)
    bare_embedding = BareEmbedding(task='classification', processor=proc)
    model = self.model_class(bare_embedding)

    model.fit(sample_train_x, sample_train_y, epochs=1)
    assert len(proc.label2idx) == 3

    model.evaluate(sample_eval_x, sample_eval_y)
    assert isinstance(model.predict(sample_eval_x)[0], tuple)

    report = model.evaluate(sample_eval_x, sample_eval_y, output_dict=True)
    assert isinstance(report, dict)

    # Predictions must be identical after a save/load round trip.
    original_preds = model.predict(valid_x[:20])
    save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    model.save(save_path)
    reloaded_model = kashgari.utils.load_model(save_path)
    assert original_preds == reloaded_model.predict(valid_x[:20])
def test_multi_label_processor(self):
    """Multi-label transforms round-trip through serialization (order-insensitive)."""
    from kashgari.corpus import JigsawToxicCommentCorpus

    corpus = JigsawToxicCommentCorpus(TestMacros.jigsaw_mini_corpus_path)
    x_set, y_set = corpus.load_data()

    proc = ClassificationProcessor(multi_label=True)
    proc.build_vocab_generator([CorpusGenerator(x_set, y_set)])

    subset = y_set[20:40]
    encoded = proc.transform(subset)

    restored: ClassificationProcessor = load_data_object(proc.to_dict())
    assert (encoded == restored.transform(subset)).all()

    # Multi-hot decoding does not preserve label order, so compare sorted.
    for expected_labels, decoded_labels in zip(subset, restored.inverse_transform(encoded)):
        assert sorted(expected_labels) == sorted(decoded_labels)
def setUpClass(cls):
    # Build one shared processor for the whole test class so corpus
    # analysis runs once per class rather than once per test.
    cls.processor = ClassificationProcessor()
    cls.processor.analyze_corpus(class_train_x, class_train_y)