def test_pipeline1(self, batch_size):
    """Tests a chain of Batch->Pack->Batch with different batch sizes."""
    data_path = data_samples_root + "/random_texts/0.txt"

    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(SentenceReader())
    pipeline.initialize()

    # Build the vocabulary for the text extractor from the dataset.
    text_extractor = AttributeExtractor({
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    })
    for pack in pipeline.process_dataset(data_path):
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)

    model = DummyModel()
    predictor = DummyPredictor()
    predictor_config = {
        "scope": Sentence,
        "batch_size": batch_size,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT
            },
        },
    }
    predictor.load(model)

    # Assemble the prediction pipeline: reader -> predictor -> evaluator.
    nlp = Pipeline[DataPack]()
    reader = SentenceReader()
    nlp.set_reader(reader)
    nlp.add(predictor, config=predictor_config)
    nlp.add(DummyEvaluator())
    nlp.initialize()

    num_packs = 0
    for _ in nlp.process_dataset(data_path):
        num_packs += 1

    # Check that all packs are yielded.
    self.assertEqual(num_packs, reader.count)
def test_FixedSizeDataPackBatcherWithExtractor(self):
    r"""This function tests the correctness of cross_pack."""
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    text_extractor = AttributeExtractor({
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    })

    # Build the vocabulary and count the packs produced by the reader.
    pack_num = 0
    for pack in pipeline.process_dataset(self.dataset_path):
        pack_num += 1
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)
    self.assertEqual(pack_num, 2)

    batch_size = 2
    batcher = FixedSizeDataPackBatcherWithExtractor(cross_pack=True)
    batcher.initialize({
        "scope": Sentence,
        "batch_size": batch_size,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT
            }
        },
    })

    # With cross_pack enabled, sentences from both packs can be combined
    # into a single batch.
    batch_num = 0
    for pack in pipeline.process_dataset(self.dataset_path):
        for batch in batcher.get_batch(pack, Sentence, None):
            batch_num += 1
            self.assertEqual(len(batch[0]), batch_size)
    for _ in batcher.flush():
        batch_num += 1
    self.assertEqual(batch_num, 1)
def test_AttributeExtractor(self):
    pipeline = Pipeline[DataPack]()
    reader = CoNLL03Reader()
    pipeline.set_reader(reader)
    pipeline.initialize()

    config = {
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    }
    extractor = AttributeExtractor(config)

    sentence = "The European Commission said on Thursday it disagreed "\
               "with German advice to consumers to shun British lamb "\
               "until scientists determine whether mad cow disease "\
               "can be transmitted to sheep ."

    # Check update_vocab.
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            extractor.update_vocab(pack, instance)

    # Check extract.
    for pack in pipeline.process_dataset(self.dataset_path):
        features = []
        for instance in pack.get(Sentence):
            features.append(extractor.extract(pack, instance))

        for feat in features:
            recovered = [extractor.id2element(idx) for idx in feat.data[0]]
            self.assertEqual(" ".join(recovered), sentence)

    # Check add_to_pack and pre_evaluation_action.
    # Vocab_method is indexing, therefore the id of an element
    # is the same as its repr.
    extractor.config.attribute = "pos"
    extractor.add("TMP")
    fake_pos_ids = [
        extractor.element2repr("TMP")
        for _ in range(len(sentence.split(" ")))
    ]
    # After pre_evaluation_action, the attribute value will
    # become None. Since vocab_use_unk is true, None will be
    # mapped to <UNK>.
    unk_pos_ids = [
        extractor.element2repr(None)
        for _ in range(len(sentence.split(" ")))
    ]

    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            extractor.add_to_pack(pack, instance, fake_pos_ids)

        for instance in pack.get(Sentence):
            feat = extractor.extract(pack, instance)
            self.assertEqual(feat.data[0], fake_pos_ids)

        for instance in pack.get(Sentence):
            extractor.pre_evaluation_action(pack, instance)
            feat = extractor.extract(pack, instance)
            self.assertEqual(feat.data[0], unk_pos_ids)

    # Check state and from_state.
    new_extractor = pkl.loads(pkl.dumps(extractor))
    self.assertEqual(
        new_extractor.config.attribute, extractor.config.attribute)
def test_Predictor(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    text_extractor = AttributeExtractor({
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    })
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)

    ner_extractor = BioSeqTaggingExtractor({
        "entry_type": EntityMention,
        "need_pad": True,
        "attribute": "ner_type",
        "tagging_unit": Token,
    })
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ner_extractor.update_vocab(pack, instance)

    # Expected NER types when the dummy model predicts FAKEOUTPUT
    # for every token.
    expected_ners = [
        ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)]

    model = DummyModel()
    predictor_pipeline = Pipeline[DataPack]()
    predictor_pipeline.set_reader(CoNLL03Reader())

    predictor_config = {
        "scope": Sentence,
        "batch_size": 2,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT
            },
            "ner_tag": {
                "extractor": ner_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_OUTPUT
            },
        },
    }

    # dummy = DummyRelationExtractor()
    # config = {"batcher": {"batch_size": 5}}
    predictor = NERPredictor()
    predictor.load(model)
    predictor_pipeline.add(predictor, predictor_config)
    # predictor_pipeline.add(dummy, config)
    predictor_pipeline.add(CoNLLNEREvaluator())
    predictor_pipeline.initialize()

    for pack in predictor_pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ners = [e.ner_type
                    for e in list(pack.get(EntityMention, instance))]
            self.assertListEqual(ners, expected_ners)
def setUp(self):
    self.config = {
        "max_char_length": 45,
        "train_path": "data_samples/train_pipeline_test",
        "val_path": "data_samples/train_pipeline_test",
        "num_epochs": 1,
        "batch_size_tokens": 5,
        "learning_rate": 0.01,
        "momentum": 0.9,
        "nesterov": True
    }

    text_extractor: AttributeExtractor = AttributeExtractor(
        config={
            "entry_type": Token,
            "vocab_method": "indexing",
            "attribute": "text"
        })

    char_extractor: CharExtractor = CharExtractor(
        config={
            "entry_type": Token,
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"]
        })

    # Add the output part of the request, which depends on the task type.
    ner_extractor: BioSeqTaggingExtractor = BioSeqTaggingExtractor(
        config={
            "entry_type": EntityMention,
            "attribute": "ner_type",
            "tagging_unit": Token,
            "vocab_method": "indexing"
        })

    self.tp_request = {
        "scope": Sentence,
        "schemes": {
            "text_tag": {
                "type": TrainPreprocessor.DATA_INPUT,
                "extractor": text_extractor
            },
            "char_tag": {
                "type": TrainPreprocessor.DATA_INPUT,
                "extractor": char_extractor
            },
            "ner_tag": {
                "type": TrainPreprocessor.DATA_OUTPUT,
                "extractor": ner_extractor
            }
        }
    }

    self.reader = CoNLL03Reader()
    self.evaluator = CoNLLNEREvaluator()

    self.tp_config = {
        "dataset": {
            "batch_size": self.config["batch_size_tokens"]
        }
    }

    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(self.reader)
    train_pl.initialize()
    pack_iterator: Iterator[PackType] = \
        train_pl.process_dataset(self.config["train_path"])

    self.train_preprocessor = TrainPreprocessor(
        pack_iterator=pack_iterator,
        request=self.tp_request,
        config=self.tp_config)