def test_reader_original_span_test(self, value):
    """Check that spans in the replaced text map back to the original text.

    Applies three fixed replacement operations to the input file, verifies
    the resulting pack text, then asserts that ``get_original_span``
    recovers the expected original span for the parametrized input span
    and mode.
    """
    span_ops, output = (
        [
            (Span(11, 19), "New"),
            (Span(19, 20), " Shiny "),
            (Span(25, 25), " Ends"),
        ],
        "<title>The New Shiny Title Ends </title>",
    )
    input_span, expected_span, mode = value
    pipeline = Pipeline()
    reader = PlainTextReader()
    reader.text_replace_operation = lambda _: span_ops
    pipeline.set_reader(reader, {"file_ext": ".html"})
    pipeline.initialize()
    pack = pipeline.process_one(self.test_dir)
    self.assertEqual(pack.text, output)
    output_span = pack.get_original_span(input_span, mode)
    self.assertEqual(
        output_span,
        expected_span,
        # Fix: interpolate begin/end individually. The original formatted
        # a (begin, end) tuple inside literal parens, rendering messages
        # like "Expected: ((11, 19)" with doubled/unbalanced parentheses.
        f"Expected: ({expected_span.begin}, {expected_span.end})"
        f", Found: ({output_span.begin}, {output_span.end})"
        f" when Input: ({input_span.begin}, {input_span.end})"
        f" and Mode: {mode}",
    )
def test_reader_replace_back_test(self, value):
    # Apply the parametrized replacement operations, check the modified
    # text, then confirm get_original_text() round-trips to the input.
    replace_ops, replaced_text = value
    text_reader = PlainTextReader()
    text_reader.text_replace_operation = lambda _: replace_ops
    first_pack = list(text_reader.parse_pack(self.file_path))[0]
    self.assertEqual(first_pack.text, replaced_text)
    recovered_text = first_pack.get_original_text()
    self.assertEqual(self.orig_text, recovered_text)
def test_reader_replace_error_test(self, value):
    # Malformed span replacement operations must surface as a ValueError
    # when the pipeline processes the test directory.
    bad_span_ops, output = value
    pl = Pipeline()
    text_reader = PlainTextReader()
    text_reader.text_replace_operation = lambda _: bad_span_ops
    pl.set_reader(text_reader, {"file_ext": ".html"})
    pl.initialize()
    with self.assertRaises(ValueError):
        pl.process(self.test_dir)
def test_reader_replace_error_test(self, value):
    """Reading with invalid span replacements must raise ``ValueError``.

    The hand-rolled try/except/else construction is replaced with the
    standard ``assertRaises`` context manager: it still fails when no
    exception is raised, and an unexpected exception now propagates with
    its full traceback instead of being reduced to a bare fail message.
    """
    span_ops, output = value
    reader = PlainTextReader()
    reader.text_replace_operation = lambda _: span_ops
    with self.assertRaises(ValueError):
        list(reader.parse_pack(self.file_path))[0]
def test_reader_replace_back_test(self, value):
    # Run the parametrized replacements through a full pipeline, then
    # verify the pack can reconstruct the unmodified original text.
    replace_ops, replaced_text = value
    pl = Pipeline()
    text_reader = PlainTextReader()
    text_reader.text_replace_operation = lambda _: replace_ops
    pl.set_reader(text_reader, {"file_ext": ".html"})
    pl.initialize()
    data_pack: DataPack = pl.process_one(self.test_dir)
    self.assertEqual(data_pack.text, replaced_text)
    recovered_text = data_pack.get_original_text()
    self.assertEqual(self.orig_text, recovered_text)
def test_reader_original_span_test(self, value):
    """Check span back-mapping through ``parse_pack`` replacements.

    Applies three fixed replacement operations, verifies the modified
    text, then asserts ``get_original_span`` returns the expected span
    for the parametrized input span and mode.
    """
    span_ops, output = (
        [(Span(11, 19), 'New'),
         (Span(19, 20), ' Shiny '),
         (Span(25, 25), ' Ends')],
        '<title>The New Shiny Title Ends </title>')
    input_span, expected_span, mode = value
    reader = PlainTextReader()
    reader.text_replace_operation = lambda _: span_ops
    pack = list(reader.parse_pack(self.file_path))[0]
    self.assertEqual(pack.text, output)
    output_span = pack.get_original_span(input_span, mode)
    self.assertEqual(
        output_span,
        expected_span,
        # Fix: interpolate begin/end individually. The original formatted
        # a (begin, end) tuple inside literal parens, rendering messages
        # like "Expected: ((11, 19)" with doubled/unbalanced parentheses.
        f"Expected: ({expected_span.begin}, {expected_span.end})"
        f", Found: ({output_span.begin}, {output_span.end})"
        f" when Input: ({input_span.begin}, {input_span.end})"
        f" and Mode: {mode}")
def test_reader_no_replace_test(self):
    # With no replacement operation configured, the processed pack text
    # must be byte-identical to the original file content.
    pl = Pipeline()
    pl.set_reader(PlainTextReader(), {"file_ext": ".html"})
    pl.initialize()
    data_pack = pl.process_one(self.test_dir)
    self.assertEqual(data_pack.text, self.orig_text)
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    """Build a sentence/token/POS/NER/SRL pipeline and print the results
    for every document under ``dataset_dir``, pausing between documents.
    """
    pipeline = Pipeline()
    pipeline.set_reader(PlainTextReader())
    pipeline.add_processor(NLTKSentenceSegmenter())
    pipeline.add_processor(NLTKWordTokenizer())
    pipeline.add_processor(NLTKPOSTagger())

    # NER resources are loaded from a pickle file under the model path.
    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pipeline.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams(
        {'storage_path': srl_model_path},
        SRLPredictor.default_hparams())
    pipeline.add_processor(SRLPredictor(), srl_configs)

    pipeline.initialize()

    for pack in pipeline.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            print(colored("Sentence:", 'red'), sentence.text, "\n")
            # First way to fetch entries: iterate within a sentence scope.
            token_info = [(tok.text, tok.pos)
                          for tok in pack.get(Token, sentence)]
            mention_info = [(mention.text, mention.ner_type)
                            for mention in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), token_info, "\n")
            print(colored("EntityMentions:", 'red'), mention_info, "\n")
            # Second way: follow links from predicates to their arguments.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                arg_entities = [ent.text
                                for ent in pack.get(EntityMention, child)]
                print(" Entities in predicate argument:", arg_entities, "\n")
            print()
        input(colored("Press ENTER to continue...\n", 'green'))
def main(dataset_dir: str):
    """Build a sentence/token/POS/NER/SRL pipeline from ``config.yml`` and
    print the annotations for every document under ``dataset_dir``,
    pausing between documents.
    """
    # Fix: the original leaked the config file handle via
    # yaml.safe_load(open("config.yml", "r")); close it deterministically.
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", "red"), pack.pack_name)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", "red"), sent_text, "\n")
            # First method to get entries in a sentence: scoped iteration.
            tokens = [
                (token.text, token.pos) for token in pack.get(Token, sentence)
            ]
            entities = [
                (entity.text, entity.ner_type)
                for entity in pack.get(EntityMention, sentence)
            ]
            print(colored("Tokens:", "red"), tokens, "\n")
            print(colored("EntityMentions:", "red"), entities, "\n")
            # Second method: follow predicate links to their arguments.
            print(colored("Semantic role labels:", "red"))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(
                    f' - "{child.text}" is role {link.arg_type} of '
                    f'predicate "{parent.text}"'
                )
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()
        input(colored("Press ENTER to continue...\n", "green"))
def test_two_batch_processors(self, batch_size):
    # Two chained batch processors with batch sizes B and 2*B should be
    # invoked ceil(n / B) and ceil(n / 2B) times for n sentences.
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(PlainTextReader())
    proc_small = DummyFixedSizeBatchProcessor()
    proc_large = DummyFixedSizeBatchProcessor()
    pipeline.add(PeriodSentenceSplitter())
    small_cfg = {
        "batcher": {
            "batch_size": batch_size,
            "context_type": "ft.onto.base_ontology.Sentence",
        }
    }
    large_cfg = {
        "batcher": {
            "batch_size": 2 * batch_size,
            "context_type": "ft.onto.base_ontology.Sentence",
        }
    }
    pipeline.add(proc_small, config=small_cfg)
    pipeline.add(proc_large, config=large_cfg)
    pipeline.initialize()
    pack = pipeline.process(os.path.join(data_samples_root, "random_texts"))
    num_sentences = len(list(pack.get(Sentence)))
    # Ceil division spelled as floor division plus a remainder flag.
    self.assertEqual(
        proc_small.counter,
        num_sentences // batch_size + (num_sentences % batch_size > 0),
    )
    self.assertEqual(
        proc_large.counter,
        num_sentences // (2 * batch_size)
        + (num_sentences % (2 * batch_size) > 0),
    )
def test_two_batch_processors(self, batch_size):
    # Each batch processor should fire ceil(num_sentences / batch_size)
    # times when chained in a single pipeline.
    pipeline = Pipeline()
    pipeline.set_reader(PlainTextReader())
    small_proc = DummmyFixedSizeBatchProcessor()
    large_proc = DummmyFixedSizeBatchProcessor()
    pipeline.add_processor(NLTKSentenceSegmenter())
    pipeline.add_processor(
        small_proc, config={"batcher": {"batch_size": batch_size}})
    pipeline.add_processor(
        large_proc, config={"batcher": {"batch_size": 2 * batch_size}})
    pipeline.initialize()
    pack = pipeline.process("data_samples/random_texts")
    num_sentences = len(list(pack.get(Sentence)))
    # Ceil division spelled as floor division plus a remainder flag.
    self.assertEqual(
        small_proc.counter,
        num_sentences // batch_size + (num_sentences % batch_size > 0))
    self.assertEqual(
        large_proc.counter,
        num_sentences // (2 * batch_size)
        + (num_sentences % (2 * batch_size) > 0))
def test_reader_no_replace_test(self):
    # Without replacement operations, the parsed pack must carry the
    # original file content verbatim.
    text_reader = PlainTextReader()
    # Register this reader as the pack input source before parsing.
    PackManager().set_input_source(text_reader.component_name)
    first_pack = list(text_reader.parse_pack(self.file_path))[0]
    self.assertEqual(first_pack.text, self.orig_text)
def test_reader_no_replace_test(self):
    # A reader with no replace operations yields the file text unchanged.
    reader = PlainTextReader()
    first_pack = list(reader.parse_pack(self.file_path))[0]
    self.assertEqual(first_pack.text, self.orig_text)