    def test_reader_original_span_test(self, value):
        span_ops, output = (
            [
                (Span(11, 19), "New"),
                (Span(19, 20), " Shiny "),
                (Span(25, 25), " Ends"),
            ],
            "<title>The New Shiny Title Ends </title>",
        )
        input_span, expected_span, mode = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)

        self.assertEqual(pack.text, output)

        output_span = pack.get_original_span(input_span, mode)
        self.assertEqual(
            output_span,
            expected_span,
            f"Expected: ({expected_span.begin, expected_span.end}"
            f"), Found: ({output_span.begin, output_span.end})"
            f" when Input: ({input_span.begin, input_span.end})"
            f" and Mode: {mode}",
        )
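Note on the fixture (inferred from the span arithmetic above, not stated in the snippet): the three operations only yield the expected output if the source file contains a 34-character title line. A minimal sketch:

# Hypothetical original text implied by the span ops above:
orig_text = "<title>The Original Title </title>"
# Span(11, 19) -> "New"     replaces "Original" (chars 11-18)
# Span(19, 20) -> " Shiny " replaces the space that follows it
# Span(25, 25) -> " Ends"   is zero-width: it inserts before " </title>"
# Result: "<title>The New Shiny Title Ends </title>"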
Example #2
    def test_reader_replace_back_test(self, value):
        # Reading with replacements - replacing a span and changing it back
        span_ops, output = value
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pack = list(reader.parse_pack(self.file_path))[0]
        self.assertEqual(pack.text, output)

        orig_text_from_pack = pack.get_original_text()
        self.assertEqual(self.orig_text, orig_text_from_pack)
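Example #3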
    def test_reader_replace_error_test(self, value):
        # Read with errors in span replacements
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        with self.assertRaises(ValueError):
            pipeline.process(self.test_dir)
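The failing span operations themselves are not shown here. A hedged sketch of the kind of input that should trigger the ValueError (these lists are assumptions, not taken from the source):

# Hypothetical invalid replacement operations:
bad_overlap = [(Span(4, 8), "a"), (Span(6, 10), "b")]  # spans overlap
bad_bounds = [(Span(30, 100), "x")]                     # span exceeds the text
# Returning either list from text_replace_operation should make
# pipeline.process(self.test_dir) raise ValueError.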
Example #4
    def test_reader_replace_error_test(self, value):
        # Read with errors in span replacements
        span_ops, output = value
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        try:
            list(reader.parse_pack(self.file_path))[0]
        except ValueError:
            pass
        except Exception as e:
            self.fail(f"Unexpected exception raised: {e}")
        else:
            self.fail("Expected ValueError was not raised")
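Example #5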
    def test_reader_replace_back_test(self, value):
        # Reading with replacements - replacing a span and changing it back
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack: DataPack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, output)

        orig_text_from_pack = pack.get_original_text()
        self.assertEqual(self.orig_text, orig_text_from_pack)
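A plausible parameterization for this test, consistent with the original text inferred earlier (an assumption, not from the source):

# Hypothetical (span_ops, output) value:
value = (
    [(Span(11, 19), "New")],          # replace "Original" with "New"
    "<title>The New Title </title>",  # expected processed text
)
# get_original_text() should still return the untouched source text,
# since the pack records how each span was replaced.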
Example #6
    def test_reader_original_span_test(self, value):
        span_ops, output = ([(Span(11, 19), 'New'), (Span(19, 20), ' Shiny '),
                             (Span(25, 25), ' Ends')],
                            '<title>The New Shiny Title Ends </title>')
        input_span, expected_span, mode = value
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pack = list(reader.parse_pack(self.file_path))[0]
        self.assertEqual(pack.text, output)

        output_span = pack.get_original_span(input_span, mode)
        self.assertEqual(
            output_span, expected_span,
            f"Expected: ({expected_span.begin, expected_span.end}"
            f"), Found: ({output_span.begin, output_span.end})"
            f" when Input: ({input_span.begin, input_span.end})"
            f" and Mode: {mode}")
    def test_reader_no_replace_test(self):
        # Read with no replacements
        pipeline = Pipeline()
        reader = PlainTextReader()
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, self.orig_text)
Example #8
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
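A minimal invocation sketch for this script (the paths are placeholders, not from the source):

if __name__ == "__main__":
    # Hypothetical paths; the real script presumably takes these
    # from command-line arguments.
    main(
        dataset_dir="data_samples/",
        ner_model_path="models/ner/",
        srl_model_path="models/srl/",
    )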
Example #9
def main(dataset_dir: str):
    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", "red"), pack.pack_name)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", "red"), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [
                (token.text, token.pos) for token in pack.get(Token, sentence)
            ]
            entities = [
                (entity.text, entity.ner_type)
                for entity in pack.get(EntityMention, sentence)
            ]
            print(colored("Tokens:", "red"), tokens, "\n")
            print(colored("EntityMentions:", "red"), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", "red"))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(
                    f'  - "{child.text}" is role {link.arg_type} of '
                    f'predicate "{parent.text}"'
                )
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", "green"))
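The shape of config.yml is not shown; the config.NER and config.SRL accesses only imply two top-level sections. A minimal in-code equivalent (the keys inside each section are assumptions modeled on the previous example):

# Hypothetical stand-in for loading config.yml:
config = Config(
    {
        "NER": {"storage_path": "models/ner/resources.pkl"},
        "SRL": {"storage_path": "models/srl/"},
    },
    default_hparams=None,
)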
Example #10
    def test_two_batch_processors(self, batch_size):
        nlp = Pipeline[DataPack]()
        nlp.set_reader(PlainTextReader())
        dummy1 = DummyFixedSizeBatchProcessor()
        dummy2 = DummyFixedSizeBatchProcessor()

        nlp.add(PeriodSentenceSplitter())
        nlp.add(
            dummy1,
            config={
                "batcher": {
                    "batch_size": batch_size,
                    "context_type": "ft.onto.base_ontology.Sentence",
                }
            },
        )

        nlp.add(
            dummy2,
            config={
                "batcher": {
                    "batch_size": 2 * batch_size,
                    "context_type": "ft.onto.base_ontology.Sentence",
                }
            },
        )

        nlp.initialize()
        data_path = os.path.join(data_samples_root, "random_texts")
        pack = nlp.process(data_path)
        sent_len = len(list(pack.get(Sentence)))

        self.assertEqual(
            dummy1.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)),
        )

        self.assertEqual(
            dummy2.counter,
            (sent_len // (2 * batch_size) + (sent_len % (2 * batch_size) > 0)),
        )
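The expected counter values use the integer ceiling-division idiom: for positive b, n // b + (n % b > 0) equals ceil(n / b), i.e. one processor call per (possibly partial) batch.

# The assertion arithmetic, spelled out:
import math

n, b = 7, 2  # e.g. 7 sentences, batch size 2
assert n // b + (n % b > 0) == math.ceil(n / b)  # 4 batches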
Example #11
    def test_two_batch_processors(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(PlainTextReader())
        dummy1 = DummyFixedSizeBatchProcessor()
        dummy2 = DummyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())

        nlp.add_processor(dummy1, config=config)
        config = {"batcher": {"batch_size": 2 * batch_size}}
        nlp.add_processor(dummy2, config=config)

        nlp.initialize()
        data_path = "data_samples/random_texts"
        pack = nlp.process(data_path)
        sent_len = len(list(pack.get(Sentence)))

        self.assertEqual(dummy1.counter, (sent_len // batch_size +
                                          (sent_len % batch_size > 0)))

        self.assertEqual(dummy2.counter, (sent_len // (2 * batch_size) +
                                          (sent_len % (2 * batch_size) > 0)))
Example #12
    def test_reader_no_replace_test(self):
        # Read with no replacements
        reader = PlainTextReader()
        PackManager().set_input_source(reader.component_name)
        pack = list(reader.parse_pack(self.file_path))[0]
        self.assertEqual(pack.text, self.orig_text)
Example #13
    def test_reader_no_replace_test(self):
        # Read with no replacements
        pack = list(PlainTextReader().parse_pack(self.file_path))[0]
        self.assertEqual(pack.text, self.orig_text)