def test_reader_original_span_test(self, value):
        span_ops = [
            (Span(11, 19), "New"),
            (Span(19, 20), " Shiny "),
            (Span(25, 25), " Ends"),
        ]
        output = "<title>The New Shiny Title Ends </title>"
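        # For reference: these offsets are consistent with an original text of
        # "<title>The Original Title </title>" (inferred from the spans, not
        # shown in this snippet): "Original" (11-19) becomes "New", the space
        # at 19-20 widens to " Shiny ", and " Ends" is inserted at offset 25.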
        input_span, expected_span, mode = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)

        self.assertEqual(pack.text, output)

        output_span = pack.get_original_span(input_span, mode)
        self.assertEqual(
            output_span,
            expected_span,
            f"Expected: ({expected_span.begin}, {expected_span.end}),"
            f" Found: ({output_span.begin}, {output_span.end})"
            f" when Input: ({input_span.begin}, {input_span.end})"
            f" and Mode: {mode}",
        )

    def test_reader_no_replace_test(self):
        # Read with no replacements
        pipeline = Pipeline()
        reader = PlainTextReader()
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, self.orig_text)
Example #3
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, os.pardir, os.pardir,
                                 'test_data', 'ontonotes')

        pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
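
For context, the resulting DataPack can then be queried with get(), just as the other examples in this listing do. A minimal sketch, assuming the OntoNotes sample yields Sentence annotations (the method name is illustrative):

    def test_sentences_exist(self):
        # Every sentence read from the OntoNotes sample should carry text.
        for sentence in self.data_pack.get(Sentence):
            self.assertTrue(len(sentence.text) > 0)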
Example #4
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
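Example #5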
    def test_reader_replace_back_test(self, value):
        # Reading with replacements - replacing a span and changing it back
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack: DataPack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, output)

        orig_text_from_pack = pack.get_original_text()
        self.assertEqual(self.orig_text, orig_text_from_pack)
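
For reference, a parameter consistent with the offsets in the first example might look like the following (illustrative values only; the real pairs come from the test's parameterization):

        value = (
            [(Span(11, 19), "New")],          # replace "Original" with "New"
            "<title>The New Title </title>",  # expected pack text afterwards
        )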
Example #6
    def test_parse_pack(self, text, annotation_length):
        file_path = os.path.join(self.test_dir, 'test.txt')
        with open(file_path, 'w') as f:
            f.write(text)

        pl = Pipeline()
        pl.set_reader(MultiPackSentenceReader())
        pl.initialize()

        multipack: MultiPack = pl.process_one(self.test_dir)
        input_pack = multipack.get_pack('input_src')
        self.assertEqual(len(multipack.packs), 2)
        self.assertEqual(multipack._pack_names, ['input_src', 'output_tgt'])
        self.assertEqual(len(input_pack.annotations), annotation_length)
        self.assertEqual(input_pack.text, text + "\n")
Example #7
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
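
As a usage note, this script presumably runs as a module entry point, with a local config module supplying the NER and SRL predictor configurations (assumed from the config.NER and config.SRL references above):

if __name__ == "__main__":
    main()
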
Example #8
                     "with the added feature of being able to read" \
                     " out the results. Scottish speech technology firm CEC " \
                     "Systems launched the site in November. But experts have" \
                     " questioned whether talking search engines are of any " \
                     "real benefit to people with visual impairments. The" \
                     " Edinburgh-based firm CEC has married speech " \
                     "technology with ever-popular internet search. The " \
                     "ability to search is becoming increasingly crucial to " \
                     "surfers baffled by the huge amount of information " \
                     "available on the web."

win_medal_text = "British hurdler Sarah Claxton is confident she can win her " \
                 "first major medal at next month's European Indoor " \
                 "Championships in Madrid."

pack = pl.process_one(win_medal_text)

# Print each sentence found in the pack.
for sentence in pack.get(Sentence):
    sent_text = sentence.text
    print(colored("Sentence:", 'red'), sent_text, "\n")

# Show (token, POS tag) pairs for the first sentence only (hence the break).
for sentence in pack.get(Sentence):
    tokens = [(token.text, token.pos_tag)
              for token in pack.get(Token, sentence)]
    print(colored("Tokens:", 'red'), tokens, "\n")
    break

# Print each entity mention together with its NER type.
for sentence in pack.get(Sentence):
    for entity in pack.get(EntityMention, sentence):
        print(colored("EntityMention:", 'red'), entity.text, 'has type',
              colored(entity.ner_type, 'blue'), "\n")