Example #1
def pack_example(input_path, output_path):
    """
    This example read data from input path and serialize to output path.
    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the results to the output
    # directory, using the DocID field of each data pack as the file name.
    nlp.add(
        PackNameJsonPackWriter(),
        {
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    nlp.run(input_path)
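
A minimal entry point for running this example might look like the following sketch; both default paths are hypothetical placeholders, not values from the original script.

if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser()
    # Hypothetical defaults; point these at a real OntoNotes dataset and an
    # output directory of your choice.
    arg_parser.add_argument("--input", default="./ontonotes_data/")
    arg_parser.add_argument("--output", default="./serialized_packs/")
    cli_args = arg_parser.parse_args()
    pack_example(cli_args.input, cli_args.output)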
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # Load the pipeline configuration from the YAML file.
    with open(args.config_file, "r") as config_file:
        config = yaml.safe_load(config_file)

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )

    nlp.initialize()

    for m_pack in nlp.process_dataset():
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
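
The YAML file loaded above must provide the three sections referenced by name in the pipeline. A minimal in-code sketch of that shape is below; the contents of each section are placeholders, not real option names.

config = {
    "data_selector_config": {},  # options for RandomDataSelector (placeholder)
    "boxer_config": {},          # options for MultiPackBoxer (placeholder)
    "da_processor_config": {},   # options for ReplacementDataAugmentProcessor (placeholder)
}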
Example #3
    def setUp(self):
        self.nltk = Pipeline[DataPack](enforce_consistency=True)
        self.nltk.set_reader(StringReader())
        self.nltk.add(NLTKSentenceSegmenter())
        self.nltk.add(NLTKWordTokenizer())
        self.nltk.add(NLTKPOSTagger())
        self.nltk.initialize()
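
A test built on this fixture might process a short string and check the annotations. This is a sketch, assuming the ontology types Sentence and Token are imported as in the other examples:

    def test_pos_tags(self):
        # Run the string through the reader and the three NLTK processors.
        pack = self.nltk.process_one("This is a test.")
        for sentence in pack.get(Sentence):
            # Every token inside the sentence should carry a POS tag.
            tokens = [
                (token.text, token.pos) for token in pack.get(Token, sentence)
            ]
            self.assertTrue(all(pos is not None for _, pos in tokens))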
Example #4
    def setUp(self):
        self.nltk = Pipeline[DataPack](enforce_consistency=True)
        self.nltk.set_reader(StringReader())
        self.nltk.add(NLTKSentenceSegmenter())
        self.nltk.add(NLTKWordTokenizer())
        self.nltk.add(NLTKPOSTagger())
        config = {"pattern": "NP: {<DT>?<JJ>*<NN>}"}
        self.nltk.add(NLTKChunker(), config=config)
        self.nltk.initialize()
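
Assuming NLTKChunker emits Phrase annotations from Forte's base ontology for matches of the NP pattern above, a test might verify them like this (sketch only):

    def test_np_chunks(self):
        pack = self.nltk.process_one(
            "The quick brown fox jumps over the lazy dog."
        )
        for phrase in pack.get(Phrase):
            # The pattern groups an optional determiner, any number of
            # adjectives, and a noun into one NP chunk.
            self.assertEqual(phrase.phrase_type, "NP")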
Example #5
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)

    config = Config(config, default_hparams=None)

    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")
        # First method: get annotation entries (tokens, entities) in the
        # sentence.
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # Second method: get link entries (semantic roles) in the sentence.
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f'  - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", "green"))
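
CoNLLNERPredictor and SRLPredictor read their settings from the NER and SRL sections of config.yml. The structure, with placeholder contents, looks roughly like this:

config = Config(
    {
        "NER": {},  # settings for CoNLLNERPredictor (placeholder)
        "SRL": {},  # settings for SRLPredictor (placeholder)
    },
    default_hparams=None,
)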
Example #6
def main(dataset_dir: str):
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", "red"), pack.pack_name)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", "red"), sent_text, "\n")
            # First method: get annotation entries (tokens, entities) in the
            # sentence.
            tokens = [
                (token.text, token.pos) for token in pack.get(Token, sentence)
            ]
            entities = [
                (entity.text, entity.ner_type)
                for entity in pack.get(EntityMention, sentence)
            ]
            print(colored("Tokens:", "red"), tokens, "\n")
            print(colored("EntityMentions:", "red"), entities, "\n")

            # Second method: get link entries (semantic roles) in the
            # sentence.
            print(colored("Semantic role labels:", "red"))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(
                    f'  - "{child.text}" is role {link.arg_type} of '
                    f'predicate "{parent.text}"'
                )
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", "green"))
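
A plausible entry point, assuming the dataset directory comes in as the first command-line argument:

if __name__ == "__main__":
    import sys

    # Any directory of plain-text files works with PlainTextReader.
    main(sys.argv[1])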
Example #7
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader
    )
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator
    )
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator
    )
    query_pipeline.add(component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + "_0"

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator
    )

    query_pipeline.initialize()

    return query_pipeline
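
Once set up, the pipeline is driven interactively by MultiPackTerminalReader, so a driver loop can call process_dataset() with no input path. A sketch follows, where the final response pack name is hypothetical and depends on the back-translator configuration:

def chat_loop(config: Config):
    query_pipeline = setup(config)
    # The terminal reader prompts on stdin for each query, so no input
    # path is passed to process_dataset().
    for m_pack in query_pipeline.process_dataset():
        # "response" is a hypothetical pack name; the real one comes from
        # the back_translator config.
        response_pack = m_pack.get_pack("response")
        print(response_pack.text)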