def write_results(pl: Pipeline, output_path: str, input_data: str):
    """Append a ``WikiArticleWriter`` to ``pl`` and run the pipeline.

    Args:
        pl: Pipeline that receives the writer as an extra component and is
            then executed.
        output_path: Value handed to the writer as its ``output_dir`` setting.
        input_data: Forwarded unchanged to ``pl.run``.
    """
    writer_settings = {
        "output_dir": output_path,
        "zip_pack": True,
        "drop_record": True,
    }
    pl.add(WikiArticleWriter(), config=writer_settings)
    pl.run(input_data)
def multi_example(input_path, output_path):
    """Write multi packs built from ``input_path``, then read them back.

    A first pipeline reads packs from ``input_path``, runs them through the
    boxing, copying and example coreference components, and serializes the
    result under ``output_path``. A second pipeline then reloads what was
    written from the ``multi`` and ``packs`` sub-directories of
    ``output_path`` and runs the counter over it again.

    Args:
        input_path: Location passed to the first pipeline's ``run``.
        output_path: Directory given to ``MultiPackWriter`` as ``output_dir``.
    """
    print("Multi Pack serialization example.")
    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )

    # Stage 1: build the multi packs and serialize them.
    writing_pipeline = Pipeline()
    writing_pipeline.set_reader(DirPackReader())
    # Each component is constructed right before it is added, in this order.
    for component_cls in (
        MultiPackBoxer,
        PackCopier,
        ExampleCoreferencer,
        ExampleCorefCounter,
    ):
        writing_pipeline.add(component_cls())
    writer_config = {
        "output_dir": output_path,
        "indent": 2,
        "overwrite": True,
    }
    writing_pipeline.add(MultiPackWriter(), config=writer_config)
    writing_pipeline.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )

    # Stage 2: reload what stage 1 wrote and run the counter over it.
    reader_config = {
        "multi_pack_dir": os.path.join(output_path, "multi"),
        "data_pack_dir": os.path.join(output_path, "packs"),
    }
    loading_pipeline = Pipeline()
    loading_pipeline.set_reader(MultiPackDirectoryReader(), config=reader_config)
    loading_pipeline.add(ExampleCorefCounter())
    loading_pipeline.run()
def multi_example(input_path, output_path):
    """Write multi packs built from ``input_path``, then read them back.

    A first pipeline reads packs from ``input_path``, runs them through the
    boxing, copying and example coreference components, and serializes the
    result under ``output_path``. A second pipeline then reloads the saved
    data from ``output_path`` and runs the counter over it again.

    Args:
        input_path: Location passed to the first pipeline's ``run``.
        output_path: Directory used both as the writer's ``output_dir`` and
            as the reader's ``data_path``.
    """
    print("Multi Pack serialization example.")
    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )

    # Stage 1: build the multi packs and serialize them.
    writing_pipeline = Pipeline()
    writing_pipeline.set_reader(DirPackReader())
    # Each component is constructed right before it is added, in this order.
    for component_cls in (
        MultiPackBoxer,
        PackCopier,
        ExampleCoreferencer,
        ExampleCorefCounter,
    ):
        writing_pipeline.add(component_cls())
    writer_config = {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    }
    # The configuration is passed positionally, as in the reader call below.
    writing_pipeline.add(MultiPackWriter(), writer_config)
    writing_pipeline.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )

    # Stage 2: reload what stage 1 wrote and run the counter over it.
    reader_config = {'data_path': output_path}
    loading_pipeline = Pipeline()
    loading_pipeline.set_reader(MultiPackDiskReader(), reader_config)
    loading_pipeline.add(ExampleCorefCounter())
    loading_pipeline.run()
class TestQuestionAnsweringMulti(unittest.TestCase):
    """Pipeline-level test for the ``QuestionAnsweringMulti`` processor."""

    def setUp(self):
        """Build and initialize the question answering pipeline.

        The question string is read by ``StringReader``, segmented, boxed
        into a multi pack under the pack name ``"question"``, and then passed
        through ``MutliDocPackAdder`` and ``QuestionAnsweringMulti``.
        """
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()

    def test_huggingface_qa_multi_processor(self):
        """Check the answers found per document and the question links.

        Every ``Phrase`` in a non-question pack must carry the expected
        answer text, and the ``MultiPackLink`` entries must pair the question
        text with the expected answer of each document.
        """
        question = "Name synonym of Acrokeratosis paraneoplastica."
        packs: MultiPack = self.nlp.process(question)
        expected_ans = {
            "doc_1": "Bazex syndrome",
            "doc_2": "Bazex syndrome",
            "doc_3": "Bazex syndrome",
        }

        for doc_id in packs.pack_names:
            # The question pack itself holds no answers to verify.
            if doc_id == "question":
                continue
            pack = packs.get_pack(doc_id)
            # Iterate directly: the position of a phrase is never needed.
            for phrase in pack.get(entry_type=Phrase):
                self.assertEqual(phrase.text, expected_ans[doc_id])

        linked_texts = [
            (link.get_parent().text, link.get_child().text)
            for link in packs.get(entry_type=MultiPackLink)
        ]

        # Both sides are sorted so the order links are returned in is irrelevant.
        self.assertListEqual(
            sorted(linked_texts),
            sorted([
                (question, expected_ans["doc_1"]),
                (question, expected_ans["doc_2"]),
                (question, expected_ans["doc_3"]),
            ]),
        )
def testMultiPackWriting(self):
    """Run the multi-pack writing pipeline and verify its output files.

    Packs are read from the ``packs`` sub-directory of ``self.main_output``,
    boxed into multi packs, and written by ``PackIdMultiPackWriter`` into the
    ``multi`` sub-directory. The test then checks that the index files and
    the ``packs`` / ``multi`` entries exist inside that output directory.
    """
    multi_output_dir = os.path.join(self.main_output.name, 'multi')

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(CopySentence())
    coref_pl.add(PackIdMultiPackWriter(), config={
        'output_dir': multi_output_dir,
        'indent': 2,
        'overwrite': True,
    })
    coref_pl.run(os.path.join(self.main_output.name, 'packs'))

    # Look inside the directory the writer was configured with. A path
    # relative to the current working directory would not test this run.
    for expected_entry in ('multi.idx', 'pack.idx', 'packs', 'multi'):
        self.assertTrue(
            os.path.exists(os.path.join(multi_output_dir, expected_entry)))
def testMultiPackWriting(self):
    """Run the multi-pack writing pipeline and verify its output files.

    Packs are read from the ``packs`` sub-directory of ``self.main_output``,
    boxed into multi packs, and written by ``PackIdMultiPackWriter`` into the
    ``multi`` sub-directory. The test then checks that the index files and
    the ``packs`` / ``multi`` entries exist inside that output directory.
    """
    multi_output_dir = os.path.join(self.main_output.name, "multi")

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(CopySentence())
    coref_pl.add(
        PackIdMultiPackWriter(),
        config={
            "output_dir": multi_output_dir,
            "indent": 2,
            "overwrite": True,
        },
    )
    coref_pl.run(os.path.join(self.main_output.name, "packs"))

    # Look inside the directory the writer was configured with. A path
    # relative to the current working directory would not test this run.
    for expected_entry in ("multi.idx", "pack.idx", "packs", "multi"):
        self.assertTrue(
            os.path.exists(os.path.join(multi_output_dir, expected_entry))
        )
def prepare(self):
    """Run the preprocessors over the training data in a fresh pipeline.

    The pipeline uses ``self.train_reader`` as its reader, receives every
    item of ``self.preprocessors`` in order, and is run on the training path
    taken from ``self.configs.config_data``.
    """
    preprocessing_pipeline = Pipeline()
    preprocessing_pipeline.set_reader(self.train_reader)
    for preprocessor in self.preprocessors:
        preprocessing_pipeline.add(preprocessor)

    train_path = self.configs.config_data.train_path
    preprocessing_pipeline.run(train_path)
# Prediction driver: the first command-line argument selects the tagging task
# ("ner" or "pos"); paths come from configs/config_predict.yml.
task = sys.argv[1]
assert task in ["ner", "pos"], "Not supported nlp task type: {}".format(task)

# Read the config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open.
with open("configs/config_predict.yml", "r") as config_file:
    config_predict = yaml.safe_load(config_file)

# NOTE(review): neither object is referenced again in the visible part of
# this script; kept because later code may rely on them - confirm.
saved_model = torch.load(config_predict["model_path"])
train_state = torch.load(config_predict["train_state_path"])

reader = CoNLL03Reader()
predictor = TaggingPredictor()
evaluator = CoNLLNEREvaluator()

pl = Pipeline()
pl.set_reader(reader)
pl.add(predictor)
pl.add(evaluator)
pl.initialize()

for pack in pl.process_dataset(config_predict["test_path"]):
    print("---- pack ----")
    for instance in pack.get(Sentence):
        sent = instance.text
        # Collect (text, tag) pairs for the entries covered by this sentence.
        output_tags = []
        if task == "ner":
            for entry in pack.get(EntityMention, instance):
                output_tags.append((entry.text, entry.ner_type))
        else:
            for entry in pack.get(Token, instance):
                output_tags.append((entry.text, entry.pos))
        print("---- example -----")
def testMultiPackWriting(self, config_data):
    """Round-trip multi packs through disk for one serialization setup.

    Data packs are first written from ``self.data_path``, then boxed into
    multi packs and written again, and finally the multi packs are read back
    and every ``CrossDocEntityRelation`` is checked for matching parent and
    child text.

    Args:
        config_data: A ``(zip_pack, method)`` pair selecting whether packs
            are zipped and which serialization method is used.
    """
    zip_pack, method = config_data

    # A dedicated sub-directory per configuration avoids conflicts.
    subdir = f"{zip_pack}_{method}"

    with tempfile.TemporaryDirectory() as main_output:
        prepared_input: str = os.path.join(main_output, subdir, "input_packs")
        data_output: str = os.path.join(main_output, subdir, "output")

        base_suffix = ".pickle" if method == "pickle" else ".json"
        suffix = base_suffix + (".gz" if zip_pack else "")

        # Step 1: produce the input data packs.
        pack_writer_config = {
            "output_dir": prepared_input,
            "overwrite": True,
            "serialize_method": method,
            "zip_pack": zip_pack,
        }
        prepare_pl = Pipeline[DataPack]()
        prepare_pl.set_reader(OntonotesReader())
        prepare_pl.add(PackIdJsonPackWriter(), pack_writer_config)
        prepare_pl.run(self.data_path)

        # Step 2: box the packs into multi packs and write them out.
        pack_reader_config = {
            "serialize_method": method,
            "zip_pack": zip_pack,
            "suffix": suffix,
        }
        multi_writer_config = {
            "output_dir": data_output,
            "overwrite": True,
            "serialize_method": method,
            "zip_pack": zip_pack,
        }
        boxing_pl = Pipeline()
        boxing_pl.set_reader(DirPackReader(), pack_reader_config)
        for processor_cls in (MultiPackBoxer, CopySentence, NaiveCoref):
            boxing_pl.add(processor_cls())
        boxing_pl.add(PackIdMultiPackWriter(), config=multi_writer_config)
        boxing_pl.run(prepared_input)

        for expected_entry in ("multi.idx", "pack.idx", "packs", "multi"):
            self.assertTrue(
                os.path.exists(os.path.join(data_output, expected_entry))
            )

        # Step 3: read the multi packs back and inspect the relations.
        multi_reader_config = {
            "suffix": suffix,
            "zip_pack": zip_pack,
            "serialize_method": method,
            "data_pack_dir": os.path.join(data_output, "packs"),
            "multi_pack_dir": os.path.join(data_output, "multi"),
        }
        reload_pl = Pipeline()
        # initialize() is invoked on whatever set_reader returns.
        configured_pl = reload_pl.set_reader(
            MultiPackDirectoryReader(), config=multi_reader_config
        )
        configured_pl.initialize()

        relation: CrossDocEntityRelation
        for multi_pack in reload_pl.process_dataset():
            for relation in multi_pack.get(CrossDocEntityRelation):
                self.assertEqual(
                    relation.get_parent().text, relation.get_child().text
                )