def test_process_multi_next(self):
    from forte.data.readers import OntonotesReader

    # Define and config the Pipeline
    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())
    pack_name = 'test_pack'
    nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
    nlp.add(
        DummyRelationExtractor(),
        config={"batcher": {"batch_size": 5}},
        selector=NameMatchSelector(select_name=pack_name),
    )
    nlp.initialize()

    dataset_path = data_samples_root + "/ontonotes/00"

    # get processed pack from dataset
    m_pack: MultiPack
    for m_pack in nlp.process_dataset(dataset_path):
        pack = m_pack.get_pack(pack_name)

        # get sentence from pack
        for sentence in pack.get(Sentence):
            sent_text = sentence.text

            # second method to get entry in a sentence
            tokens = [token.text for token in pack.get(Token, sentence)]
            self.assertEqual(sent_text, " ".join(tokens))
def build_pipeline(result_dir: str, word_counter: Counter, tag_counter: Counter):
    r"""Build the pipeline to parse IU Xray reports with tokenization,
    lowercasing and non-alpha removal, generating a Forte JSON file with the
    same name as the source report that contains the preprocessed content
    along with the impression, findings, and path to the parent image.

    Args:
        result_dir: the directory to save the Forte JSON files.
        word_counter: word counter shared with downstream components through
            the pipeline resources.
        tag_counter: tag counter shared with downstream components through
            the pipeline resources.

    Returns:
        pipeline: the built pipeline to process the XML files.
    """
    pipeline = Pipeline[MultiPack]()
    pipeline.resource.update(word_counter=word_counter)
    pipeline.resource.update(tag_counter=tag_counter)
    pipeline.set_reader(IUXrayReportReader())
    pipeline.add(MultiPackBoxer())
    pipeline.add(
        PackNameJsonPackWriter(),
        {'indent': 2, 'output_dir': result_dir, 'overwrite': True},
        NameMatchSelector(select_name='default'),
    )
    pipeline.initialize()
    return pipeline
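# A minimal usage sketch for build_pipeline above. The paths and the empty
# counters are hypothetical; in practice the counters would be filled or
# consumed elsewhere, and the input directory is assumed to hold IU Xray
# XML reports accepted by IUXrayReportReader.
from collections import Counter

word_counter, tag_counter = Counter(), Counter()
pipeline = build_pipeline("output/json", word_counter, tag_counter)
for _ in pipeline.process_dataset("data/iu_xray_reports"):
    pass  # PackNameJsonPackWriter saves one JSON file per report into result_dir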
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input"}

    replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__
    processor_config = {
        "augment_entry": "ft.onto.base_ontology.Token",
        "other_entry_policy": {
            "ft.onto.base_ontology.Document": "auto_align",
            "ft.onto.base_ontology.Sentence": "auto_align",
        },
        "type": "data_augmentation_op",
        "data_aug_op": replacer_op,
        "data_aug_op_config": {},
        "augment_pack_names": {},
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack("augmented_input")
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def test_caster_all_selector(self):
    """
    Test that the caster and the all-pack selector work well together.
    The caster converts a single pack into a multi pack, and the pack
    copier then creates an additional pack. The all-pack selector selects
    every pack from the multi pack. This test makes sure this pipeline
    works OK.
    """
    mp: MultiPack
    for mp in (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
        .add(DummyPackProcessor(), selector=AllPackSelector())
        .initialize()
        .process_dataset(
            os.path.join(data_samples_root, "random_texts", "0.txt")
        )
    ):
        num_pack = 0
        for pack in mp.packs:
            num_pack += 1
            entries = list(pack.get(NewType))
            self.assertEqual(len(entries), 1)
            self.assertEqual(entries[0].value, "[PACK]")
        self.assertEqual(num_pack, 2)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_file", default="./config.yml", help="Config YAML filepath"
    )
    args = parser.parse_args()

    # loading config
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )
    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
def setUp(self):
    self.nlp = Pipeline()
    self.nlp.set_reader(StringReader())
    self.nlp.add(NLTKSentenceSegmenter())
    boxer_config = {"pack_name": "question"}
    self.nlp.add(MultiPackBoxer(), boxer_config)
    self.nlp.add(MutliDocPackAdder())
    self.nlp.add(QuestionAnsweringMulti())
    self.nlp.initialize()
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {'pack_name': 'input_src'}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input_src"}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(
        component=WhiteSpaceTokenizer(), selector=AllPackSelector()
    )
def setUp(self):
    random.seed(8)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input_src"}
    entity_config = {"entities_to_insert": ["Mary", "station"]}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=EntityMentionInserter(), config=entity_config)
    self.nlp.add(PeriodSentenceSplitter())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(
        component=WhiteSpaceTokenizer(), selector=AllPackSelector()
    )
def multi_example(input_path, output_path):
    """
    This example reads data from the input path, and writes the multi pack
    output to the output path.

    Args:
        input_path: the directory containing the serialized data packs to read.
        output_path: the directory to write the multi pack output to.
    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    coref_pl.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there."
    )
    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
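# Hypothetical invocation of the example above; both directory names are made
# up for illustration. After it runs, the output directory holds the data
# packs (under "packs") and multi packs (under "multi") written by
# MultiPackWriter, which is what the reading pipeline then consumes again.
if __name__ == "__main__":
    multi_example("data/serialized_packs", "data/multi_output")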
def testMultiPackWriting(self):
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(CopySentence())

    multi_out = os.path.join(self.main_output.name, 'multi')
    coref_pl.add(
        PackIdMultiPackWriter(),
        config={
            'output_dir': multi_out,
            'indent': 2,
            'overwrite': True,
        },
    )
    coref_pl.run(os.path.join(self.main_output.name, 'packs'))

    # Check against the configured output directory instead of a hard-coded
    # relative path.
    self.assertTrue(os.path.exists(os.path.join(multi_out, 'multi.idx')))
    self.assertTrue(os.path.exists(os.path.join(multi_out, 'pack.idx')))
    self.assertTrue(os.path.exists(os.path.join(multi_out, 'packs')))
    self.assertTrue(os.path.exists(os.path.join(multi_out, 'multi')))
def testMultiPackWriting(self):
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(CopySentence())

    multi_out = os.path.join(self.main_output.name, "multi")
    coref_pl.add(
        PackIdMultiPackWriter(),
        config={
            "output_dir": multi_out,
            "indent": 2,
            "overwrite": True,
        },
    )
    coref_pl.run(os.path.join(self.main_output.name, "packs"))

    # Check against the configured output directory instead of a hard-coded
    # relative path.
    self.assertTrue(os.path.exists(os.path.join(multi_out, "multi.idx")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "pack.idx")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "packs")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "multi")))
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {
        'pack_name': 'input'
    }

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            'type': '',
            'kwargs': {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        'data_aug_op_config': {
            'type': '',
            'kwargs': {}
        },
        'augment_pack_names': {
            'kwargs': {
                'input': 'augmented_input'
            }
        }
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack('augmented_input')
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def multi_example(input_path, output_path):
    """
    This example reads data from the input path, and writes the multi pack
    output to the output path.

    Args:
        input_path: the directory containing the serialized data packs to read.
        output_path: the directory to write the multi pack output to.
    """
    print("Multi Pack serialization example.")

    print("We first read the data, and add multi-packs to them, and then "
          "save the results.")

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    coref_pl.run(input_path)

    print("We can then load the saved results, and see if everything is OK. "
          "We should see the same number of multi packs there.")

    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDiskReader(), {'data_path': output_path})
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
def test_pipeline(self, texts, expected_outputs):
    nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input"}

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())

    processor_config = {
        "augment_entry": "ft.onto.base_ontology.Token",
        "other_entry_policy": {
            "ft.onto.base_ontology.Document": "auto_align",
            "ft.onto.base_ontology.Sentence": "auto_align",
        },
        "type": "data_augmentation_op",
        "data_aug_op": "forte.processors.data_augment.algorithms"
        ".embedding_similarity_replacement_op."
        "EmbeddingSimilarityReplacementOp",
        "data_aug_op_config": {
            "vocab_path": self.abs_vocab_path,
            "embed_hparams": self.embed_hparams,
            "top_k": 1,
        },
        "augment_pack_names": {"input": "augmented_input"},
    }
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack("augmented_input")
        self.assertEqual(aug_pack.text, expected_outputs[idx])
Pipeline().set_reader(
    MultiNLIReader()
).add(
    # Call spacy on remote.
    RemoteProcessor(),
    config={"url": "http://localhost:8008"},
).add(
    # Call allennlp on remote.
    RemoteProcessor(),
    config={"url": "http://localhost:8009"},
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    },
).add(
    PackNameMultiPackWriter(),
    config={"output_dir": output_dir},
).add(
from forte.data.selector import RegexNameMatchSelector

if __name__ == "__main__":
    # Load config file
    config_file = os.path.join(os.path.dirname(__file__), "config.yml")
    config = yaml.safe_load(open(config_file, "r"))
    config = Config(config, default_hparams=None)

    # Build pipeline and add the reader, which will read the query from the
    # terminal.
    nlp: Pipeline = Pipeline()
    nlp.set_reader(reader=TerminalReader())

    # The rest of the pipeline works on multi packs, so use a boxer to wrap
    # the data pack into a multi pack.
    nlp.add(MultiPackBoxer(), config=config.boxer)

    # Search tweets.
    nlp.add(TweetSearchProcessor(), config=config.twitter_search)

    # Conduct sentiment analysis.
    pattern = rf"{config.twitter_search.response_pack_name_prefix}_\d"
    selector_hit = RegexNameMatchSelector(select_name=pattern)
    nlp.add(
        component=VaderSentimentProcessor(),
        selector=selector_hit,
        config=config.vader_sentiment,
    )

    nlp.initialize()
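# A plain-regex sketch of which pack names the selector above would accept,
# assuming (hypothetically) that the prefix stored in
# config.twitter_search.response_pack_name_prefix is "tweets" and that the
# selector matches pack names from the start of the string.
import re

pattern = r"tweets_\d"
for name in ["tweets_0", "tweets_1", "query", "tweets_12"]:
    print(name, bool(re.match(pattern, name)))
# tweets_0 True, tweets_1 True, query False, tweets_12 True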
def testMultiPackWriting(self, config_data):
    zip_pack, method = config_data

    # Use different sub-directory to avoid conflicting.
    subdir = f"{zip_pack}_{method}"

    with tempfile.TemporaryDirectory() as main_output:
        # Prepare input data.
        prepared_input: str = os.path.join(main_output, subdir, "input_packs")
        data_output: str = os.path.join(main_output, subdir, "output")
        suffix = ".pickle" if method == "pickle" else ".json"
        if zip_pack:
            suffix = suffix + ".gz"

        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        nlp.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": prepared_input,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        nlp.run(self.data_path)

        # Convert to multi pack.
        coref_pl = Pipeline()
        coref_pl.set_reader(
            DirPackReader(),
            {
                "serialize_method": method,
                "zip_pack": zip_pack,
                "suffix": suffix,
            },
        )
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())
        coref_pl.add(NaiveCoref())

        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": data_output,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        coref_pl.run(prepared_input)

        self.assertTrue(
            os.path.exists(os.path.join(data_output, "multi.idx"))
        )
        self.assertTrue(
            os.path.exists(os.path.join(data_output, "pack.idx"))
        )
        self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
        self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

        # Read the multi pack again.
        mp_pipeline = Pipeline()
        mp_pipeline.set_reader(
            MultiPackDirectoryReader(),
            config={
                "suffix": suffix,
                "zip_pack": zip_pack,
                "serialize_method": method,
                "data_pack_dir": os.path.join(data_output, "packs"),
                "multi_pack_dir": os.path.join(data_output, "multi"),
            },
        ).initialize()

        re: CrossDocEntityRelation
        for mp in mp_pipeline.process_dataset():
            for re in mp.get(CrossDocEntityRelation):
                self.assertEqual(re.get_parent().text, re.get_child().text)
def test_reuse_processor(self):
    # Create a basic pipeline of multi packs that have two packs (by copying).
    nlp = (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
    )

    # Create one shared instance of this processor.
    dummy = DummyPackProcessor()
    nlp.add(
        dummy,
        config={"test": "dummy1"},
        selector=NameMatchSelector(),
        selector_config={"select_name": "default"},
    )

    # This will not add the component successfully because the processor is
    # initialized.
    with self.assertRaises(ProcessorConfigError):
        nlp.add(dummy, config={"test": "dummy2"})

    # This will add the component, with a different selector.
    nlp.add(
        dummy,
        selector=NameMatchSelector(),
        selector_config={"select_name": "copy"},
    )
    nlp.initialize()

    # Check that the two processors have the same name.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Check that the two processors are also the same instance.
    self.assertEqual(nlp.components[2], nlp.components[3])

    # Check that the initialization is only done once, here the count
    # will only be 1.
    self.assertEqual(nlp.components[2].initialize_count, 1)
    self.assertEqual(nlp.components[3].initialize_count, 1)

    # Check that the configuration is not changed by the second insertion.
    self.assertEqual(nlp.components[3].configs.test, "dummy1")

    # Run it once to make sure it can run.
    dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
    nlp.run(dataset_path)

    # Check that initialization will be false after `run`, because it
    # calls the `finish` function of all components.
    self.assertFalse(nlp.components[2].is_initialized)
    self.assertFalse(nlp.components[3].is_initialized)

    # Check that we are able to re-initialize the pipeline.
    nlp.initialize()  # initialize the first time.
    nlp.initialize()  # re-initialize.

    # Check the name again after re-initialize.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Obtain the results from the multipack.
    mp: MultiPack = nlp.process(dataset_path)
    pack: DataPack = mp.get_pack("default")
    pack_copy: DataPack = mp.get_pack("copy")

    # Check that both packs are processed by the DummyProcessor once, because
    # we use different selectors.
    self.assertEqual(pack.get_single(NewType).value, "[PACK]")
    self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")