def complete_and_tokens():
    # Define paths.
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    pack_output = os.path.join(pack_dir, "nif_raw_struct_links_token")
    # Store which documents are processed; keep the input and output
    # directory structure similar.
    pack_input_index = os.path.join(pack_input, "article.idx")
    pack_output_index = os.path.join(pack_output, "article.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "complete_tokenize.log"),
    )

    pipeline = Pipeline(loaded_resource).set_reader(
        DirPackReader(),
        config={
            "suffix": ".json.gz",
            "zip_pack": True,
        },
        # ).add(
        #     WikiEntityCompletion()
    ).add(
        WikiAddTitle()
    ).add(
        SpacyProcessor(),
        config={
            "processors": ["sentence", "tokenize"],
        },
    ).add(
        SubwordTokenizer(),
        config={
            "tokenizer_configs": {
                "pretrained_model_name": "bert-base-uncased"
            },
            "token_source": "ft.onto.base_ontology.Token",
        },
    ).add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "serialize_method": "jsonpickle",
        },
    ).add(
        ProgressPrinter()
    )
    pipeline.run(pack_input)
def main(input_path: str):
    pipeline = Pipeline()
    pipeline.set_reader(
        DirPackReader(),
    ).add(
        TbfWriter(),
        # Placeholders: set the actual output path and system name
        # before running.
        config={
            "output_path": None,
            "system_name": None,
        },
    ).initialize()
    pipeline.process(input_path)
Pipeline().set_reader(
    MultiNLIReader()
).add(
    # Call spacy on remote.
    RemoteProcessor(),
    config={"url": "http://localhost:8008"},
).add(
    # Call allennlp on remote.
    RemoteProcessor(),
    config={"url": "http://localhost:8009"},
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    },
).add(
    PackNameMultiPackWriter(),
    config={"output_dir": output_dir},
).add(
    ProgressPrinter(),
).run()
pack_output = os.path.join(pack_dir, "category")
# Store which documents have category.
pack_input_index = os.path.join(pack_input, "article.idx")
# Store which documents have category.
pack_output_index = os.path.join(pack_output, "category.idx")

logging.basicConfig(
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
    filename=os.path.join(pack_dir, "category.log"),
)

Pipeline(resources).set_reader(
    WikiCategoryReader(),
    config={
        "pack_index": pack_input_index,
        "pack_dir": pack_input,
    },
).add(
    WikiArticleWriter(),
    config={
        "output_dir": pack_output,
        "zip_pack": True,
        "drop_record": True,
        "input_index_file": pack_input_index,
        "output_index_file": pack_output_index,
        "use_input_index": True,
        "overwrite": True,
    },
).run(os.path.join(base_dir, "article_categories_en.tql.bz2"))
from forte.huggingface import ZeroShotClassifier
from forte.stanza import StandfordNLPProcessor
from forte import Pipeline
from forte.data.readers import TerminalReader
from forte.processors.stave import StaveProcessor

nlp = Pipeline()
nlp.set_reader(TerminalReader())
nlp.add(StandfordNLPProcessor())
nlp.add(
    ZeroShotClassifier(),
    config={
        "candidate_labels": [
            "travel",
            "cooking",
            "dancing",
            "exploration",
        ],
    },
)
nlp.add(StaveProcessor())
nlp.initialize()
nlp.run()
import sys

from facets.kbp_reader import EREReader
from facets.nli.analysis import DebugProcessor
from forte import Pipeline

kbp_dir = sys.argv[1]

Pipeline().set_reader(EREReader()).run([kbp_dir])
from forte import Pipeline
from forte.data.readers import DirPackReader
from forte.processors.stave import StaveProcessor

Pipeline(
    ontology_file="conf/full.json"
).set_reader(
    DirPackReader()
).add(
    StaveProcessor(),
    config={
        "port": 8880,
        "use_pack_name": True,
    },
).run(
    # "/home/hector/data/kbp/train"
    "/Users/hector.liu/Downloads/train"
)
from fortex.allennlp import AllenNLPProcessor
from forte import Pipeline
from forte.data.readers import RawDataDeserializeReader

Pipeline().set_reader(RawDataDeserializeReader()).add(
    AllenNLPProcessor(),
    config={
        "processors": ["tokenize", "pos", "lemma", "depparse", "srl"],
        "infer_batch_size": 1,
    },
).serve(port=8009)
import sys

from forte import Pipeline
from forte.data.readers import DirPackReader
from forte.processors.stave import StaveProcessor

if __name__ == '__main__':
    input_dir = sys.argv[1]
    onto_file = sys.argv[2]

    nlp = Pipeline(ontology_file=onto_file)
    nlp.set_reader(
        DirPackReader(),
        config={"suffix": ".json.gz"},
    ).add(
        StaveProcessor()
    ).run(input_dir)
import sys

import IPython

from facets.utils import ProgressPrinter
from forte import Pipeline
from forte.data import DataPack
from forte.data.readers.deserialize_reader import DirPackReader
from forte.processors.base import PackProcessor


class PackExplorer(PackProcessor):
    def _process(self, pack: DataPack):
        # Drop into an interactive IPython shell to inspect each pack.
        IPython.embed()


if __name__ == "__main__":
    Pipeline().set_reader(
        DirPackReader(),
        config={
            "suffix": ".pickle.gz",
            "zip_pack": True,
            "serialize_method": "pickle",
        },
    ).add(ProgressPrinter()).run(sys.argv[1])
from forte import Pipeline
from forte.data.readers import RawDataDeserializeReader
from fortex.spacy import SpacyProcessor

Pipeline().set_reader(RawDataDeserializeReader()).add(
    SpacyProcessor(),
    config={"processors": ["sentence"]},
).serve(port=8008)