from spacy.lang.en import English
from spacy.tokens import DocBin
from spacy.training import Corpus
from spacy.training.converters import json_to_docs
from spacy.util import make_tempdir


def test_issue4402():
    """Test that JSON training data converted to a DocBin round-trips through
    Corpus and can be split into the expected number of sentences."""
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "How", "ner": "O"},
                            {"id": 1, "orth": "should", "ner": "O"},
                            {"id": 2, "orth": "I", "ner": "O"},
                            {"id": 3, "orth": "cook", "ner": "O"},
                            {"id": 4, "orth": "bacon", "ner": "O"},
                            {"id": 5, "orth": "in", "ner": "O"},
                            {"id": 6, "orth": "an", "ner": "O"},
                            {"id": 7, "orth": "oven", "ner": "O"},
                            {"id": 8, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {
                        "tokens": [
                            {"id": 9, "orth": "\n", "ner": "O"},
                            {"id": 10, "orth": "I", "ner": "O"},
                            {"id": 11, "orth": "'ve", "ner": "O"},
                            {"id": 12, "orth": "heard", "ner": "O"},
                            {"id": 13, "orth": "of", "ner": "O"},
                            {"id": 14, "orth": "people", "ner": "O"},
                            {"id": 15, "orth": "cooking", "ner": "O"},
                            {"id": 16, "orth": "bacon", "ner": "O"},
                            {"id": 17, "orth": "in", "ner": "O"},
                            {"id": 18, "orth": "an", "ner": "O"},
                            {"id": 19, "orth": "oven", "ner": "O"},
                            {"id": 20, "orth": ".", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                ],
                "cats": [
                    {"label": "baking", "value": 1.0},
                    {"label": "not_baking", "value": 0.0},
                ],
            },
            {
                "raw": "What is the difference between white and brown eggs?\n",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "What", "ner": "O"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "the", "ner": "O"},
                            {"id": 3, "orth": "difference", "ner": "O"},
                            {"id": 4, "orth": "between", "ner": "O"},
                            {"id": 5, "orth": "white", "ner": "O"},
                            {"id": 6, "orth": "and", "ner": "O"},
                            {"id": 7, "orth": "brown", "ner": "O"},
                            {"id": 8, "orth": "eggs", "ner": "O"},
                            {"id": 9, "orth": "?", "ner": "O"},
                        ],
                        "brackets": [],
                    },
                    {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
                ],
                "cats": [
                    {"label": "baking", "value": 0.0},
                    {"label": "not_baking", "value": 1.0},
                ],
            },
        ],
    }
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        reader = Corpus(output_file)
        train_data = list(reader(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4

def test_issue4367():
    """Test that DocBin initialization works with and without custom attrs."""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

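# A minimal sketch (not part of the original tests, text and attrs chosen for
# illustration only) of the usual DocBin round trip that these tests rely on:
# serialize annotated Docs to a bytestring and restore them into a fresh vocab.
import spacy
from spacy.tokens import DocBin


def demo_docbin_roundtrip():
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ORTH", "ENT_IOB", "ENT_TYPE"])
    doc_bin.add(nlp("How should I cook bacon in an oven?"))
    data = doc_bin.to_bytes()  # serialize to a bytestring
    restored = DocBin().from_bytes(data)  # deserialize into a new DocBin
    docs = list(restored.get_docs(nlp.vocab))
    assert len(docs) == 1
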
def process(self):
    """
    Reads text and outputs entities per text body.
    """

    # Prepare the staging area
    staging_area = self.dataset.get_staging_area()

    self.dataset.update_status("Preparing data")

    # Load the spaCy pipeline
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between

    # Disable what has _not_ been selected
    options = ["parser", "tagger", "ner"]
    enable = self.parameters.get("enable", False)

    if not enable:
        self.dataset.update_status("Select at least one of the options.")
        self.dataset.finish(0)
        return

    disable = [option for option in options if option not in enable]

    # Get all the text first so we can process it in batches
    posts = [post["body"] if post["body"] else "" for post in self.iterate_items(self.source_file)]

    # Process the text in batches
    if len(posts) < 100000:
        self.dataset.update_status("Extracting linguistic features")
    else:
        self.dataset.update_status(
            "Extracting linguistic features is currently only available for datasets with fewer than 100,000 items.")
        self.dataset.finish(0)
        return

    # Make sure only the needed information is extracted.
    attrs = []
    if "tagger" not in disable:
        attrs.append("POS")
    if "parser" not in disable:
        attrs.append("DEP")
    if "ner" not in disable:
        attrs.append("ENT_IOB")
        attrs.append("ENT_TYPE")
        attrs.append("ENT_ID")
        attrs.append("ENT_KB_ID")

    # DocBin for quick saving
    doc_bin = DocBin(attrs=attrs)

    # Start the processing!
    try:
        for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
            doc_bin.add(doc)

            # It's quite a heavy process, so make sure it can be interrupted
            if self.interrupted:
                raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")

            if i % 1000 == 0:
                self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))

    except MemoryError:
        self.dataset.update_status(
            "Out of memory. The dataset may be too large to process. Try again with a smaller dataset.",
            is_final=True)
        return

    self.dataset.update_status("Serializing results - this will take a while")

    # Then serialize the NLP docs
    doc_bytes = doc_bin.to_bytes()

    # Dump the data in a temporary folder
    with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
        pickle.dump(doc_bytes, outputfile)

    # Create zip of archive and delete temporary files and folder
    self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)

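# A hedged sketch (not part of the processor above) of how a downstream step
# could read the serialized docs back. The file name "spacy_docs.pb" and the
# use of "en_core_web_sm" for the vocab mirror the processor; everything else
# is an assumption for illustration.
import pickle

import spacy
from spacy.tokens import DocBin


def read_spacy_docs(path):
    nlp = spacy.load("en_core_web_sm")
    with open(path, "rb") as infile:
        doc_bytes = pickle.load(infile)  # the pickled DocBin bytestring
    doc_bin = DocBin().from_bytes(doc_bytes)
    return list(doc_bin.get_docs(nlp.vocab))
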
import json

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/zh/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("zh")
matcher = Matcher(nlp.vocab)

# Add the patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])

docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")

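# A quick sanity check, sketched as an assumption rather than part of the
# exercise above: load the saved DocBin back and print the matched "GADGET"
# spans that were stored as entities.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("zh")
doc_bin = DocBin().from_disk("./train.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
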
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

import generateTrainData

# Load training data from generateTrainData
training_data = generateTrainData.generateTrainData()

nlp = spacy.blank("de")  # create a blank German pipeline
db = DocBin()  # create a DocBin object

for text, annot in tqdm(training_data):  # data in the previous (character-offset) format
    doc = nlp.make_doc(text)  # create a Doc object from the text
    ents = []
    for start, end, label in annot["entities"]:  # character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy")  # save the DocBin object

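# A minimal verification sketch (an assumption, not in the original script):
# reload the DocBin and count how many entity spans of each label survived the
# char_span alignment, so skipped entities are easy to spot.
from collections import Counter

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("de")
db = DocBin().from_disk("./train.spacy")
label_counts = Counter(ent.label_ for doc in db.get_docs(nlp.vocab) for ent in doc.ents)
print(label_counts)
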
import random
from pathlib import Path

import spacy
from spacy.tokens import Doc, DocBin
from spacy.training import Example


def main(trained_pipeline: Path, test_data: Path, print_details: bool):
    nlp = spacy.load(trained_pipeline)

    doc_bin = DocBin(store_user_data=True).from_disk(test_data)
    docs = doc_bin.get_docs(nlp.vocab)
    examples = []
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        for name, proc in nlp.pipeline:
            pred = proc(pred)
        examples.append(Example(pred, gold))

        # Print the gold and predicted relations, if requested
        if print_details:
            print()
            print(f"Text: {gold.text}")
            print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
            for value, rel_dict in pred._.rel.items():
                gold_labels = [k for (k, v) in gold._.rel[value].items() if v == 1.0]
                if gold_labels:
                    print(
                        f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
                    )
            print()

    random_examples = []
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        relation_extractor = nlp.get_pipe("relation_extractor")
        get_instances = relation_extractor.model.attrs["get_instances"]
        for (e1, e2) in get_instances(pred):
            offset = (e1.start, e2.start)
            if offset not in pred._.rel:
                pred._.rel[offset] = {}
            for label in relation_extractor.labels:
                pred._.rel[offset][label] = random.uniform(0, 1)
        random_examples.append(Example(pred, gold))

    thresholds = [0.000, 0.050, 0.100, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999]
    print()
    print("Random baseline:")
    _score_and_format(random_examples, thresholds)

    print()
    print("Results of the trained model:")
    _score_and_format(examples, thresholds)