def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    """Flatten per-sentence annotations into a single ``(doc, gold)`` pair.

    :param nlp: object whose ``make_doc(text)`` builds the document.
    :param text: raw text, or ``None`` to rebuild it from the annotated
        words and their trailing-space flags.
    :param sent_annots: iterable of mappings providing the keys ``"heads"``,
        ``"words"``, ``"tags"``, ``"deps"``, ``"morphology"``,
        ``"entities"`` and ``"spaces"``.
    :param drop_deps: a token's head and label are reset to ``None`` whenever
        ``random.random()`` falls below this value.
    :return: tuple ``(doc, gold)``.
    """
    copied_fields = ("words", "tags", "deps", "morphology", "entities", "spaces")
    merged = defaultdict(list)
    boundaries = []
    for annot in sent_annots:
        # Heads are sentence-relative: shift them by the number of tokens
        # gathered from the previous sentences.
        merged_heads = merged["heads"]
        shift = len(merged["words"])
        merged_heads.extend(shift + head for head in annot["heads"])
        for name in copied_fields:
            merged[name].extend(annot[name])
        # Only the first token of each sentence opens a sentence.
        boundaries.append(True)
        boundaries.extend([False] * (len(annot["words"]) - 1))

    assert len(merged["words"]) == len(merged["spaces"])
    if text is None:
        # No raw text supplied: glue the words back together.
        pieces = [
            word + " " * space
            for word, space in zip(merged["words"], merged["spaces"])
        ]
        text = "".join(pieces)

    doc = nlp.make_doc(text)
    # "spaces" is not forwarded to GoldParse.
    merged.pop("spaces")
    gold = GoldParse(doc, **merged)
    gold.sent_starts = boundaries

    n_tokens = len(gold.heads)
    for idx in range(n_tokens):
        if random.random() < drop_deps:
            gold.heads[idx] = None
            gold.labels[idx] = None
    return doc, gold
def Train():
    """Build a few demo ``GoldParse`` objects, then update the NER pipe and save.

    Relies on the module-level ``nlp`` object, ``random`` and
    ``convert_JSON_python``. The demo documents built at the top are not used
    by the training loop below.
    """
    print("\nThe outcomes of Training and Updating are:")
    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    from spacy.gold import GoldParse

    # Demo 1: part-of-speech tags with a custom tag map.
    tag_vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
    doc = Doc(tag_vocab, words=['用户', '体验', 'APP'])
    gold = GoldParse(doc, tags=['N', 'V', 'N'])

    # Demo 2: entity tags on a document backed by a fresh vocab.
    doc = Doc(Vocab(), words=['陆金所', '成立', 'AI实验室', '已经', '一年'])
    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])

    # Demo 3: entity tags on a document that shares the pipeline vocab.
    doc = Doc(
        nlp.vocab,
        words=[u'刘强东', u'章泽天', u'大学生', u'遇见'],
        spaces=[False, False, False, False],
    )
    gold = GoldParse(doc, entities=[u'PERSON', u'PERSON', u'PRODUCT', u'O'])

    train_data = convert_JSON_python('/home/wangdi498/SpaCy/NER_example2.json')

    # Every pipe except 'ner' is switched off while training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for _ in range(10):
            # Shuffle the training data every epoch so the model does not
            # generalise from the order of the examples. A dropout rate can
            # also be set so the model randomly gives up some features and
            # representations, which keeps it from memorising the training
            # data too closely.
            random.shuffle(train_data)
            for text, annotations in train_data:
                # Alternative kept from the original as a hint:
                #   doc = nlp.make_doc(text)
                #   gold = GoldParse(doc, entities=entity_offsets)
                #   nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
                # Update the model with this example.
                nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk("/home/wangdi498/SpaCy/models")
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    """Combine sentence-level annotations into one document-level gold parse.

    :param nlp: object whose ``make_doc(text)`` builds the document.
    :param text: raw text; when ``None`` it is reconstructed from the words
        and their trailing-space flags.
    :param sent_annots: iterable of mappings providing the keys ``"heads"``,
        ``"words"``, ``"tags"``, ``"deps"``, ``"entities"`` and ``"spaces"``.
    :param drop_deps: threshold compared against ``random.random()`` for each
        token; below it, the token's head and label become ``None``.
    :return: tuple ``(doc, gold)``.
    """
    flat = defaultdict(list)
    starts = []
    for sentence in sent_annots:
        # Re-base the sentence-relative heads onto document positions.
        doc_heads = flat["heads"]
        base = len(flat["words"])
        doc_heads.extend(base + head for head in sentence["heads"])
        for key in ("words", "tags", "deps", "entities", "spaces"):
            flat[key].extend(sentence[key])
        # First token starts the sentence, the remaining ones do not.
        starts.append(True)
        starts.extend([False] * (len(sentence["words"]) - 1))

    assert len(flat["words"]) == len(flat["spaces"])
    if text is None:
        chunks = []
        for word, space in zip(flat["words"], flat["spaces"]):
            chunks.append(word + " " * space)
        text = "".join(chunks)

    doc = nlp.make_doc(text)
    # The space flags only served to rebuild the text.
    flat.pop("spaces")
    gold = GoldParse(doc, **flat)
    gold.sent_starts = starts

    total = len(gold.heads)
    for position in range(total):
        if random.random() < drop_deps:
            gold.heads[position] = None
            gold.labels[position] = None
    return doc, gold
def train_ner(_nlp, train_data, iterations, learn_rate=1e-3, dropout=0., tags_complete=True):
    """
    Run update passes of ``_nlp.entity`` over ``train_data``.

    :param _nlp: pipeline object; its ``entity``, ``tagger`` and ``make_doc``
        members are used
    :param train_data: list of ``(doc, entity_offsets)`` pairs; shuffled in
        place on every pass
    :param iterations: maximum number of full passes through the dataset
    :param learn_rate: value assigned to ``_nlp.entity.model.learn_rate``
    :param dropout: forwarded as ``drop`` to ``_nlp.entity.update``
    :param tags_complete: if False, every ``'O'`` tag in the gold parse is
        replaced by ``'-'`` before updating
    :return: None
    """
    _nlp.entity.model.learn_rate = learn_rate
    recogniser = _nlp.entity
    for epoch in range(1, iterations + 1):
        random.shuffle(train_data)
        total_loss = 0.
        for source_doc, offsets in train_data:
            # A fresh doc is made from the text even though the data was
            # already processed by an earlier _nlp() call.
            doc = _nlp.make_doc(source_doc.text)
            gold = GoldParse(doc, entities=offsets)
            if not tags_complete:
                # GoldParse assumes offset annotations are complete and tags
                # everything else 'O'; the tag '-' tells it to make no
                # assumption about a word.
                for pos, tag in enumerate(gold.ner):
                    if tag == 'O':
                        gold.ner[pos] = '-'
            # todo (from original): is the tagger call needed for updating an
            # existing model? for a new model?
            _nlp.tagger(doc)
            total_loss += recogniser.update(doc, gold, drop=dropout)
        log.info('train_ner: iter #{}/{}, loss: {}'.format(epoch, iterations, total_loss))
        # Stop early once a pass produces no loss at all.
        if total_loss == 0:
            break
def test_get_oracle_moves_negative_O(tsys, vocab):
    """An oracle sequence is produced when some tokens carry the ``"!O"`` tag."""
    words = ["A", "B", "C", "D"]
    doc = Doc(vocab, words=words)
    gold = GoldParse(doc, entities=[])
    # Overwrite the tags directly with a mix of "O" and "!O".
    gold.ner = ["O", "!O", "O", "!O"]
    tsys.preprocess_gold(gold)

    names = []
    for action in tsys.get_oracle_sequence(doc, gold):
        names.append(tsys.get_class_name(action))
    assert names
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
    """An oracle sequence is produced when every entity label is "!"-prefixed."""
    negated = []
    for start, end, label in entity_annots:
        negated.append((start, end, "!" + label))
    gold = GoldParse(doc, entities=negated)

    # Replace every "L-!GPE" tag with the "-" tag.
    for position, tag in enumerate(gold.ner):
        if tag != "L-!GPE":
            continue
        gold.ner[position] = "-"

    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    names = [tsys.get_class_name(action) for action in oracle_actions]
    assert names
def test_ner_update_batch(ner, nlp):
    """Updating ``ner`` on a two-document batch records a loss under ``PIPES.ner``."""
    first = nlp("Hello world. This is sentence 2.")
    second = nlp("Hi again. This is sentence 4.")
    first_tags = ["O"] * len(first)
    second_tags = ["O"] * len(second)

    # Both documents must have been split into two sentences.
    assert len(list(first.sents)) == 2
    assert len(list(second.sents)) == 2

    optimizer = nlp.resume_training()
    golds = [
        GoldParse(first, entities=first_tags),
        GoldParse(second, entities=second_tags),
    ]
    losses = {}
    ner.update([first, second], golds, sgd=optimizer, losses=losses)
    assert PIPES.ner in losses
def test_textcat_update_batch(textcat, nlp):
    """Updating ``textcat`` on a two-document batch records a ``"pytt_textcat"`` loss."""
    first = nlp("Hello world. This is sentence 2.")
    second = nlp("Hi again. This is sentence 4.")

    # Both documents must have been split into two sentences.
    assert len(list(first.sents)) == 2
    assert len(list(second.sents)) == 2

    optimizer = nlp.resume_training()
    gold_first = GoldParse(first, cats={"Hello": 1.0})
    gold_second = GoldParse(second, cats={"Hello": 0.0})
    losses = {}
    textcat.update(
        [first, second],
        [gold_first, gold_second],
        sgd=optimizer,
        losses=losses,
    )
    assert "pytt_textcat" in losses
def test_ner_per_type(en_vocab):
    """Check the overall and per-type entity scores reported by ``Scorer``."""
    # Case 1: gold and doc are identical.
    cardinal_scorer = Scorer()
    for sentence, annot in test_ner_cardinal:
        doc = get_doc(
            en_vocab,
            words=sentence.split(" "),
            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
        )
        gold = GoldParse(doc, entities=annot["entities"])
        cardinal_scorer.score(doc, gold)
    results = cardinal_scorer.scores

    for metric in ("ents_p", "ents_f", "ents_r"):
        assert results[metric] == 100
    cardinal = results["ents_per_type"]["CARDINAL"]
    for metric in ("p", "f", "r"):
        assert cardinal[metric] == 100

    # Case 2: the doc has one missing and one extra entity, and the entity
    # type MONEY is not present in the doc.
    apple_scorer = Scorer()
    for sentence, annot in test_ner_apple:
        doc = get_doc(
            en_vocab,
            words=sentence.split(" "),
            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
        )
        gold = GoldParse(doc, entities=annot["entities"])
        apple_scorer.score(doc, gold)
    results = apple_scorer.scores

    for metric in ("ents_p", "ents_r", "ents_f"):
        assert results[metric] == approx(66.66666)

    per_type = results["ents_per_type"]
    for label in ("GPE", "MONEY", "ORG"):
        assert label in per_type
    for metric in ("p", "r", "f"):
        assert per_type["GPE"][metric] == 100
    for metric in ("p", "r", "f"):
        assert per_type["MONEY"][metric] == 0
    org = per_type["ORG"]
    assert org["p"] == 50
    assert org["r"] == 100
    assert org["f"] == approx(66.66666)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None):
    """Score a model loaded from ``model_dir`` against ``gold_tuples``.

    :param Language: callable invoked as ``Language(data_dir=model_dir)``.
    :param gold_tuples: iterable of ``(raw_text, sents)`` pairs.
    :param model_dir: forwarded as ``data_dir``.
    :param gold_preproc: if true, the raw text is ignored and the gold
        tokenisation is used; otherwise the sentences are merged with
        ``_merge_sents`` and the raw text is processed.
    :param verbose: forwarded to ``Scorer.score``.
    :param beam_width: when not ``None``, stored on ``nlp.parser.cfg``.
    :return: the ``Scorer`` holding the accumulated results.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width

    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, _brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                # Use the gold words and run each component by hand.
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                for component in (nlp.tagger, nlp.entity, nlp.parser):
                    component(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
    """Build a ``GoldParse`` whose ``links`` map entity spans to KB id scores.

    :param doc: document whose ``ents`` supply the tagged spans.
    :param entities: iterable of mappings with the keys ``"entity"``,
        ``"alias"``, ``"start"`` and ``"end"``.
    :param dev: if true, no candidate lookup is done and only the gold id
        (value 1.0) is stored for each span.
    :param kb: object offering ``get_candidates(alias)``; may be falsy.
    :param labels_discard: entity labels of ``doc.ents`` to leave out.
    :return: ``GoldParse(doc, links=...)`` keyed by ``(start, end)``.
    """
    spans_by_offset = {}
    for ent in doc.ents:
        if ent.label_ not in labels_discard:
            spans_by_offset[(ent.start_char, ent.end_char)] = ent

    links = {}
    for record in entities:
        gold_id = record["entity"]
        alias = record["alias"]
        start = record["start"]
        end = record["end"]

        candidate_ids = []
        if kb and not dev:
            candidate_ids = [cand.entity_ for cand in kb.get_candidates(alias)]

        span = spans_by_offset.get((start, end), None)
        if not span:
            continue
        # TODO: check that alias == doc.text[start:end]
        keep = (dev or gold_id in candidate_ids) and is_valid_sentence(span.sent.text)
        if not keep:
            continue

        scores = {gold_id: 1.0}
        if not dev:
            # Every other candidate is added with value 0.0, in shuffled order.
            random.shuffle(candidate_ids)
            for kb_id in candidate_ids:
                if kb_id != gold_id:
                    scores[kb_id] = 0.0
        links[(start, end)] = scores
    return GoldParse(doc, links=links)
def test_goldparse_startswith_space(en_tokenizer):
    """Gold annotations stay aligned when the text begins with a space."""
    doc = en_tokenizer(" a")
    annotations = {
        "words": ["a"],
        "entities": ["U-DATE"],
        "deps": ["ROOT"],
        "heads": [0],
    }
    gold = GoldParse(doc, **annotations)
    # The leading space shows up as its own word with no annotation.
    assert gold.words == [" ", "a"]
    assert gold.ner == [None, "U-DATE"]
    assert gold.labels == [None, "ROOT"]
def train_recognizer(self):
    """Add an NER pipe, update it once per training file and save the model.

    Reads ``self.nlp``, ``self.data.data_train``, ``self.repertory`` and
    ``self.train_file``. Each item of ``data_train`` is indexed as
    ``(file_name, annotations)``; annotations whose first element is ``"T"``
    are turned into ``(start, end, label)`` entity offsets.

    :raises IndexError: if a training file is empty (no first line).
    """
    # Let's try a pattern-based NER technique using the spaCy library.
    comp = self.nlp.create_pipe('ner')
    self.nlp.add_pipe(comp)
    for label in ("Task", "Material", "Process"):
        comp.add_label(label)
    # nlp.from_disk('C:/Users/Lobar/Desktop/TP3_NLP/spacy_models')

    optimizer = self.nlp.begin_training()
    losses = {}
    for training in self.data.data_train:
        path = self.repertory + self.train_file + "/" + training[0]
        # The context manager closes the file even if reading raises; the
        # original left the handle open until after the model update.
        with open(path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
        # Only the first line of the file is used as the training text.
        text = lines[0]

        entities = []
        for ent in training[1]:
            if ent[0] == "T":
                # ent[1] is split on non-word characters into
                # label, start, end.
                splitted = re.split(r'\W+', ent[1])
                entities.append(
                    (int(splitted[1]), int(splitted[2]), splitted[0])
                )

        doc = self.nlp.make_doc(text)
        gold = GoldParse(doc, entities=entities)
        self.nlp.update([doc], [gold], drop=0.5, losses=losses, sgd=optimizer)

    # NOTE: the output path below is hard-coded and has to be adapted.
    self.nlp.to_disk(
        "C:\\Users\\Lobar\\Desktop\\TP3_NLP\\spacy_models")
def _from_json_to_crf(self, message, entity_offsets):
    # type: (Message, List[Tuple[int, int, Text]]) -> List[Tuple[Text, Text, Text, Text]]
    """Takes the json examples and switches them to a format which crfsuite likes.

    :param message: object whose ``get("spacy_doc")`` returns the document.
    :param entity_offsets: ``(start, end, label)`` entity annotations.
    :return: whatever ``self._from_text_to_crf(message, ents)`` returns, where
        ``ents`` holds one entity tag per entry of ``gold.orig_annot``.
    """
    from spacy.gold import GoldParse

    doc = message.get("spacy_doc")
    gold = GoldParse(doc, entities=entity_offsets)
    # Element 5 of each original annotation is the entity tag.
    ents = [annot[5] for annot in gold.orig_annot]
    if '-' in ents:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "Misaligned entity annotation in sentence '{}'. ".format(
                doc.text) +
            "Make sure the start and end values of the annotated training " +
            "examples end at token boundaries (e.g. don't include trailing whitespaces)."
        )
    if not self.BILOU_flag:
        for i, entity in enumerate(ents):
            if entity.startswith(('B-', 'I-', 'U-', 'L-')):
                ents[i] = entity[2:]  # removes the BILOU tags
    return self._from_text_to_crf(message, ents)
def _from_json_to_crf(self, json_eg, spacy_nlp):
    # type: (Tuple[Text, List[Tuple[int, int, Text]]], Language) -> List[Tuple[Text, Text, Text]]
    """Takes the json examples and switches them to a format which crfsuite likes.

    :param json_eg: pair ``(text, entity_offsets)``.
    :param spacy_nlp: callable turning the text into a document.
    :return: one ``(token text, token tag, entity tag)`` triple per token;
        the BILOU prefix is stripped from the entity tag unless
        ``self.BILOU_flag`` is set.
    """
    from spacy.language import Language
    from spacy.gold import GoldParse

    doc = spacy_nlp(json_eg[0])
    entity_offsets = json_eg[1]
    gold = GoldParse(doc, entities=entity_offsets)
    # Element 5 of each original annotation is the entity tag.
    ents = [annot[5] for annot in gold.orig_annot]
    if '-' in ents:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "Misaligned entity annotation in sentence '{}'. ".format(doc.text) +
            "Make sure the start and end values of the annotated training " +
            "examples end at token boundaries (e.g. don't include trailing whitespaces).")

    keep_prefix = self.BILOU_flag

    def ent_clean(entity):
        """Return the tag, without its BILOU prefix when prefixes are off."""
        if not keep_prefix and entity.startswith(('B-', 'I-', 'U-', 'L-')):
            return entity[2:]
        return entity

    return [
        (token.text, token.tag_, ent_clean(ents[i]))
        for i, token in enumerate(doc)
    ]
def main(output_dir=None):
    """Train a ``Tagger`` on ``DATA``, print tags for a sample, optionally save.

    :param output_dir: directory for the model and the string store; when
        ``None`` nothing is written.
    """
    save_to = None
    if output_dir is not None:
        save_to = Path(output_dir)
        for folder in (save_to, save_to / "pos", save_to / "vocab"):
            ensure_dir(folder)

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger(vocab)
    for _ in range(25):
        for words, tags in DATA:
            example = Doc(vocab, words=words)
            tagger.update(example, GoldParse(example, tags=tags))
        # Reorder the examples before the next pass.
        random.shuffle(DATA)
    tagger.model.end_training()

    sample_words = ["I", "like", "blue", "eggs"]
    sample = Doc(vocab, orths_and_spaces=zip(sample_words, [True] * 4))
    tagger(sample)
    for token in sample:
        print(token.text, token.tag_, token.pos_)

    if save_to is not None:
        tagger.model.dump(str(save_to / 'pos' / 'model'))
        strings_path = save_to / 'vocab' / 'strings.json'
        with strings_path.open('w') as file_:
            tagger.vocab.strings.dump(file_)
def main(n_iter=10):
    """Train a blank English NER pipe with a multitask objective on ``TRAIN_DATA``.

    :param n_iter: number of passes over ``TRAIN_DATA``.
    """
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print("Create data", len(TRAIN_DATA))

    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, (annotations, _brackets) in TRAIN_DATA:
            doc = nlp.make_doc(text)
            # Only the first annotation tuple is used for the gold parse.
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            update_options = {
                "drop": 0.2,       # dropout - make it harder to memorise data
                "sgd": optimizer,  # callable to update weights
                "losses": losses,
            }
            nlp.update([doc], [gold], **update_options)
        print(losses.get('nn_labeller', 0.0), losses['ner'])

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
        token_rows = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
        print('Entities', entity_rows)
        print('Tokens', token_rows)
def train_NER(train_data, t1Files):
    """Train an ``EntityRecognizer`` on ``train_data`` and print tags for ``t1Files``.

    :param train_data: list of ``(raw_text, entity_offsets)`` pairs; shuffled
        in place before each of the five passes.
    :param t1Files: indexable collection; ``t1Files[i][1][1]`` is read as the
        list of test sentences of file ``i``.
    """
    labels = [
        'ID', 'INCIDENT', 'WEAPON', 'PERP INDIV', 'PERP ORG', 'TARGET',
        'VICTIM'
    ]
    nlp = spacy.load('en', entity=False, parser=False)
    ner = EntityRecognizer(nlp.vocab, entity_types=labels)

    for _ in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offset in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offset)
            # The doc is tagged before the recogniser is updated on it.
            nlp.tagger(doc)
            ner.update(doc, gold)
    ner.model.end_training()

    for file_idx in range(len(t1Files)):
        sentences = t1Files[file_idx][1][1]
        for sent_idx in range(len(sentences)):
            sentence = unicode(sentences[sent_idx])
            # Run the pipeline without its entity step, then apply the
            # recogniser trained above.
            doc = nlp(sentence, entity=False)
            ner(doc)
            print("Entites on fine tuned NER:")
            for token in doc:
                print(token.text, token.orth, token.lower, token.tag_,
                      token.ent_type_, token.ent_iob)
def train_model(labels, examples, epochs=10, verbose=False):
    """Train an NER pipe on a blank ``'ru'`` pipeline and return the pipeline.

    :param labels: entity labels to register on the NER pipe.
    :param examples: iterable of mappings with a ``'text'`` string and a
        ``'labels'`` list of ``(start, end, label)`` triples.
    :param epochs: number of passes over ``examples``.
    :param verbose: if true, print every labelled text fragment first.
    :return: the trained pipeline object.
    """
    nlp = spacy.blank('ru')
    ner = create_ner(nlp)
    nlp.add_pipe(ner, last=True)
    for label in labels:
        print("Label:", label)
        ner.add_label(label)

    optimizer = nlp.begin_training()
    if verbose:
        print("Training data:")
        for example in examples:
            source = example['text']
            for begin, end, tag in example['labels']:
                print('{} : "{}"'.format(tag, source[begin: end]))

    for _epoch in tqdm(range(epochs)):
        # One example per batch.
        for batch in minibatch(list(examples), size=1):
            docs = []
            golds = []
            for item in batch:
                docs.append(nlp.tokenizer(item['text']))
            for doc, item in zip(docs, batch):
                golds.append(GoldParse(doc, entities=item['labels']))
            losses = {}
            nlp.update(docs, golds, drop=0.5, losses=losses, sgd=optimizer)
    return nlp
def _update_ner_model(self, ner, nlp, train_data):
    """Update ``ner`` on every example of ``train_data``, five passes in total.

    :param ner: object whose ``update(doc, gold)`` is called per example.
    :param nlp: object whose ``make_doc(text)`` builds each document.
    :param train_data: list of ``(raw_text, entity_offsets)`` pairs; shuffled
        in place before each pass.
    """
    passes = 5
    for _ in range(passes):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            example = nlp.make_doc(raw_text)
            annotation = GoldParse(example, entities=entity_offsets)
            ner.update(example, annotation)
def extract_docs_and_golds_from_opencorpora(nlp, opencorpora_file):
    """Parse a corpus file into pipeline docs and matching gold parses.

    For every ``text/paragraphs/paragraph/sentence`` element, the text of its
    first ``source`` child is run through ``nlp`` and the ``text`` attributes
    of its ``tokens/token`` children become the gold words.

    :param nlp: pipeline callable; its ``vocab`` backs the gold documents.
    :param opencorpora_file: path of the file to read.
    :return: ``(parsed_sentences, gold_sentences)``, two lists of equal length.
    """
    parsed_sentences = []
    gold_sentences = []
    # Decode explicitly as UTF-8: the platform default encoding may differ,
    # and the content is re-encoded to UTF-8 bytes right below.
    with open(opencorpora_file, "r", encoding="utf-8") as f:
        opencorpora = f.read().encode('utf-8')
    page_tree = html.fromstring(opencorpora)

    for text in page_tree.xpath('//text'):
        for paragraphs in text.xpath('./paragraphs'):
            for paragraph in paragraphs.xpath('./paragraph'):
                for sentence in paragraph.xpath('./sentence'):
                    # Separate name so the outer loop variable ``text`` is
                    # not rebound inside the loop.
                    source_text = sentence.xpath('./source')[0].text
                    parsed_sentences.append(nlp(source_text))
                    sent_words = [
                        token.attrib['text']
                        for token in sentence.xpath('./tokens/token')
                    ]
                    # Heads, tags and deps are not supplied; every entity
                    # tag is set to '-'.
                    gold = GoldParse(
                        Doc(nlp.vocab, words=sent_words),
                        words=sent_words,
                        entities=['-'] * len(sent_words))
                    gold_sentences.append(gold)
    return parsed_sentences, gold_sentences
def evaluate(self, verbose=1):
    """Score ``self.nlp`` on ``self.data`` and return the scorer's results.

    Parameters
    ----------
    verbose : int
        when equal to 1, print the prediction for every example after which
        the running entity F-score went down

    Returns
    -------
    the ``scores`` attribute of the ``Scorer`` after all examples
    """
    scorer = Scorer()
    wrong_case = 0
    for sample_text, annot in self.data:
        gold_doc = self.nlp.make_doc(sample_text)
        gold = GoldParse(gold_doc, entities=annot['entities'])
        prediction = self.nlp(sample_text)

        # F-score before this example is added to the scorer.
        previous_f = scorer.ents_f
        scorer.score(prediction, gold)
        score_dropped = previous_f > scorer.ents_f
        if score_dropped:
            wrong_case += 1
            if verbose == 1:
                print_beauty_NER(prediction_to_IOB(prediction, gold))
        previous_f = scorer.ents_f
    # wrong_case and len(self.data) are computed but not returned.
    return scorer.scores
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None, cand_preproc=None):
    """Score a model loaded from ``model_dir`` against ``gold_tuples``.

    :param Language: callable invoked as ``Language(data_dir=model_dir)``.
    :param gold_tuples: iterable of ``(raw_text, sents)`` pairs.
    :param model_dir: forwarded as ``data_dir``.
    :param gold_preproc: if true, the raw text is ignored and the gold
        tokenisation is used; otherwise the sentences are merged with
        ``_merge_sents`` and the raw text is processed.
    :param verbose: forwarded to ``Scorer.score``.
    :param beam_width: when not ``None``, stored on ``nlp.parser.cfg``.
    :param cand_preproc: accepted but not used here.
    :return: the ``Scorer`` holding the accumulated results.
    """
    nlp = Language(data_dir=model_dir)
    if nlp.lang == 'de':
        # For German the lemmatizer is replaced by one returning the string
        # itself.
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width

    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, _brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text)
            else:
                # Use the gold words and run each component by hand.
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                for component in (nlp.tagger, nlp.parser, nlp.entity):
                    component(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def test_textcat_learns_multilabel():
    """A ``TextCategorizer`` learns to label a document by its fifth word."""
    random.seed(5)
    numpy.random.seed(5)
    nlp = Language()
    letters = ["a", "b", "c"]
    padding = ["d"] * 3

    def make_doc(first, second):
        # Two letters surrounded by three filler words on each side.
        return Doc(nlp.vocab, words=["d"] * 3 + [first, second] + ["d"] * 3)

    pairs = [(w1, w2) for w1 in letters for w2 in letters]

    docs = []
    for w1, w2 in pairs:
        cats = {letter: float(w2 == letter) for letter in letters}
        docs.append((make_doc(w1, w2), cats))
    random.shuffle(docs)

    model = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        model.add_label(letter)
    optimizer = model.begin_training()

    for _ in range(30):
        losses = {}
        golds = [GoldParse(doc, cats=cats) for doc, cats in docs]
        inputs = [doc for doc, _cats in docs]
        model.update(inputs, golds, sgd=optimizer, losses=losses)
        random.shuffle(docs)

    for w1, w2 in pairs:
        doc = make_doc(w1, w2)
        truth = {letter: w2 == letter for letter in letters}
        model(doc)
        for cat, score in doc.cats.items():
            if truth[cat]:
                assert score > 0.5
            else:
                assert score < 0.5
def main(n_iter=10):
    """Train a blank English NER pipe with a multitask objective on ``TRAIN_DATA``.

    Each annotation tuple of each example becomes its own single-document
    update, built from the annotated words rather than from the raw text.

    :param n_iter: number of passes over ``TRAIN_DATA``.
    """
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print(nlp.pipeline)
    print("Create data", len(TRAIN_DATA))

    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for _text, annot_brackets in TRAIN_DATA:
            for annotations, _brackets in annot_brackets:
                # annotations[1] holds the words of this sentence.
                doc = Doc(nlp.vocab, words=annotations[1])
                gold = GoldParse.from_annot_tuples(doc, annotations)
                update_options = {
                    "drop": 0.2,       # dropout - make it harder to memorise data
                    "sgd": optimizer,  # callable to update weights
                    "losses": losses,
                }
                nlp.update([doc], [gold], **update_options)
        print(losses.get("nn_labeller", 0.0), losses["ner"])

    # test the trained model
    for text, _ in TRAIN_DATA:
        if text is None:
            continue
        doc = nlp(text)
        entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
        token_rows = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
        print("Entities", entity_rows)
        print("Tokens", token_rows)
def test_tokenizer():
    """Compare non-"O" gold tags with the spans ``char_span`` can resolve."""
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")

    # Case 1: the number of non-"O" tags equals the number of offsets.
    offsets = [(0, 4, "PERS"), (9, 11, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    spans = []
    for start, end, _label in offsets:
        spans.append(doc.char_span(start, end))
    tagged = len([tag for tag in gold.ner if tag != "O"])
    assert tagged == len(spans)

    # Case 2: more tokens carry a non-"O" tag than there are offsets that
    # char_span resolves to a span.
    offsets = [(0, 4, "PERS"), (9, 12, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    candidates = [doc.char_span(start, end) for start, end, _label in offsets]
    resolved = [span for span in candidates if span is not None]
    tagged = len([tag for tag in gold.ner if tag != "O"])
    assert tagged > len(resolved)
def _from_json_to_crf(self, json_eg, spacy_nlp):
    # type: (Tuple[Text, List[Tuple[int, int, Text]]], Language) -> List[Tuple[Text, Text, Text]]
    """Convert one json example into the triples handed to crfsuite.

    :param json_eg: pair ``(text, entity_offsets)``.
    :param spacy_nlp: callable turning the text into a document.
    :return: one ``(token text, token tag, entity tag)`` triple per token;
        the BILOU prefix is stripped from the entity tag unless
        ``self.BILOU_flag`` is set.
    """
    from spacy.language import Language
    from spacy.gold import GoldParse

    text, entity_offsets = json_eg[0], json_eg[1]
    doc = spacy_nlp(text)
    gold = GoldParse(doc, entities=entity_offsets)
    # Element 5 of each original annotation is the entity tag.
    tags = [annot[5] for annot in gold.orig_annot]

    strip_prefix = not self.BILOU_flag
    crf_format = []
    for position in range(len(doc)):
        tag = tags[position]
        if strip_prefix and (
                tag.startswith('B-') or tag.startswith('I-') or
                tag.startswith('U-') or tag.startswith('L-')):
            tag = tag[2:]
        crf_format.append((doc[position].text, doc[position].tag_, tag))
    return crf_format
def evaluate_ner(self, model, eval_set, ent_types):
    """Score a Named Entity model on an evaluation set.

    Arguments:
        model -- callable pipeline that also offers ``make_doc``
        eval_set (list) -- pairs of document text and a mapping whose
            ``'entities'`` entry lists ``[start, end, "TYPE"]`` items
        ent_types (list) -- entity types to keep; annotations whose last
            element is not in this list are ignored

    Returns:
        the ``scores`` attribute of the ``Scorer`` after all examples
    """
    scorer = Scorer()
    for text, expected in eval_set:
        # Keep only the annotations of the requested types.
        wanted = [
            ent for ent in expected.get('entities')
            if ent[-1] in ent_types
        ]
        truth = GoldParse(model.make_doc(text), entities=wanted)
        scorer.score(model(text), truth)
    return scorer.scores
def predict(self, list_data):
    """
    Score the model stored in ``self.output_dir`` on the given dataset.

    :param list_data: list of ``(text, annotation)`` pairs where
        ``annotation['entities']`` holds the gold entities,
        e.g. ``("This is a nice summer", {"entities": [(15, 21, "SEASON")]})``
    :return: dictionary with the keys ``'precision'``, ``'recall'`` and
        ``'F1'``; it is also stored in ``self.dict_performance``
    """
    # The customised model is loaded from disk on every call.
    model = spacy.load(self.output_dir)
    scorer = Scorer()
    for text, annotation in list_data:
        gold_doc = model.make_doc(text)
        gold = GoldParse(gold_doc, entities=annotation['entities'])
        scorer.score(model(text), gold)

    scores = scorer.scores
    performance = {}
    performance['precision'] = scores['ents_p']
    performance['recall'] = scores['ents_r']
    performance['F1'] = scores['ents_f']
    self.dict_performance = performance
    return self.dict_performance
def evaluate(tokenizer, nlp, valid_data, labels):
    """Compute accuracy and loss of the text categoriser on ``valid_data``.

    :param tokenizer: accepted but not used here.
    :param nlp: pipeline providing ``get_pipe`` and ``pipe``.
    :param valid_data: iterable of ``(text, annotation)`` pairs where
        ``annotation["cats"]`` maps each label to its gold value.
    :param labels: collection of labels; only its length is used.
    :return: ``(accuracy, loss)``.
    :raises ValueError: if a document carries a score for a label that is
        missing from its gold categories.
    """
    texts, cats = zip(*valid_data)
    pipe_name = PIPES.textcat if is_transformer(nlp) else "textcat"
    textcat = nlp.get_pipe(pipe_name)

    # The score matrix goes through the model's ops module so it works
    # with GPU (cupy array) as well as CPU (numpy array).
    scores = np.zeros((len(cats), len(labels)), dtype="f")
    scores = textcat.model.ops.asarray(scores)

    golds = []
    num_correct = 0
    for row, doc in enumerate(nlp.pipe(texts)):
        gold_cats = cats[row]["cats"]
        for col, (label, score) in enumerate(doc.cats.items()):
            if label not in gold_cats:
                raise ValueError(f"Prediction for unexpected label: {label}")
            scores[row, col] = score
            predicted = score > 0.5
            if predicted == bool(gold_cats[label]):
                num_correct += 1
        golds.append(GoldParse(doc, cats=gold_cats))

    # The small constant keeps the division defined for empty input.
    total = (len(texts) * len(labels)) + 1e-8
    accuracy = num_correct / total
    loss, _ = textcat.get_loss(texts, golds, scores)
    return accuracy, loss
def _from_json_to_crf(
        self,
        message,  # type: Message
        entity_offsets  # type: List[Tuple[int, int, Text]]
):
    # type: (...) -> List[Tuple[Text, Text, Text, Text]]
    """Convert json examples to format of underlying crfsuite.

    :param message: object whose ``get("spacy_doc")`` returns the document.
    :param entity_offsets: ``(start, end, label)`` entity annotations.
    :return: whatever ``self._from_text_to_crf(message, ents)`` returns, where
        ``ents`` holds one entity tag per entry of ``gold.orig_annot``.
    """
    from spacy.gold import GoldParse

    doc = message.get("spacy_doc")
    gold = GoldParse(doc, entities=entity_offsets)
    # Element 5 of each original annotation is the entity tag.
    ents = [annot[5] for annot in gold.orig_annot]
    if '-' in ents:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Misaligned entity annotation in sentence '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces).".format(doc.text))
    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)
    return self._from_text_to_crf(message, ents)
def train(nlp, data, ents, num_iterations=20):
    """
    Train a new ``EntityRecognizer`` on offset-annotated sentences.

    :param nlp: nlp instance (its ``make_doc`` and ``vocab`` are used)
    :param data: training data (see the format below); shuffled in place
    :param ents: list of entity types
    :param num_iterations: number of passes over the data
    :return: the trained ``EntityRecognizer``
    """
    # Expected format of ``data``:
    # train_data = [
    #     (
    #         'Who is Shaka Khan?',
    #         [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
    #     ), ...
    # ]

    # Look every word of the training sentences up in the vocab before the
    # recogniser is created.
    for sentence, _offsets in data:
        for token in nlp.make_doc(sentence):
            nlp.vocab[token.orth]

    recogniser = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        random.shuffle(data)
        for sentence, entity_offsets in data:
            example = nlp.make_doc(sentence)
            annotation = GoldParse(example, entities=entity_offsets)
            recogniser.update(example, annotation)
    return recogniser