def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model, **config)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
def train(nlp, data, ents, num_iterations=20):
    """
    :param nlp: nlp instance
    :param data: training data (see the required format below)
    :param ents: list of entities
    :param num_iterations: number of iterations to train
    :return: trained NER tagger
    """
    # Example:
    # train_data = [
    #     (
    #         'Who is Shaka Khan?',
    #         [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
    #     ), ...
    # ]
    # Make sure every token from the training sentences is in the vocab.
    for sent, _ in data:
        doc = nlp.make_doc(sent)
        for word in doc:
            _ = nlp.vocab[word.orth]
    result_NER = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        random.shuffle(data)
        for sent, entity_offsets in data:
            doc = nlp.make_doc(sent)
            gold = GoldParse(doc, entities=entity_offsets)
            result_NER.update(doc, gold)
    return result_NER
def test_ents_reset(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = get_doc(en_vocab, text)
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert [t.ent_iob_ for t in doc] == ["O"] * len(doc)
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == ["O"] * len(doc)
def test_add_label_deserializes_correctly():
    ner1 = EntityRecognizer(Vocab())
    ner1.add_label("C")
    ner1.add_label("B")
    ner1.add_label("A")
    ner1.begin_training([])
    ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes())
    assert ner1.moves.n_moves == ner2.moves.n_moves
    for i in range(ner1.moves.n_moves):
        assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
def train_ner(nlp, train_data, entity_types):
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    return ner
def train_NER(filepath, vocab, iterations=20):
    print("Training {} iterations".format(iterations))
    docs, postags, entities = read_connl(filepath, vocab)
    ner = EntityRecognizer(vocab, entity_types=LABELS)
    for i in range(iterations):
        if i % 5 == 0:
            print("Iteration {}...".format(i))
        for doc, entity_list in zip(docs, entities):
            ner.update(doc, GoldParse(doc, entities=entity_list))
    print("Done training.")
    return docs, ner
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == ["O"] * len(doc)
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs
def train(self, nlp, entity_examples):
    train_data = self.convert_examples(entity_examples)
    ent_types = [[ent["entity"] for ent in ex["entities"]] for ex in entity_examples]
    entity_types = list(set(sum(ent_types, [])))
    self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            self.ner.update(doc, gold)
    self.ner.model.end_training()
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
def runspacymodel(sentences, tagger, model):
    # model = 'en_core_web_sm'
    nlp = spacy.load(model)
    ner = EntityRecognizer(nlp.vocab)
    ner.from_disk(tagger)
    result = []
    for sentence in sentences:
        doc = spacy.tokens.doc.Doc(nlp.vocab, words=sentence)
        # run ner against every sentence
        processed = ner(doc)
        for token in processed:
            result.append([token.text, token.ent_type_])
    return result
def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner
def train_query(queryObj):
    global nlp
    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner
    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))
    ENTITY_OFFSETS = []
    ENTITY_LIST = []
    for txt in parsed_ner:
        cur_entity = parsed_ner[txt]
        cur_index = querystring.find(txt)
        # If the string is found in the querystring
        if cur_index != -1:
            ENTITY_OFFSETS.append(
                (cur_index, cur_index + len(txt), cur_entity))
            # Add the entity to the entity list if it's not already there
            if cur_entity not in ENTITY_LIST:
                ENTITY_LIST.append(cur_entity)
    # Our training data
    TRAIN_DATA = [
        (querystring, ENTITY_OFFSETS),
    ]
    # Trains the model; loads up existing data if it exists
    ner = EntityRecognizer(nlp.vocab, entity_types=ENTITY_LIST)
    # If our model exists, we load it
    for itn in range(25):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    # Save the model
    ner.model.dump(model_path)
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs
def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return nlp, ner
def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = en_core_web_sm.load()
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return nlp, ner
def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    ner = EntityRecognizer(doc.vocab)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
class SpacyEntityExtractor(object):
    def __init__(self, nlp=None, extractor_file=None):
        if extractor_file:
            self.ner = EntityRecognizer.load(pathlib.Path(extractor_file), nlp.vocab)
        else:
            self.ner = None

    def convert_examples(self, entity_examples):
        def convert_entity(ent):
            return ent["start"], ent["end"], ent["entity"]

        def convert_example(ex):
            return ex["text"], [convert_entity(ent) for ent in ex["entities"]]

        return [convert_example(ex) for ex in entity_examples]

    def train(self, nlp, entity_examples):
        train_data = self.convert_examples(entity_examples)
        ent_types = [[ent["entity"] for ent in ex["entities"]] for ex in entity_examples]
        entity_types = list(set(sum(ent_types, [])))
        self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
        for itn in range(5):
            random.shuffle(train_data)
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                self.ner.update(doc, gold)
        self.ner.model.end_training()

    def extract_entities(self, nlp, sentence):
        doc = nlp.make_doc(sentence)
        nlp.tagger(doc)
        self.ner(doc)
        entities = [
            {
                "entity": ent.label_,
                "value": ent.text,
                "start": ent.start_char,
                "end": ent.end_char
            }
            for ent in doc.ents]
        return entities
def load(cls, model_dir, entity_extractor_spacy, fine_tune_spacy_ner, spacy_nlp):
    # type: (Text, Text, bool, Language) -> SpacyEntityExtractor
    from spacy.pipeline import EntityRecognizer

    if model_dir and entity_extractor_spacy:
        ner_dir = os.path.join(model_dir, entity_extractor_spacy)
        ner = EntityRecognizer.load(pathlib.Path(ner_dir), spacy_nlp.vocab)
        return SpacyEntityExtractor(fine_tune_spacy_ner, ner)
    else:
        return SpacyEntityExtractor(fine_tune_spacy_ner)
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
def __init__(self, nlp=None, extractor_file=None, should_fine_tune_spacy_ner=False):
    self.nlp = nlp
    if extractor_file:
        self.ner = EntityRecognizer.load(pathlib.Path(extractor_file), nlp.vocab)
    else:
        self.ner = None
    self.should_fine_tune_spacy_ner = should_fine_tune_spacy_ner
def test_issue3345(entity_ruler_factory):
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
def test_overwrite_token():
    nlp = English()
    ner1 = nlp.create_pipe("ner")
    nlp.add_pipe(ner1, name="ner")
    nlp.begin_training()
    # The untrained NER will predict O for each token
    doc = nlp("I live in New York")
    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
    # Check that a new ner can overwrite O
    ner2 = EntityRecognizer(doc.vocab)
    ner2.moves.add_action(5, "")
    ner2.add_label("GPE")
    state = ner2.moves.init_batch([doc])[0]
    assert ner2.moves.is_valid(state, "B-GPE")
    assert ner2.moves.is_valid(state, "U-GPE")
    ner2.moves.apply_transition(state, "B-GPE")
    assert ner2.moves.is_valid(state, "I-GPE")
    assert ner2.moves.is_valid(state, "L-GPE")
def test_issue4313():
    """This should not crash or exit with some strange error code"""
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    ner = EntityRecognizer(nlp.vocab)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)
    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]
    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(docs, beam_width=beam_width, beam_density=beam_density)
    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
def get_model(model_name):
    if model_name not in _models:
        model = spacy.load(model_name)
        if model.tagger is None:
            model.tagger = Tagger(model.vocab, features=Tagger.feature_templates)
        if model.entity is None:
            model.entity = EntityRecognizer(
                model.vocab,
                entity_types=['PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC',
                              'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE',
                              'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
                              'ORDINAL', 'CARDINAL'])
        model.pipeline = [model.tagger, model.entity, model.parser]
        _models[model_name] = model
    return _models[model_name]
def predictEnt(query):
    nlp = spacy.load('en', parser=False)
    doc = nlp.make_doc(query)
    vocab_dir = pathlib.Path('ner/vocab')
    with (vocab_dir / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')
    ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=True)
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        if word.ent_type_ == 'PRODUCT':
            return word.text
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second of the two bugs underlying issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab
    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]
    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels
    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)
        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
def test_accept_blocked_token():
    """Test successful blocking of tokens to be in an entity."""
    # 1. test normal behaviour
    nlp1 = English()
    doc1 = nlp1("I live in New York")
    ner1 = EntityRecognizer(doc1.vocab)
    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
    # Add the OUT action
    ner1.moves.add_action(5, "")
    ner1.add_label("GPE")
    # Get into the state just before "New"
    state1 = ner1.moves.init_batch([doc1])[0]
    ner1.moves.apply_transition(state1, "O")
    ner1.moves.apply_transition(state1, "O")
    ner1.moves.apply_transition(state1, "O")
    # Check that B-GPE is valid.
    assert ner1.moves.is_valid(state1, "B-GPE")

    # 2. test blocking behaviour
    nlp2 = English()
    doc2 = nlp2("I live in New York")
    ner2 = EntityRecognizer(doc2.vocab)
    # set "New York" to a blocked entity
    doc2.ents = [(0, 3, 5)]
    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
    # Check that B-GPE is now invalid.
    ner2.moves.add_action(4, "")
    ner2.moves.add_action(5, "")
    ner2.add_label("GPE")
    state2 = ner2.moves.init_batch([doc2])[0]
    ner2.moves.apply_transition(state2, "O")
    ner2.moves.apply_transition(state2, "O")
    ner2.moves.apply_transition(state2, "O")
    # we can only use U- for "New"
    assert not ner2.moves.is_valid(state2, "B-GPE")
    assert ner2.moves.is_valid(state2, "U-")
    ner2.moves.apply_transition(state2, "U-")
    # we can only use U- for "York"
    assert not ner2.moves.is_valid(state2, "B-GPE")
    assert ner2.moves.is_valid(state2, "U-")
def predict(query):
    # Load NER
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    vocab_dir = pathlib.Path('ner/vocab')
    with (vocab_dir / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')
    ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=False)
    doc = nlp.make_doc(query)
    # nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.ent_type_)
    for word in doc:
        if word.ent_type_:
            print('word -> {} and entity -> {}'.format(word.text, word.ent_type_))
def get_query(queryObj):
    global nlp
    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring
    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))
    ENTITY_LIST = []
    for attribute in story.storyattribute_set.all():
        ENTITY_LIST.append(str(attribute.attribute))
    # Initialize spaCy modules
    ner = EntityRecognizer(nlp.vocab, entity_types=ENTITY_LIST)
    # Only tag entities if there is an existing dataset
    if os.path.isfile(model_path):
        ner.model.load(model_path)
        # Tag the doc
        doc = nlp.make_doc(querystring)
        nlp.tagger(doc)
        ner(doc)
        # Formatted dict, i.e. in JSON format
        ner_dict = {}
        for word in doc:
            if word.ent_type_:
                ner_dict[word.text] = word.ent_type_
        # Save dict as our parsed ner
        queryObj.parsed_ner = ner_dict
    # Returns empty dict otherwise
    else:
        queryObj.parsed_ner = {}
def load_ner_model(vocab, path):
    return EntityRecognizer.load(path, vocab)