def load_saved_model():
    ner_model = NERCRF()
    ner_model.build(model_info['sentence_len'],
                    model_info['word_len'],
                    model_info['num_of_labels'],
                    model_info['word_vocab'],
                    model_info['vocab_size'],
                    model_info['char_vocab_size'],
                    word_embedding_dims=model_info['word_embedding_dims'],
                    char_embedding_dims=model_info['char_embedding_dims'],
                    word_lstm_dims=model_info['word_lstm_dims'],
                    tagger_lstm_dims=model_info['tagger_lstm_dims'],
                    dropout=model_info['dropout'],
                    external_embedding_model=model_info['external_embedding_model'])
    ner_model.load(args.model_path)
    return ner_model
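# load_saved_model() reads topology parameters from a module-level model_info
# dict and weights from args.model_path. A minimal sketch of the surrounding
# setup, assuming model_info.dat was pickled at training time; the names
# args.model_info_path, encoded_words, and encoded_chars are illustrative.
import pickle

with open(args.model_info_path, 'rb') as fp:
    model_info = pickle.load(fp)

ner_model = load_saved_model()
# inputs must be padded to sentence_len / word_len, exactly as done in
# NerApi.encode_input further below
tags = ner_model.predict([encoded_words, encoded_chars]).argmax(2)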
# unpack the padded word/char/label tensors produced by the dataset loader
x_train, x_char_train, y_train = dataset.train_set
x_test, x_char_test, y_test = dataset.test_set
num_y_labels = len(dataset.y_labels) + 1
vocabulary_size = dataset.word_vocab_size
char_vocabulary_size = dataset.char_vocab_size

y_test = keras.utils.to_categorical(y_test, num_y_labels)
y_train = keras.utils.to_categorical(y_train, num_y_labels)

ner_model = NERCRF(use_cudnn=args.use_cudnn)  # pylint: disable=unexpected-keyword-arg
ner_model.build(args.word_length,
                num_y_labels,
                vocabulary_size,
                char_vocabulary_size,
                word_embedding_dims=args.word_embedding_dims,
                char_embedding_dims=args.character_embedding_dims,
                tagger_lstm_dims=args.entity_tagger_lstm_dims,
                dropout=args.dropout)

# initialize word embeddings if an external embedding model was selected
if args.embedding_model is not None:
    embedding_model, _ = load_word_embeddings(args.embedding_model)
    embedding_mat = get_embedding_matrix(embedding_model, dataset.word_vocab)
    ner_model.load_embedding_weights(embedding_mat)

train_inputs = [x_train, x_char_train]
test_inputs = [x_test, x_char_test]
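# The block above prepares train_inputs/test_inputs but stops before the
# training call. A minimal sketch of how it would continue, mirroring the
# ConllCallback/fit pattern used in the older variant below (args.b and
# args.e are the batch-size and epoch arguments used there).
conll_cb = ConllCallback(test_inputs, y_test, dataset.y_labels, batch_size=args.b)
ner_model.fit(x=train_inputs, y=y_train,
              batch_size=args.b,
              epochs=args.e,
              callbacks=[conll_cb],
              validation=(test_inputs, y_test))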
x_test, x_char_test, y_test = dataset.test
num_y_labels = len(dataset.y_labels) + 1
vocabulary_size = dataset.word_vocab_size + 1
char_vocabulary_size = dataset.char_vocab_size + 1

y_test = to_categorical(y_test, num_y_labels)
y_train = to_categorical(y_train, num_y_labels)

ner_model = NERCRF()
ner_model.build(args.sentence_length,
                args.word_length,
                num_y_labels,
                dataset.word_vocab,
                vocabulary_size,
                char_vocabulary_size,
                word_embedding_dims=args.word_embedding_dims,
                char_embedding_dims=args.character_embedding_dims,
                word_lstm_dims=args.char_features_lstm_dims,
                tagger_lstm_dims=args.entity_tagger_lstm_dims,
                dropout=args.dropout,
                external_embedding_model=args.embedding_model)

conll_cb = ConllCallback([x_test, x_char_test], y_test, dataset.y_labels,
                         batch_size=args.b)
ner_model.fit(x=[x_train, x_char_train], y=y_train,
              batch_size=args.b,
              epochs=args.e,
              callbacks=[conll_cb],
              validation=([x_test, x_char_test], y_test))
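# Neither variant persists the trained network. A hedged sketch of saving the
# weights and pickling the model_info dict whose keys load_saved_model() above
# and NerApi.load_model() below read. Assumptions: NERCRF exposes a save()
# counterpart to the load() used above, args.model_info_path exists, and
# dataset.y_labels maps label -> id.
import pickle

ner_model.save(args.model_path)
with open(args.model_info_path, 'wb') as fp:
    info = {
        'sentence_len': args.sentence_length,
        'word_len': args.word_length,
        'num_of_labels': num_y_labels,
        'word_vocab': dataset.word_vocab,
        'vocab_size': vocabulary_size,
        'char_vocab': dataset.char_vocab,
        'char_vocab_size': char_vocabulary_size,
        'word_embedding_dims': args.word_embedding_dims,
        'char_embedding_dims': args.character_embedding_dims,
        'word_lstm_dims': args.char_features_lstm_dims,
        'tagger_lstm_dims': args.entity_tagger_lstm_dims,
        'dropout': args.dropout,
        'external_embedding_model': args.embedding_model,
        # id -> label mapping used by NerApi.pretty_print
        'labels_id_to_word': {v: k for k, v in dataset.y_labels.items()},
    }
    pickle.dump(info, fp)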
# imports assume the nlp_architect package layout
import pickle
import sys
from os import makedirs, path

import numpy as np
from keras.preprocessing.sequence import pad_sequences

from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.ner_crf import NERCRF
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance

# tokenizer used by process_text
nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])


class NerApi(AbstractApi):
    """NER model API."""

    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, ner_model=None, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
        self._download_pretrained_model(prompt)

    def encode_word(self, word):
        # index 1 is reserved for out-of-vocabulary words
        return self.model_info['word_vocab'].get(word, 1.0)

    def encode_word_chars(self, word):
        return [self.model_info['char_vocab'].get(c, 1.0) for c in word]

    def encode_input(self, text_arr):
        sentence = []
        sentence_chars = []
        for word in text_arr:
            sentence.append(self.encode_word(word))
            sentence_chars.append(self.encode_word_chars(word))
        encoded_sentence = pad_sequences([np.asarray(sentence)],
                                         maxlen=self.model_info['sentence_len'])
        chars_padded = pad_sequences(sentence_chars,
                                     maxlen=self.model_info['word_len'])
        # left-pad the char matrix with all-zero rows up to sentence_len words
        if self.model_info['sentence_len'] - chars_padded.shape[0] > 0:
            chars_padded = np.concatenate(
                (np.zeros((self.model_info['sentence_len'] - chars_padded.shape[0],
                           self.model_info['word_len'])),
                 chars_padded))
        encoded_chars = chars_padded.reshape(1, self.model_info['sentence_len'],
                                             self.model_info['word_len'])
        return encoded_sentence, encoded_chars

    def _prompt(self):
        response = input('\nTo download \'{}\', please enter YES: '.format('ner'))
        res = response.lower().strip()
        if res in ('yes', 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Download the pre-trained NER model if it does not exist locally."""
        dir_path = path.join(self.dir, 'ner-pretrained')
        if not path.isfile(path.join(dir_path, 'model.h5')):
            print('The pre-trained models to be downloaded for the NER dataset '
                  'are licensed under Apache 2.0. By downloading, you accept the '
                  'terms and conditions provided by the license')
            makedirs(dir_path, exist_ok=True)
            if prompt:
                agreed = self._prompt()
                if not agreed:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model.h5', self.model_path)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model_info.dat', self.model_info_path)
            print('Done.')

    def load_model(self):
        with open(self.model_info_path, 'rb') as fp:
            self.model_info = pickle.load(fp)
        self.model = NERCRF()
        self.model.build(
            self.model_info['sentence_len'],
            self.model_info['word_len'],
            self.model_info['num_of_labels'],
            self.model_info['word_vocab'],
            self.model_info['vocab_size'],
            self.model_info['char_vocab_size'],
            word_embedding_dims=self.model_info['word_embedding_dims'],
            char_embedding_dims=self.model_info['char_embedding_dims'],
            word_lstm_dims=self.model_info['word_lstm_dims'],
            tagger_lstm_dims=self.model_info['tagger_lstm_dims'],
            dropout=self.model_info['dropout'],
            external_embedding_model=self.model_info['external_embedding_model'])
        self.model.load(self.model_path)

    def pretty_print(self, text, tags):
        # predictions cover the padded sentence, so keep only the last
        # len(text) tags and map the ids back to label strings
        tags_str = [self.model_info['labels_id_to_word'].get(t, None)
                    for t in tags[0]][-len(text):]
        mapped = [{'index': idx, 'word': el, 'label': tags_str[idx]}
                  for idx, el in enumerate(text)]
        counter = 0
        spans = []
        for obj in mapped:
            if obj['label'] != 'O':
                spans.append({'start': counter,
                              'end': counter + len(obj['word']),
                              'type': obj['label']})
            counter += len(obj['word']) + 1
        # deduplicate entity types for the annotation set
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {'doc_text': ' '.join(text),
               'annotation_set': list(ents),
               'spans': spans,
               'title': 'None'}
        return {'doc': ret, 'type': 'high_level'}

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return nlp.tokenize(input_text)

    def inference(self, doc):
        text_arr = self.process_text(doc)
        words, chars = self.encode_input(text_arr)
        tags = self.model.predict([words, chars])
        tags = tags.argmax(2)
        return self.pretty_print(text_arr, tags)
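# A short usage sketch of the API above; the sentence and the printed spans
# are illustrative, and prompt=False simply skips the interactive download
# confirmation.
api = NerApi(prompt=False)
api.load_model()
result = api.inference('John is planning a trip to London next week')
print(result['doc']['spans'])
# e.g. [{'start': 0, 'end': 4, 'type': 'B-PER'}, ...] depending on the tag set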