Example #1
def load_saved_model():
    """Rebuild the NER-CRF topology from saved hyperparameters and load trained weights."""
    # model_info (a dict of saved hyperparameters) and args are defined by the enclosing script
    ner_model = NERCRF()
    ner_model.build(model_info['sentence_len'],
                    model_info['word_len'],
                    model_info['num_of_labels'],
                    model_info['word_vocab'],
                    model_info['vocab_size'],
                    model_info['char_vocab_size'],
                    word_embedding_dims=model_info['word_embedding_dims'],
                    char_embedding_dims=model_info['char_embedding_dims'],
                    word_lstm_dims=model_info['word_lstm_dims'],
                    tagger_lstm_dims=model_info['tagger_lstm_dims'],
                    dropout=model_info['dropout'],
                    external_embedding_model=model_info['external_embedding_model'])
    ner_model.load(args.model_path)
    return ner_model
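
This snippet relies on two free variables, model_info and args. A minimal sketch of the assumed surrounding context, with NERCRF imported from nlp-architect's nlp_architect.models.ner_crf; the --model_info_path flag name is a hypothetical:

import argparse
import pickle

from nlp_architect.models.ner_crf import NERCRF

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', required=True)
parser.add_argument('--model_info_path', required=True)  # hypothetical flag name
args = parser.parse_args()

# model_info is the dict of hyperparameters pickled at training time
with open(args.model_info_path, 'rb') as fp:
    model_info = pickle.load(fp)

ner_model = load_saved_model()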
Example #2
    x_train, x_char_train, y_train = dataset.train_set
    x_test, x_char_test, y_test = dataset.test_set

    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size
    char_vocabulary_size = dataset.char_vocab_size

    y_test = keras.utils.to_categorical(y_test, num_y_labels)
    y_train = keras.utils.to_categorical(y_train, num_y_labels)

    ner_model = NERCRF(use_cudnn=args.use_cudnn)
    # pylint: disable=unexpected-keyword-arg
    ner_model.build(
        args.word_length,
        num_y_labels,
        vocabulary_size,
        char_vocabulary_size,
        word_embedding_dims=args.word_embedding_dims,
        char_embedding_dims=args.character_embedding_dims,
        tagger_lstm_dims=args.entity_tagger_lstm_dims,
        dropout=args.dropout,
    )

    # initialize word embeddings if an external embedding model was selected
    if args.embedding_model is not None:
        embedding_model, _ = load_word_embeddings(args.embedding_model)
        embedding_mat = get_embedding_matrix(embedding_model,
                                             dataset.word_vocab)
        ner_model.load_embedding_weights(embedding_mat)

    train_inputs = [x_train, x_char_train]
    test_inputs = [x_test, x_char_test]
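
The fragment stops before training. A plausible continuation under the same assumptions, mirroring the ConllCallback and fit() wiring of Example #3 below (not part of the original snippet):

    # plausible continuation: report CoNLL metrics on the test set each epoch
    conll_cb = ConllCallback(test_inputs, y_test, dataset.y_labels,
                             batch_size=args.b)
    ner_model.fit(x=train_inputs, y=y_train,
                  batch_size=args.b,
                  epochs=args.e,
                  callbacks=[conll_cb],
                  validation=(test_inputs, y_test))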
Example #3
    x_train, x_char_train, y_train = dataset.train
    x_test, x_char_test, y_test = dataset.test

    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size + 1
    char_vocabulary_size = dataset.char_vocab_size + 1

    y_test = to_categorical(y_test, num_y_labels)
    y_train = to_categorical(y_train, num_y_labels)

    ner_model = NERCRF()
    ner_model.build(args.sentence_length,
                    args.word_length,
                    num_y_labels,
                    dataset.word_vocab,
                    vocabulary_size,
                    char_vocabulary_size,
                    word_embedding_dims=args.word_embedding_dims,
                    char_embedding_dims=args.character_embedding_dims,
                    word_lstm_dims=args.char_features_lstm_dims,
                    tagger_lstm_dims=args.entity_tagger_lstm_dims,
                    dropout=args.dropout,
                    external_embedding_model=args.embedding_model)

    conll_cb = ConllCallback([x_test, x_char_test], y_test, dataset.y_labels,
                             batch_size=args.b)

    ner_model.fit(x=[x_train, x_char_train], y=y_train,
                  batch_size=args.b,
                  epochs=args.e,
                  callbacks=[conll_cb],
                  validation=([x_test, x_char_test], y_test))
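
After training, Example #1's load_saved_model expects the weights plus a pickled dict of hyperparameters on disk. A hedged sketch of that persistence step: a save() counterpart to the load() used elsewhere is assumed, the dict keys mirror the build() arguments above, and the model_info_path flag name is hypothetical:

    import pickle

    ner_model.save(args.model_path)  # assumed counterpart of ner_model.load()
    model_info = {
        'sentence_len': args.sentence_length,
        'word_len': args.word_length,
        'num_of_labels': num_y_labels,
        'word_vocab': dataset.word_vocab,
        'vocab_size': vocabulary_size,
        'char_vocab_size': char_vocabulary_size,
        'word_embedding_dims': args.word_embedding_dims,
        'char_embedding_dims': args.character_embedding_dims,
        'word_lstm_dims': args.char_features_lstm_dims,
        'tagger_lstm_dims': args.entity_tagger_lstm_dims,
        'dropout': args.dropout,
        'external_embedding_model': args.embedding_model,
    }
    with open(args.model_info_path, 'wb') as fp:  # hypothetical flag name
        pickle.dump(model_info, fp)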
Example #4
class NerApi(AbstractApi):
    """
    Ner model API
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, ner_model=None, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
        self._download_pretrained_model(prompt)

    def encode_word(self, word):
        # out-of-vocabulary words fall back to index 1.0
        return self.model_info['word_vocab'].get(word, 1.0)

    def encode_word_chars(self, word):
        # out-of-vocabulary characters fall back to index 1.0
        return [self.model_info['char_vocab'].get(c, 1.0) for c in word]

    def encode_input(self, text_arr):
        sentence = []
        sentence_chars = []
        for word in text_arr:
            sentence.append(self.encode_word(word))
            sentence_chars.append(self.encode_word_chars(word))
        encoded_sentence = pad_sequences(
            [np.asarray(sentence)], maxlen=self.model_info['sentence_len'])
        chars_padded = pad_sequences(sentence_chars,
                                     maxlen=self.model_info['word_len'])
        if self.model_info['sentence_len'] - chars_padded.shape[0] > 0:
            chars_padded = np.concatenate((np.zeros(
                (self.model_info['sentence_len'] - chars_padded.shape[0],
                 self.model_info['word_len'])), chars_padded))
        encoded_chars = chars_padded.reshape(1,
                                             self.model_info['sentence_len'],
                                             self.model_info['word_len'])
        return encoded_sentence, encoded_chars

    def _prompt(self):
        response = input(
            '\nTo download \'{}\', please enter YES: '.format('ner'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        dir_path = path.join(self.dir, 'ner-pretrained')
        if not path.isfile(path.join(dir_path, 'model.h5')):
            print(
                'The pre-trained models to be downloaded for the NER dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(dir_path, exist_ok=True)
            if prompt is True:
                agreed = self._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model.h5', self.model_path)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model_info.dat', self.model_info_path)
            print('Done.')

    def load_model(self):
        with open(self.model_info_path, 'rb') as fp:
            self.model_info = pickle.load(fp)
            self.model = NERCRF()
            self.model.build(
                self.model_info['sentence_len'],
                self.model_info['word_len'],
                self.model_info['num_of_labels'],
                self.model_info['word_vocab'],
                self.model_info['vocab_size'],
                self.model_info['char_vocab_size'],
                word_embedding_dims=self.model_info['word_embedding_dims'],
                char_embedding_dims=self.model_info['char_embedding_dims'],
                word_lstm_dims=self.model_info['word_lstm_dims'],
                tagger_lstm_dims=self.model_info['tagger_lstm_dims'],
                dropout=self.model_info['dropout'],
                external_embedding_model=self.model_info['external_embedding_model'])
            self.model.load(self.model_path)

    def pretty_print(self, text, tags):
        tags_str = [
            self.model_info['labels_id_to_word'].get(t, None) for t in tags[0]
        ][-len(text):]
        mapped = [{
            'index': idx,
            'word': el,
            'label': tags_str[idx]
        } for idx, el in enumerate(text)]
        counter = 0
        ents = []
        spans = []
        for obj in mapped:
            if obj['label'] != 'O':
                spans.append({
                    'start': counter,
                    'end': (counter + len(obj['word'])),
                    'type': obj['label']
                })
            counter += len(obj['word']) + 1
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {}
        ret['doc_text'] = ' '.join(text)
        ret['annotation_set'] = list(ents)
        ret['spans'] = spans
        ret['title'] = 'None'
        return {"doc": ret, 'type': 'high_level'}

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return nlp.tokenize(input_text)

    def inference(self, doc):
        text_arr = self.process_text(doc)
        words, chars = self.encode_input(text_arr)
        tags = self.model.predict([words, chars])
        tags = tags.argmax(2)
        return self.pretty_print(text_arr, tags)
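
A short usage sketch tying the class together; the sample sentence and its resulting spans are illustrative only:

api = NerApi(prompt=False)  # download the pre-trained files without prompting
api.load_model()            # rebuild the network and load its weights
result = api.inference('John Smith works at Intel in California')
print(result['doc']['spans'])  # entity spans with start/end character offsets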