Example #1
def load_model(self):
    # restore the trained NERCRF weights, then the vocabularies pickled alongside them
    self.model = NERCRF()
    self.model.load(self.model_path)
    with open(self.model_info_path, 'rb') as fp:
        model_info = pickle.load(fp)
    self.word_vocab = model_info['word_vocab']
    # invert the label vocabulary so predicted ids map back to tag strings
    self.y_vocab = {v: k for k, v in model_info['y_vocab'].items()}
    self.char_vocab = model_info['char_vocab']
Example #2
def load_model(self):
    self.model = NERCRF()
    self.model.load(self.pretrained_model)
    with open(self.pretrained_model_info, "rb") as fp:
        model_info = pickle.load(fp)
    self.word_vocab = model_info["word_vocab"]
    self.y_vocab = {v: k for k, v in model_info["y_vocab"].items()}
    self.char_vocab = model_info["char_vocab"]
Example #3
def load_saved_model():
    ner_model = NERCRF()
    ner_model.build(model_info['sentence_len'],
                    model_info['word_len'],
                    model_info['num_of_labels'],
                    model_info['word_vocab'],
                    model_info['vocab_size'],
                    model_info['char_vocab_size'],
                    word_embedding_dims=model_info['word_embedding_dims'],
                    char_embedding_dims=model_info['char_embedding_dims'],
                    word_lstm_dims=model_info['word_lstm_dims'],
                    tagger_lstm_dims=model_info['tagger_lstm_dims'],
                    dropout=model_info['dropout'],
                    external_embedding_model=model_info['external_embedding_model'])
    ner_model.load(args.model_path)
    return ner_model
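
The function above references module-level model_info and args objects that the excerpt does not show. A minimal setup sketch under that assumption (the flag names --model_path and --model_info_path are hypothetical) could look like this:

import argparse
import pickle

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', help='path to the saved NERCRF weights (.h5)')
parser.add_argument('--model_info_path', help='path to the pickled model_info dict')
args = parser.parse_args()

# load the pickled hyperparameters and vocabularies that the build() call expects
with open(args.model_info_path, 'rb') as fp:
    model_info = pickle.load(fp)

ner_model = load_saved_model()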
Example #4
def load_model(self):
    with open(self.model_info_path, 'rb') as fp:
        self.model_info = pickle.load(fp)
        # rebuild the network from the saved hyperparameters before loading the weights
        self.model = NERCRF()
        self.model.build(
            self.model_info['sentence_len'],
            self.model_info['word_len'],
            self.model_info['num_of_labels'],
            self.model_info['word_vocab'],
            self.model_info['vocab_size'],
            self.model_info['char_vocab_size'],
            word_embedding_dims=self.model_info['word_embedding_dims'],
            char_embedding_dims=self.model_info['char_embedding_dims'],
            word_lstm_dims=self.model_info['word_lstm_dims'],
            tagger_lstm_dims=self.model_info['tagger_lstm_dims'],
            dropout=self.model_info['dropout'],
            external_embedding_model=self.model_info['external_embedding_model'])
        self.model.load(self.model_path)
Example #5
def load_saved_model():
    ner_model = NERCRF()
    ner_model.build(
        model_info['sentence_len'],
        model_info['word_len'],
        model_info['num_of_labels'],
        model_info['word_vocab'],
        model_info['vocab_size'],
        model_info['char_vocab_size'],
        word_embedding_dims=model_info['word_embedding_dims'],
        char_embedding_dims=model_info['char_embedding_dims'],
        word_lstm_dims=model_info['word_lstm_dims'],
        tagger_lstm_dims=model_info['tagger_lstm_dims'],
        dropout=model_info['dropout'],
        external_embedding_model=model_info['external_embedding_model'])
    ner_model.load(args.model_path)
    return ner_model
Example #6
        max_word_length=args.word_length,
        tag_field_no=args.tag_num,
    )

    # get the train and test data sets
    x_train, x_char_train, y_train = dataset.train_set
    x_test, x_char_test, y_test = dataset.test_set

    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size
    char_vocabulary_size = dataset.char_vocab_size

    y_test = keras.utils.to_categorical(y_test, num_y_labels)
    y_train = keras.utils.to_categorical(y_train, num_y_labels)

    ner_model = NERCRF(use_cudnn=args.use_cudnn)
    # pylint: disable=unexpected-keyword-arg
    ner_model.build(
        args.word_length,
        num_y_labels,
        vocabulary_size,
        char_vocabulary_size,
        word_embedding_dims=args.word_embedding_dims,
        char_embedding_dims=args.character_embedding_dims,
        tagger_lstm_dims=args.entity_tagger_lstm_dims,
        dropout=args.dropout,
    )

    # initialize word embedding if external model selected
    if args.embedding_model is not None:
        embedding_model, _ = load_word_embeddings(args.embedding_model)
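
This training excerpt reads all of its hyperparameters from an args namespace. A hypothetical argparse setup covering the attributes it references (flag names match the attributes; the defaults are illustrative assumptions) might be:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--word_length', type=int, default=20)             # max characters per word
parser.add_argument('--tag_num', type=int, default=4)                  # index of the tag column in the dataset
parser.add_argument('--use_cudnn', action='store_true')
parser.add_argument('--word_embedding_dims', type=int, default=100)
parser.add_argument('--character_embedding_dims', type=int, default=25)
parser.add_argument('--entity_tagger_lstm_dims', type=int, default=100)
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--embedding_model', type=str, default=None)       # path to external word embeddings
args = parser.parse_args()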
Example #7
class NerApi(AbstractApi):
    """
    NER model API
    """

    model_dir = str(LIBRARY_OUT / "ner-pretrained")
    pretrained_model = path.join(model_dir, "model_v4.h5")
    pretrained_model_info = path.join(model_dir, "model_info_v4.dat")

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    @staticmethod
    def _prompt():
        response = input(
            "\nTo download '{}', please enter YES: ".format("ner"))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == "y"):
            print("Downloading {}...".format("ner"))
            responded_yes = True
        else:
            print("Download declined. Response received {} != YES|Y. ".format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the NER dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_v4.h5",
                self.pretrained_model,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_info_v4.dat",
                self.pretrained_model_info,
            )
            print("Done.")

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        with open(self.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info["word_vocab"]
        self.y_vocab = {v: k for k, v in model_info["y_vocab"].items()}
        self.char_vocab = model_info["char_vocab"]

    @staticmethod
    def pretty_print(text, tags):
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({"start": s, "end": e, "type": tag})
        ents = dict((obj["type"].lower(), obj) for obj in spans).keys()
        ret = {
            "doc_text": " ".join(text),
            "annotation_set": list(ents),
            "spans": spans,
            "title": "None",
        }
        print({"doc": ret, "type": "high_level"})
        return {"doc": ret, "type": "high_level"}

    def process_text(self, text):
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars,
                                                      self.model.word_length),
                                        axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        inputs = list(doc_vec)
        # pylint: disable=no-member
        inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
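
A minimal usage sketch for the NerApi class above; the import path and the sample sentence are assumptions, everything else uses only the methods shown in the example:

# hypothetical import path; adjust to wherever NerApi is defined in your installation
from nlp_architect.api.ner_api import NerApi

api = NerApi(prompt=False)   # skip the interactive download confirmation
api.load_model()             # load the pretrained NERCRF weights and vocabularies
result = api.inference("Intel was founded by Gordon Moore and Robert Noyce .")
print(result["doc"]["spans"])  # list of {"start", "end", "type"} entity spans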
Example #8
def load_saved_model():
    ner_model = NERCRF()
    ner_model.load(args.model_path)
    return ner_model
Example #9
class NerApi(AbstractApi):
    """
    NER model API
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)

    @staticmethod
    def _prompt():
        response = input(
            '\nTo download \'{}\', please enter YES: '.format('ner'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        dir_path = path.join(self.dir, 'ner-pretrained')
        model_exists = path.isfile(path.join(dir_path, 'model.h5'))
        model_info_exists = path.isfile(path.join(dir_path, 'model_info.dat'))
        if (not model_exists or not model_info_exists):
            print(
                'The pre-trained models to be downloaded for the NER dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(dir_path, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                'model.h5', self.model_path)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                'model_info.dat', self.model_info_path)
            print('Done.')

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.model_path)
        with open(self.model_info_path, 'rb') as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info['word_vocab']
        self.y_vocab = {v: k for k, v in model_info['y_vocab'].items()}
        self.char_vocab = model_info['char_vocab']

    @staticmethod
    def pretty_print(text, tags):
        mapped = [{
            'index': idx,
            'word': el,
            'label': tags[idx]
        } for idx, el in enumerate(text)]
        counter = 0
        spans = []
        for obj in mapped:
            if obj['label'] != 'O':
                spans.append({
                    'start': counter,
                    'end': (counter + len(obj['word'])),
                    'type': obj['label']
                })
            counter += len(obj['word']) + 1
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {
            'doc_text': ' '.join(text),
            'annotation_set': list(ents),
            'spans': spans,
            'title': 'None'
        }
        return {"doc": ret, 'type': 'high_level'}

    @staticmethod
    def process_text(text):
        input_text = ' '.join(text.strip().split())
        return nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc]) \
            .reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars,
                                                      self.model.word_length),
                                        axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        inputs = list(doc_vec)
        if self.model.crf_mode == 'pad':
            inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
Example #10
                                       max_sentence_length=args.sentence_length,
                                       max_word_length=args.word_length,
                                       tag_field_no=args.tag_num)

    # get the train and test data sets
    x_train, x_char_train, y_train = dataset.train
    x_test, x_char_test, y_test = dataset.test

    num_y_labels = len(dataset.y_labels) + 1
    vocabulary_size = dataset.word_vocab_size + 1
    char_vocabulary_size = dataset.char_vocab_size + 1

    y_test = to_categorical(y_test, num_y_labels)
    y_train = to_categorical(y_train, num_y_labels)

    ner_model = NERCRF()
    ner_model.build(args.sentence_length,
                    args.word_length,
                    num_y_labels,
                    dataset.word_vocab,
                    vocabulary_size,
                    char_vocabulary_size,
                    word_embedding_dims=args.word_embedding_dims,
                    char_embedding_dims=args.character_embedding_dims,
                    word_lstm_dims=args.char_features_lstm_dims,
                    tagger_lstm_dims=args.entity_tagger_lstm_dims,
                    dropout=args.dropout,
                    external_embedding_model=args.embedding_model)

    conll_cb = ConllCallback([x_test, x_char_test], y_test, dataset.y_labels,
                             batch_size=args.b)
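
The excerpt ends after constructing the ConllCallback. A plausible continuation is the training call itself; the sketch below is an assumption: the fit keyword names and the args.e epochs flag follow the Keras-style convention used elsewhere in the snippet and are not verified against a particular NERCRF version.

ner_model.fit(x=[x_train, x_char_train], y=y_train,
              batch_size=args.b,      # batch size flag already used by the callback above
              epochs=args.e,          # hypothetical epochs flag
              callbacks=[conll_cb],
              validation=([x_test, x_char_test], y_test))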
Example #11
class NerApi(AbstractApi):
    """
    NER model API
    """
    model_dir = str(LIBRARY_OUT / 'ner-pretrained')
    pretrained_model = path.join(model_dir, 'model_v4.h5')
    pretrained_model_info = path.join(model_dir, 'model_info_v4.dat')

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    @staticmethod
    def _prompt():
        response = input('\nTo download \'{}\', please enter YES: '.
                         format('ner'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print('The pre-trained models to be downloaded for the NER dataset '
                  'are licensed under Apache 2.0. By downloading, you accept the terms '
                  'and conditions provided by the license')
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                     '/models/ner/',
                                     'model_v4.h5', self.pretrained_model)
            download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                     '/models/ner/',
                                     'model_info_v4.dat', self.pretrained_model_info)
            print('Done.')

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        with open(self.pretrained_model_info, 'rb') as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info['word_vocab']
        self.y_vocab = {v: k for k, v in model_info['y_vocab'].items()}
        self.char_vocab = model_info['char_vocab']

    @staticmethod
    def pretty_print(text, tags):
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({
                'start': s,
                'end': e,
                'type': tag
            })
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {'doc_text': ' '.join(text),
               'annotation_set': list(ents),
               'spans': spans,
               'title': 'None'}
        print({"doc": ret, 'type': 'high_level'})
        return {"doc": ret, 'type': 'high_level'}

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc]) \
            .reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars, self.model.word_length),
                                        axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        inputs = list(doc_vec)
        # pylint: disable=no-member
        inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
Example #12
class NerApi(AbstractApi):
    """
    Ner model API
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, ner_model=None, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
        self._download_pretrained_model(prompt)

    def encode_word(self, word):
        return self.model_info['word_vocab'].get(word, 1.0)

    def encode_word_chars(self, word):
        return [self.model_info['char_vocab'].get(c, 1.0) for c in word]

    def encode_input(self, text_arr):
        sentence = []
        sentence_chars = []
        for word in text_arr:
            sentence.append(self.encode_word(word))
            sentence_chars.append(self.encode_word_chars(word))
        encoded_sentence = pad_sequences(
            [np.asarray(sentence)], maxlen=self.model_info['sentence_len'])
        chars_padded = pad_sequences(sentence_chars,
                                     maxlen=self.model_info['word_len'])
        if self.model_info['sentence_len'] - chars_padded.shape[0] > 0:
            chars_padded = np.concatenate((np.zeros(
                (self.model_info['sentence_len'] - chars_padded.shape[0],
                 self.model_info['word_len'])), chars_padded))
        encoded_chars = chars_padded.reshape(1,
                                             self.model_info['sentence_len'],
                                             self.model_info['word_len'])
        return encoded_sentence, encoded_chars

    def _prompt(self):
        response = input(
            '\nTo download \'{}\', please enter YES: '.format('ner'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        dir_path = path.join(self.dir, 'ner-pretrained')
        if not path.isfile(path.join(dir_path, 'model.h5')):
            print(
                'The pre-trained models to be downloaded for the NER dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(dir_path, exist_ok=True)
            if prompt is True:
                agreed = self._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model.h5', self.model_path)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/NER/',
                'model_info.dat', self.model_info_path)
            print('Done.')

    def load_model(self):
        with open(self.model_info_path, 'rb') as fp:
            self.model_info = pickle.load(fp)
            self.model = NERCRF()
            self.model.build(
                self.model_info['sentence_len'],
                self.model_info['word_len'],
                self.model_info['num_of_labels'],
                self.model_info['word_vocab'],
                self.model_info['vocab_size'],
                self.model_info['char_vocab_size'],
                word_embedding_dims=self.model_info['word_embedding_dims'],
                char_embedding_dims=self.model_info['char_embedding_dims'],
                word_lstm_dims=self.model_info['word_lstm_dims'],
                tagger_lstm_dims=self.model_info['tagger_lstm_dims'],
                dropout=self.model_info['dropout'],
                external_embedding_model=self.model_info['external_embedding_model'])
            self.model.load(self.model_path)

    def pretty_print(self, text, tags):
        tags_str = [
            self.model_info['labels_id_to_word'].get(t, None) for t in tags[0]
        ][-len(text):]
        mapped = [{
            'index': idx,
            'word': el,
            'label': tags_str[idx]
        } for idx, el in enumerate(text)]
        counter = 0
        ents = []
        spans = []
        for obj in mapped:
            if (obj['label'] != 'O'):
                spans.append({
                    'start': counter,
                    'end': (counter + len(obj['word'])),
                    'type': obj['label']
                })
            counter += len(obj['word']) + 1
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {}
        ret['doc_text'] = ' '.join(text)
        ret['annotation_set'] = list(ents)
        ret['spans'] = spans
        ret['title'] = 'None'
        return {"doc": ret, 'type': 'high_level'}

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return nlp.tokenize(input_text)

    def inference(self, doc):
        text_arr = self.process_text(doc)
        words, chars = self.encode_input(text_arr)
        tags = self.model.predict([words, chars])
        tags = tags.argmax(2)
        return self.pretty_print(text_arr, tags)
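
This older NerApi variant depends on a module-level tokenizer named nlp (a SpacyInstance in the original module), which the excerpt does not show. A minimal usage sketch under that assumption (the SpacyInstance import path is also an assumption):

from nlp_architect.utils.text import SpacyInstance

# process_text() above calls nlp.tokenize() at module scope, so define it before use
nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

api = NerApi(prompt=False)   # downloads model.h5 / model_info.dat if missing
api.load_model()             # rebuilds the network from model_info, then loads the weights
print(api.inference("Steve Jobs founded Apple in California ."))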