Example #1
File: nn.py  Project: joydeb28/NLP
import numpy as np  # the remaining helpers and lookup dicts come from the project


def predict(sentence, model):
    # wrap the raw sentence in the [[token, label], ...] shape the
    # preprocessing helpers expect; each token gets the dummy label 'O\n'
    sen_list = [[[i, 'O\n'] for i in sentence.split()]]
    #sen_list = [[['SOCCER', 'O\n'], ['-', 'O\n'], ['JAPAN', 'O\n'], ['GET', 'O\n'], ['LUCKY', 'O\n'], ['WIN', 'O\n'], [',', 'O\n'], ['CHINA', 'O\n'], ['IN', 'O\n'], ['SURPRISE', 'O\n'], ['DEFEAT', 'O\n'], ['.', 'O\n']]]
    test = addCharInformatioin(sen_list)  # attach per-character info (project helper)

    predLabels = []

    test_set = padding(
        createMatrices(test, word2Idx, label2Idx, case2Idx, char2Idx))

    test_batch, test_batch_len = createBatches(test_set)

    for i, data in enumerate(test_batch):
        tokens, casing, char, labels = data

        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  #Predict the classes
        predLabels.append(pred)
    entity_labels = []
    words_list = sentence.split()
    # pair each word with its predicted label for the last (only) batch
    for word, label_idx in zip(words_list, predLabels[-1]):
        entity_labels.append((word, idx2Label[int(label_idx)]))
    print("entity_labels", entity_labels)

    return entity_labels
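
# A minimal usage sketch (an assumption, not part of the source): load a
# trained checkpoint -- the "NER.h5" name is borrowed from the last example
# below -- and tag one sentence.
if __name__ == "__main__":
    from keras.models import load_model
    ner_model = load_model("NER.h5")  # checkpoint path assumed
    predict("SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .", ner_model)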
Example #2
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)

wordEmbeddings = np.array(wordEmbeddings)

char2Idx = {"PADDING": 0, "UNKNOWN": 1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)
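
# Illustration (added, not part of the source): tokens map to per-character
# index lists, falling back to UNKNOWN for characters outside the vocabulary
# above (the 'ï' here):
demo_chars = [char2Idx.get(c, char2Idx["UNKNOWN"]) for c in "naïve"]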

train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
dev_set = padding(
    createMatrices(devSentences, word2Idx, label2Idx, case2Idx, char2Idx))
test_set = padding(
    createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx))
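# each set has the format:
# [[word indices], [case indices], [padded char indices], [label indices]]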

idx2Label = {v: k for k, v in label2Idx.items()}
np.save("models/idx2Label.npy", idx2Label)
np.save("models/word2Idx.npy", word2Idx)

train_batch, train_batch_len = createBatches(train_set)
dev_batch, dev_batch_len = createBatches(dev_set)
test_batch, test_batch_len = createBatches(test_set)

words_input = Input(shape=(None, ), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],
                  trainable=False)(words_input)
Example #3
    char2Idx[c] = len(char2Idx)  # tail of the character-vocabulary loop (see Example #2)

# :: Hard coded case lookup ::
case2Idx = {
    'numeric': 0,
    'allLower': 1,
    'allUpper': 2,
    'initialUpper': 3,
    'other': 4,
    'mainly_numeric': 5,
    'contains_digit': 6,
    'PADDING_TOKEN': 7
}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')
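
# Each row of this identity matrix is the one-hot vector for one casing class.
# A plausible classifier mapping a token to these classes (an assumption; the
# project's own casing helper is not shown in this excerpt):
def getCasing(word, case2Idx):
    num_digits = sum(c.isdigit() for c in word)
    if word.isdigit():
        casing = 'numeric'
    elif len(word) > 0 and num_digits / len(word) > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'allLower'
    elif word.isupper():
        casing = 'allUpper'
    elif word[:1].isupper():
        casing = 'initialUpper'
    elif num_digits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'
    return case2Idx[casing]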

train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))

train_batch, train_batch_len = createBatches(train_set)

words_input = Input(shape=(None, ), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],
                  trainable=False)(words_input)
casing_input = Input(shape=(None, ), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1],
                   input_dim=caseEmbeddings.shape[0],
                   weights=[caseEmbeddings],
                   trainable=False)(casing_input)
character_input = Input(shape=(
    None,
Example #4
    def embed(self):
        """Create word- and character-level embeddings"""

        labelSet = set()
        words = {}

        # unique words and labels in data
        for dataset in [
                self.trainSentences, self.devSentences, self.testSentences
        ]:
            for sentence in dataset:
                for token, char, label in sentence:
                    # token: word string, char: list of characters, label: BIO label
                    labelSet.add(label)
                    words[token.lower()] = True

        # mapping for labels
        self.label2Idx = {}
        for label in labelSet:
            self.label2Idx[label] = len(self.label2Idx)

        # mapping for token cases
        case2Idx = {
            'numeric': 0,
            'allLower': 1,
            'allUpper': 2,
            'initialUpper': 3,
            'other': 4,
            'mainly_numeric': 5,
            'contains_digit': 6,
            'PADDING_TOKEN': 7
        }
        self.caseEmbeddings = np.identity(
            len(case2Idx), dtype='float32')  # identity matrix used

        # read GloVe word embeddings
        word2Idx = {}
        self.wordEmbeddings = []

        # the context manager closes the file when the loop finishes
        with open("embeddings/glove.6B.50d.txt", encoding="utf-8") as fEmbeddings:
            # loop through each line (one word per line) in the embedding file
            for line in fEmbeddings:
                split = line.strip().split(" ")
                word = split[0]  # embedding word entry

                if len(word2Idx) == 0:  # first line: add padding + unknown
                    word2Idx["PADDING_TOKEN"] = len(word2Idx)
                    vector = np.zeros(len(split) - 1)  # zero vector for 'PADDING'
                    self.wordEmbeddings.append(vector)

                    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                    vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
                    self.wordEmbeddings.append(vector)

                if word.lower() in words:  # keep only words seen in the data
                    vector = np.array([float(num) for num in split[1:]])
                    self.wordEmbeddings.append(vector)  # word embedding vector
                    word2Idx[word] = len(word2Idx)  # corresponding word index

        self.wordEmbeddings = np.array(self.wordEmbeddings)

        # dictionary of all possible characters
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
            self.char2Idx[c] = len(self.char2Idx)

        # format: [[word indices], [case indices], [padded char indices], [label indices]]
        self.train_set = padding(
            createMatrices(self.trainSentences, word2Idx, self.label2Idx,
                           case2Idx, self.char2Idx))
        self.dev_set = padding(
            createMatrices(self.devSentences, word2Idx, self.label2Idx,
                           case2Idx, self.char2Idx))
        self.test_set = padding(
            createMatrices(self.testSentences, word2Idx, self.label2Idx,
                           case2Idx, self.char2Idx))

        self.idx2Label = {v: k for k, v in self.label2Idx.items()}
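
        # Usage (inferred from the attributes above): fill trainSentences,
        # devSentences, and testSentences with (token, chars, label) triples,
        # then a single embed() call populates wordEmbeddings, caseEmbeddings,
        # char2Idx, the three padded data sets, and idx2Label.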
Example #5
    def embed(self):
        """Create word- and character-level embeddings"""

        s = SemEvalData()
        k = Komn(s.make_normal_vocabulary(), s.make_syntactical_vocabulary())
        syntax_x, _, syntax_test_x, _ = s.get_data_syntax_concatenation(k)
        # can call s.make_syntactical_vocabulary() to get unique syntactic_words
        labelSet, words = self.get_unique_labels_and_words()

        self.map_labels_to_indexes(labelSet)

        # mapping for token cases
        case2Idx = {
            'numeric': 0,
            'allLower': 1,
            'allUpper': 2,
            'initialUpper': 3,
            'other': 4,
            'mainly_numeric': 5,
            'contains_digit': 6,
            'PADDING_TOKEN': 7
        }
        self.caseEmbeddings = np.identity(
            len(case2Idx), dtype='float32')  # identity matrix used

        # read pre-trained GloVe-style word embeddings via the Komn wrapper
        word2Idx = {}
        self.wordEmbeddings = []

        # loop through each word in embeddings
        for word, vector in k.word_to_emb.items():

            if len(word2Idx) == 0:  # first iteration: add padding + unknown
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                # zero vector for 'PADDING'; append fresh arrays so the loop
                # variable `vector` still holds the current word's embedding
                self.wordEmbeddings.append(np.zeros(len(vector)))

                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                self.wordEmbeddings.append(
                    np.random.uniform(-0.25, 0.25, len(vector)))

            if word.lower() in words:  # keep only words seen in the data
                self.wordEmbeddings.append(np.array(vector))  # word embedding vector
                word2Idx[word] = len(word2Idx)  # corresponding word index

        self.wordEmbeddings = np.array(self.wordEmbeddings)

        # dictionary of all possible characters
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>–™Ã©˜¦":
            self.char2Idx[c] = len(self.char2Idx)

        self.train_set = padding(
            createMatrices_syntax(self.trainSentences, syntax_x, word2Idx,
                                  self.label2Idx, case2Idx, self.char2Idx))
        # self.dev_set = padding(createMatrices(self.devSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))
        self.test_set = padding(
            createMatrices_syntax(self.testSentences, syntax_test_x, word2Idx,
                                  self.label2Idx, case2Idx, self.char2Idx))

        # format: [[word indices], [case indices], [padded char indices], [label indices]]
        #  self.train_set = padding(createMatrices(self.trainSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))
        #  self.test_set = padding(createMatrices(self.testSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))

        self.idx2Label = {v: k for k, v in self.label2Idx.items()}


def main(tweets):
    # requires: io, nltk (with tokenizer / POS / NE-chunker data), numpy as
    # np, keras.models.load_model, and the project's padding() helper

    model = load_model("NER.h5")
    tweetsList = []
    ne_chunked_sents_list = []
    for tweet in tweets:
        tokenized_doc = nltk.word_tokenize(tweet)
        tagged_sentences = nltk.pos_tag(tokenized_doc)
        ne_chunked_sents = nltk.ne_chunk(tagged_sentences)
        tweetsList.append(tokenized_doc)
        ne_chunked_sents_list.append(ne_chunked_sents)
    word2Idx = {}
    # the context manager closes the embedding file after building the vocab
    with io.open("embeddings/glove.6B.100d.txt", encoding="utf-8") as f:
        for line in f:
            split = line.strip().split(" ")
            if len(word2Idx) == 0:  # Add padding+unknown
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            word2Idx[split[0].lower()] = len(word2Idx)

    char2Idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
        char2Idx[c] = len(char2Idx)

    res = []
    named_entities = []
    # build word- and character-index sequences for every tweet
    for tw in tweets:
        charInd = []
        wordInd = []
        for word in tw.split():
            wordIdx = word2Idx.get(word.lower(), word2Idx["UNKNOWN_TOKEN"])
            # fall back to UNKNOWN for characters missing from char2Idx
            temp_char = [char2Idx.get(char, char2Idx["UNKNOWN"]) for char in word]
            charInd.append(temp_char)
            wordInd.append(wordIdx)
        res.append([wordInd, charInd])
    res = padding(res)
    for i, ne_chunked_sents in enumerate(ne_chunked_sents_list):
        named_entities.append([])
        for element in ne_chunked_sents:
            if hasattr(element, 'label'):
                entity_name = ' '.join(c[0] for c in element.leaves())
                #entity_type = element.label()  # get NE category
                named_entities[i].append(entity_name)
    ans = []  # model-predicted coarse labels (collected but not returned below)
    for i in res:
        for j in range(len(i[0])):
            tokens = np.asarray([i[0][j]])
            char = np.asarray([i[1][j]])
            pred = model.predict([[tokens], [char]], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # Predict the classes
            if pred == 0 or pred == 1:
                ans.append("ORG")
            elif pred == 3 or pred == 8:
                ans.append("LOC")
            elif pred == 4 or pred == 7:
                ans.append("PER")
            else:
                ans.append("O")

    return named_entities
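
# Illustrative driver (an assumption: NER.h5 and the required nltk data are
# available locally; the tweets are made up):
if __name__ == "__main__":
    print(main(["Japan beat China in a surprise defeat .",
                "Alex Morgan scored for the USA ."]))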