def load_model(model_path, max_features, word_embedding_dim, maxlen, nb_seg_tags, lstm_dim):
    model = Sequential()
    model.add(Embedding(max_features, word_embedding_dim, input_length=maxlen,
                        name='word_emb', mask_zero=True))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(lstm_dim, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(nb_seg_tags)))
    crf = ChainCRF()
    model.add(crf)
    model.compile(loss=crf.sparse_loss, optimizer=RMSprop(0.01),
                  metrics=['sparse_categorical_accuracy'])
    # model.compile('adam', loss=crf.sparse_loss, metrics=['sparse_categorical_accuracy'])
    # early_stopping = EarlyStopping(patience=10, verbose=1)
    # checkpointer = ModelCheckpoint(options.model + "/seg_keras_weights.hdf5", verbose=1, save_best_only=True)
    eprint(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' Loading saved model:' +
           model_path + '/seg_keras_weights.hdf5')
    model.load_weights(model_path + '/seg_keras_weights.hdf5')
    return model

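# Hedged usage sketch (not from the original source): the directory name, hyperparameter
# values, and random input below are illustrative assumptions; only the call pattern
# follows the load_model() signature above.
import numpy as np

def _demo_load_model():
    seg_model = load_model('model', max_features=50000, word_embedding_dim=300,
                           maxlen=100, nb_seg_tags=5, lstm_dim=100)
    X_demo = np.random.randint(1, 50000, size=(4, 100))  # 4 padded index sequences
    probs = seg_model.predict(X_demo, batch_size=4)       # (4, 100, nb_seg_tags)
    return probs.argmax(axis=-1)                          # predicted tag index per position
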
def test_generate_transition_matrix():
    # Generate data
    n_samples, n_steps, n_classes = 20000, 16, 3
    U_true = get_test_transition_matrix(n_classes)
    (X_train, y_train), (X_test, y_test) = get_test_sequences(n_samples=n_samples,
                                                              n_steps=n_steps, U=U_true)

    model = Sequential()
    crf = ChainCRF(input_shape=X_train[0].shape)
    model.add(crf)
    sgd = SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss=crf.loss, optimizer=sgd, metrics=['accuracy'])
    model.fit(X_train, y_train, nb_epoch=1, batch_size=32, validation_data=(X_test, y_test))

    print('Example predictions:')
    y_pred = model.predict_classes(X_test)
    for i in range(10):
        print(i)
        print('y_true', np.argmax(y_test[i], axis=1))
        print('y_pred', y_pred[i])

    U_pred = K.get_value(crf.U)
    print('U:\n', U_pred)
    print('b_start:\n', K.get_value(crf.b_start))
    print('b_end:\n', K.get_value(crf.b_end))

    # Normalize the learned transition energies into row-stochastic probabilities
    U_pred = np.exp(U_pred)
    U_pred /= np.sum(U_pred, axis=1, keepdims=True)
    print('transitions_true:\n', U_true)
    print('transitions_pred:\n', U_pred)
    assert_allclose(U_pred, U_true, atol=5e-2)

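# The two helper functions called above are not shown in this file. The following is a
# hedged sketch of one plausible implementation (sample label sequences from the Markov
# chain defined by U and feed noisy one-hot scores to the CRF); it is an assumption, not
# the original test utilities.
import numpy as np

def get_test_transition_matrix(n_classes=3):
    # A fixed, row-stochastic transition matrix for the synthetic Markov chain.
    U = np.full((n_classes, n_classes), 0.1 / (n_classes - 1))
    np.fill_diagonal(U, 0.9)
    return U

def get_test_sequences(n_samples=1000, n_steps=16, U=None, noise=0.1, test_fraction=0.2):
    n_classes = U.shape[0]
    labels = np.zeros((n_samples, n_steps), dtype=np.int32)
    labels[:, 0] = np.random.randint(n_classes, size=n_samples)
    for t in range(1, n_steps):
        for s in range(n_samples):
            labels[s, t] = np.random.choice(n_classes, p=U[labels[s, t - 1]])
    y = np.eye(n_classes)[labels]               # one-hot targets, shape (N, T, C)
    X = y + noise * np.random.randn(*y.shape)   # noisy per-step class scores
    n_test = int(test_fraction * n_samples)
    return (X[:-n_test], y[:-n_test]), (X[-n_test:], y[-n_test:])
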
def segment_file():
    embeddings = build_embeddings(args.max_features)
    # print('Loading data...')
    X_chars, y_test = load_file_as_words(args.test_set)
    X_idxs = np.array([[word2index.get(w, word2index['<UNK>']) for w in words]
                       for words in X_chars])
    X_idxs_padded = sequence.pad_sequences(X_idxs, maxlen=args.maxlen, padding='post')

    print('loading model...')
    word_input = Input(shape=(args.maxlen,), dtype='int32', name='word_input')
    word_emb = Embedding(embeddings.shape[0], args.word_embedding_dim,
                         input_length=args.maxlen, name='word_emb',
                         weights=[embeddings])(word_input)
    word_emb_d = Dropout(0.5)(word_emb)
    bilstm = Bidirectional(LSTM(args.lstm_dim, return_sequences=True))(word_emb_d)
    bilstm_d = Dropout(0.5)(bilstm)
    dense = TimeDistributed(Dense(args.nb_pos_tags))(bilstm_d)
    crf = ChainCRF()
    crf_output = crf(dense)

    model = load_model("model/keras_weights_0921.hdf5",
                       custom_objects={'ChainCRF': ChainCRF, 'sparse_loss': crf.sparse_loss},
                       compile=False)
    model.compile(loss=crf.sparse_loss, optimizer='adam',
                  metrics=['sparse_categorical_accuracy'])
    prediction = model.predict(X_idxs_padded, args.batch_size, verbose=0)

    # TODO - 01. the function should return a segmented string
    with codecs.open(args.output_file, mode='w', encoding='utf-8') as results:
        for pred, word in zip(np.argmax(prediction, axis=2), X_chars):
            assert len(pred) >= len(word)
            for ch, est in zip(word, pred):
                results.write(ch + '\t' + index2pos[est] + '\n')
            else:
                # no break in the loop above, so this always writes a word-boundary marker
                results.write('WB\tWB\n')

def build_model(self, word_embedding_dim=200, lstm_dim=100, batch_size=10, nb_epoch=1,
                optimizer='adam'):
    self.lstm_dim = lstm_dim
    # cut texts after this number of words (among top max_features most common words)
    self.epoches = nb_epoch
    self.batch_size = batch_size
    self.embedding_dim = word_embedding_dim
    self.embeddings = self.build_embeddings()

    word_input = Input(shape=(self.maxlen,), dtype='int32', name='word_input')
    word_emb = Embedding(self.embeddings.shape[0], self.embedding_dim,
                         input_length=self.maxlen, name='word_emb',
                         weights=[self.embeddings])(word_input)
    word_emb_d = Dropout(0.5)(word_emb)
    bilstm = Bidirectional(LSTM(self.lstm_dim, return_sequences=True))(word_emb_d)
    bilstm_d = Dropout(0.5)(bilstm)
    dense = TimeDistributed(Dense(len(self.index2pos)))(bilstm_d)
    crf = ChainCRF()
    crf_output = crf(dense)

    self.segmentation_model = Model(inputs=[word_input], outputs=[crf_output])
    self.segmentation_model.compile(loss=crf.sparse_loss, optimizer=optimizer,
                                    metrics=['sparse_categorical_accuracy'])

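# Hedged usage sketch: the owner class name `Segmenter`, its constructor, and the training
# arrays below are assumptions; only build_model() and segmentation_model come from the
# method above. crf.sparse_loss expects integer labels with a trailing axis of size 1,
# hence the np.expand_dims.
# seg = Segmenter(...)                              # hypothetical class owning build_model()
# seg.build_model(word_embedding_dim=200, lstm_dim=100, batch_size=10, nb_epoch=1)
# y_train_sparse = np.expand_dims(y_train, -1)      # (n_samples, maxlen, 1) integer labels
# seg.segmentation_model.fit(X_train, y_train_sparse,
#                            batch_size=seg.batch_size, nb_epoch=seg.epoches)
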
def test_tag_sequence():
    # Generate data
    n_samples, n_steps, n_classes = 1000, 16, 3
    U_true = get_test_transition_matrix(n_classes)
    (X_train, y_train), (X_test, y_test) = get_test_sequences(n_samples, n_steps, U_true)

    model = Sequential()
    crf = ChainCRF(input_shape=(n_steps, n_classes))
    model.add(crf)
    sgd = SGD(lr=0.2, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss=crf.loss, optimizer=sgd, metrics=['accuracy'])
    history = model.fit(X_train, y_train, nb_epoch=1, batch_size=32,
                        validation_data=(X_test, y_test))
    assert history.history['val_acc'][-1] >= 0.94

def terminate_task(shared_layer_output, task):
    """Terminate Task

    Terminate the provided task by sending the LSTM output through hidden layers first
    (if they are defined) and then sending the result to a softmax or CRF classifier.

    Args:
        shared_layer_output (object): Output of an LSTM layer.
        task (TaskConfig): Task configuration.

    Returns:
        `tuple` of object: Reference to the CRF layer, output layer, and task in case of
        a CRF classifier; None, output layer, and task otherwise.
    """
    assert isinstance(task, TaskConfig)

    input_layer = shared_layer_output

    # Add hidden layers
    for i, hidden_layer_config in enumerate(task.hidden_layers):
        input_layer = Dense(
            units=hidden_layer_config.units,
            activation=hidden_layer_config.activation,
            name="hidden_%s_%d" % (task.name, i + 1)
        )(input_layer)

    if task.classifier == CLASSIFIER_SOFTMAX:
        # Add softmax layer
        return None, TimeDistributed(Dense(
            units=len(task.data_reader.get_labels()),
            activation=softmax
        ), name="softmax_output_%s" % task.name)(input_layer), task
    else:
        # Add dense layer to achieve the correct size
        input_layer = TimeDistributed(Dense(
            units=len(task.data_reader.get_labels())
        ))(input_layer)
        crf = ChainCRF(name="CRF_output_%s" % task.name)
        return crf, crf(input_layer), task

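# Hedged wiring sketch (an assumption, not from the original source): how terminate_task()
# might be combined with a shared BiLSTM for several tasks. The TaskConfig instances and
# data readers are hypothetical; only the terminate_task() contract above is used.
def _build_multitask_model(word_input, shared_bilstm_output, tasks):
    outputs, losses = [], []
    for task in tasks:                                   # each task is a TaskConfig
        crf, output_layer, task = terminate_task(shared_bilstm_output, task)
        outputs.append(output_layer)
        # CRF heads need the layer's own loss; softmax heads use cross-entropy.
        losses.append(crf.loss if crf is not None else 'categorical_crossentropy')
    model = Model(inputs=[word_input], outputs=outputs)
    model.compile(optimizer='adam', loss=losses)
    return model
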
bilstm_embedding = Embedding(len(chars) + 1, embedding_dim, input_length=sequence_length,
                             mask_zero=True)(bilstm_inputs)
# Merge the CNN-extracted features with the plain character embeddings
total_emb = merge([bilstm_embedding, cnn_max_pooling], mode='concat', concat_axis=2,
                  name='total_emb')
emb_dropout = Dropout(dropout)(total_emb)
# Note: the merged features are currently unused; the BiLSTM reads the plain embeddings.
# blstm = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(emb_dropout)
blstm = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(bilstm_embedding)
drop = Dropout(dropout)(blstm)
output = TimeDistributed(Dense(5))(drop)
crf = ChainCRF()
crf_output = crf(output)
model = Model(input=[bilstm_inputs, cnn_inputs], output=crf_output)
# checkpoint = ModelCheckpoint('./model/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc',
#                              verbose=1, save_best_only=True, mode='auto')
# Use the CRF layer's own loss (the original compiled with 'categorical_crossentropy',
# which leaves the CRF transition weights untrained).
model.compile(loss=crf.loss, optimizer='adam', metrics=['accuracy'])
batch_size = 1024
history = model.fit([np.array(list(d['x'])), np.array(list(d['x']))],
                    np.array(list(d['y'])).reshape((-1, maxlen, 5)),
                    batch_size=batch_size, nb_epoch=50)
model.save('./model/cnn_bilstm_crf_model.h5')

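# Hedged follow-up sketch: reloading the saved model later requires registering ChainCRF
# (and its loss) as custom objects, mirroring the pattern used elsewhere in this file.
# The exact custom_objects keys are an assumption, not the original author's code.
# from keras.models import load_model
# reloaded = load_model('./model/cnn_bilstm_crf_model.h5',
#                       custom_objects={'ChainCRF': ChainCRF, 'loss': ChainCRF().loss},
#                       compile=False)       # compiling is only needed to train further
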
def build_model(parameters, embedding_matrix=None, weightsPath=None):
    lstm_dim = parameters['word_lstm_dim']
    word_vocab_size = parameters['word_vocab_size']
    char_vocab_size = parameters['char_vocab_size']
    char_embedding_dim = parameters['char_dim']
    word_embedding_dim = parameters['word_dim']
    maxCharSize = parameters['maxCharSize']
    cap_size = parameters['cap_size']
    cap_embed_size = parameters['cap_dim']
    max_words = parameters['max_words']
    nb_filters = parameters['cnn_nb_filters']
    window_length = parameters['cnn_window_length']
    learning_rate = parameters['learning_rate']
    decay_rate = parameters['decay_rate']
    momentum = parameters['momentum']
    clipvalue = parameters['clipvalue']
    tag_label_size = parameters['tag_label_size']
    dropout = parameters['dropout']

    char_input = Input(shape=(maxCharSize * max_words,), dtype='int32', name='char_input')
    char_emb = Embedding(char_vocab_size, char_embedding_dim,
                         input_length=max_words * maxCharSize, dropout=dropout,
                         name='char_emb')(char_input)
    char_cnn = Convolution1D(nb_filter=nb_filters, filter_length=window_length,
                             activation='tanh', border_mode='full')(char_emb)
    # get output per word; this is the size of the hidden layer
    char_max_pooling = MaxPooling1D(pool_length=maxCharSize)(char_cnn)
    """
    Summary for the char layers alone:

    Layer (type)                     Output Shape       Param #   Connected to
    ==================================================================================
    char_input (InputLayer)          (None, 2000)       0
    char_emb (Embedding)             (None, 2000, 25)   1250      char_input[0][0]
    convolution1d_1 (Convolution1D)  (None, 2002, 30)   2280      char_emb[0][0]
    maxpooling1d_1 (MaxPooling1D)    (None, 100, 30)    0         convolution1d_1[0][0]
    ==================================================================================
    Total params: 3530

    Notes: None refers to the batch size; 25 is the char embedding dimension; 30 is the
    number of filters; the conv output length is the input length plus 2 because we use
    'full' padding; max pooling reduces to 100 word positions whose hidden units are
    carried over.
    """

    # based on https://github.com/pressrelations/keras/blob/a2d358e17ea7979983c3c6704390fe2d4b29bbbf/examples/conll2000_bi_lstm_crf.py
    word_input = Input(shape=(max_words,), dtype='int32', name='word_input')
    if embedding_matrix is not None:
        word_emb = Embedding(word_vocab_size + 1, word_embedding_dim,
                             weights=[embedding_matrix], input_length=max_words,
                             dropout=0, name='word_emb')(word_input)
    else:
        word_emb = Embedding(word_vocab_size + 1, word_embedding_dim,
                             input_length=max_words, dropout=0, name='word_emb')(word_input)

    caps_input = Input(shape=(max_words,), dtype='int32', name='caps_input')
    caps_emb = Embedding(cap_size, cap_embed_size, input_length=None,
                         dropout=dropout, name='caps_emb')(caps_input)

    # concat_axis refers to the axis whose dimension can differ between the merged tensors
    total_emb = merge([word_emb, caps_emb, char_max_pooling], mode='concat',
                      concat_axis=2, name='total_emb')
    emb_droput = Dropout(dropout)(total_emb)

    # inner_init: initialization function of the inner cells (I believe this is the cell state)
    bilstm_word = Bidirectional(LSTM(lstm_dim, inner_init='uniform', forget_bias_init='one',
                                     return_sequences=True))(emb_droput)
    bilstm_word_d = Dropout(dropout)(bilstm_word)
    dense = TimeDistributed(Dense(tag_label_size))(bilstm_word_d)
    crf = ChainCRF()
    crf_output = crf(dense)

    # clipvalue accounts for gradient clipping; info on Nesterov/decay:
    # http://stats.stackexchange.com/questions/211334/keras-how-does-sgd-learning-rate-decay-work
    sgd = SGD(lr=learning_rate, decay=decay_rate, momentum=momentum,
              nesterov=False, clipvalue=clipvalue)
    model = Model(input=[word_input, caps_input, char_input], output=[crf_output])
    if weightsPath:
        model.load_weights(weightsPath)
    model.compile(loss=crf.sparse_loss, optimizer=sgd, metrics=['sparse_categorical_accuracy'])
    model.summary()
    return model


def train_model(model, parameters, Words_id_train, caps_train, char_train, tag_train,
                Words_id_dev=None, caps_dev=None, char_dev=None, tag_dev=None):
    # define the checkpoint
    filepath = "weights-improvement-BiLSTM-All-no-wd-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    batch_size = parameters['batch_size']
    epoch_number = parameters['epoch_number']
    model.fit([Words_id_train, caps_train, char_train], tag_train,
              batch_size=batch_size,
              validation_data=([Words_id_dev, caps_dev, char_dev], tag_dev),
              nb_epoch=epoch_number, callbacks=callbacks_list)
    return model

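# Hedged usage sketch: an illustrative parameters dict covering every key read by
# build_model() and train_model() above. All values and array names are placeholders,
# not the original configuration.
# parameters = {
#     'word_lstm_dim': 100, 'word_vocab_size': 20000, 'char_vocab_size': 85,
#     'char_dim': 25, 'word_dim': 100, 'maxCharSize': 20, 'cap_size': 4, 'cap_dim': 5,
#     'max_words': 100, 'cnn_nb_filters': 30, 'cnn_window_length': 3,
#     'learning_rate': 0.01, 'decay_rate': 1e-6, 'momentum': 0.9, 'clipvalue': 5.0,
#     'tag_label_size': 17, 'dropout': 0.5, 'batch_size': 32, 'epoch_number': 10,
# }
# model = build_model(parameters, embedding_matrix=embedding_matrix)
# model = train_model(model, parameters, Words_id_train, caps_train, char_train, tag_train,
#                     Words_id_dev, caps_dev, char_dev, tag_dev)
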
def save_result_to_hbase(x):
    nb_word = len(index_word)  # 1008
    nb_tag = len(index_tag)    # 16/14
    maxlen = 100
    word_embedding_dim = 100
    lstm_dim = 100
    batch_size = 64

    # Rebuild the BiLSTM + CNN + CRF tagger and load the trained weights
    word_input = Input(shape=(maxlen,), dtype='float32', name='word_input')
    word_emb = Embedding(nb_word, word_embedding_dim, input_length=maxlen,
                         dropout=0.2, name='word_emb')(word_input)
    bilstm = Bidirectional(LSTM(lstm_dim, dropout_W=0.1, dropout_U=0.1,
                                return_sequences=True))(word_emb)
    bilstm_d = Dropout(0.1)(bilstm)
    half_window_size = 5
    paddinglayer = ZeroPadding1D(padding=half_window_size)(word_emb)
    conv = Conv1D(nb_filter=50, filter_length=(2 * half_window_size + 1),
                  border_mode='valid')(paddinglayer)
    conv_d = Dropout(0.1)(conv)
    dense_conv = TimeDistributed(Dense(50))(conv_d)
    rnn_cnn_merge = merge([bilstm_d, dense_conv], mode='concat', concat_axis=2)
    dense = TimeDistributed(Dense(nb_tag))(rnn_cnn_merge)
    crf = ChainCRF()
    crf_output = crf(dense)
    model = Model(input=[word_input], output=[crf_output])
    model.compile(loss=crf.sparse_loss, optimizer=RMSprop(0.001),
                  metrics=['sparse_categorical_accuracy'])
    # model.load_weights('/home/weiwc/pkl/model.weights')
    model.load_weights('model.weights')

    X_test_cut = x[0]
    X_test_len = x[1]
    X_word = x[2]
    rowkey = str(x[3])
    # print type(X_test_cut)
    # print len(X_test_cut)
    # print X_test_cut
    # print (X_test_len)
    # print len(X_test_len)
    # print X_test_cut[0], X_test_cut[1], X_test_cut[2]
    Y_pred = model.predict(X_test_cut)
    # print "Y_pred", len(Y_pred), len(Y_pred[0]), len(Y_pred[1]), Y_pred
    # print "X_word", len(X_word), X_word

    # Walk the flat word list and collect the predicted tag for each position
    j2 = 0
    i2 = 0
    t = []
    # tt = []
    # for i in range(12):
    #     tt.append([])
    for j1 in range(len(X_word)):
        # index_tag {0: 'PAD', 1: 'O', 2: 'B-ROLE', 3: 'I-ROLE', 4: 'B-PER', 5: 'I-PER',
        #            6: 'B-CRIME', 7: 'I-CRIME', 8: 'B-TIME', 9: 'I-TIME', 10: 'B-ORG',
        #            11: 'I-ORG', 12: 'B-LOC', 13: 'I-LOC'}
        w = X_word[j1]
        tags = Y_pred[i2][j2]
        tag_flag = False
        t_tmp = []
        for i in range(14):
            if (tags[i] == 1) and i > 0:
                t_tmp.append(index_tag[i])
                t_tmp.append(w)
                t.append(t_tmp)
                break
        j2 += 1
        if j2 == X_test_len[i2]:  # X_test_len = [89, 37, 95, 86, 90, 100, 90, 94, 80, 79, 44, 59]
            j2 = 0
            i2 += 1

    for i in t:
        print(i[0], i[1])

    # Commented-out post-processing: group consecutive B-/I- tags into entities and emit
    # (rowkey, [rowkey, "d", entity_type, entities]) tuples for HBase.
    # l2 = []
    # l3 = []
    # l22 = []
    # l23 = []
    # c = 0
    # ttl = ""
    # for i in t:
    #     if i[0].startswith('B') and c == 0:
    #         l2.append(i[0])
    #         l3.append(i[1].decode("utf-8"))
    #         ttl = i[0].replace('B', 'I')
    #         c = c + 1
    #     elif i[0] == ttl:
    #         l2.append(i[0])
    #         l3.append(i[1].decode("utf-8"))
    #     elif i[0].startswith('B') and c != 0:
    #         l22.append(l2)
    #         l23.append("".join(l3))
    #         l2 = []
    #         l3 = []
    #         l2.append(i[0])
    #         l3.append(i[1].decode("utf-8"))
    #         ttl = i[0].replace('B', 'I')
    # l22.append(l2)
    # l23.append("".join(l3))
    # taglist = ['B_ROLE', 'I_ROLE', 'B_PER', 'I_PER', 'B_CRIME', 'I_CRIME', 'B_TIME',
    #            'I_TIME', 'B_ORG', 'I_ORG', 'B_LOC', 'I_LOC']
    # ret_t = {'PER': [], 'LOC': [], 'ORG': [], 'TIME': [], 'ROLE': [], 'CRIME': []}
    # id = 0
    # for i in l22:
    #     ret_t[i[0].split("-")[1]].append(l23[id])
    #     id += 1
    # t2 = []
    # for i in ret_t.keys():
    #     tmp = (rowkey, [rowkey, "d", i, ",".join(ret_t[i])])
    #     t2.append(tmp)
    # for i in t2:
    #     print(i[1][2], i[1][3])
    # return t2
    return "-"

def get_X(o_content):
    nb_word = len(index_word)  # 1008
    nb_tag = len(index_tag)    # 16/14
    maxlen = 100
    word_embedding_dim = 100
    lstm_dim = 100
    batch_size = 64

    # Same BiLSTM + CNN + CRF architecture as above, with the trained weights reloaded
    word_input = Input(shape=(maxlen,), dtype='float32', name='word_input')
    word_emb = Embedding(nb_word, word_embedding_dim, input_length=maxlen,
                         dropout=0.2, name='word_emb')(word_input)
    bilstm = Bidirectional(LSTM(lstm_dim, dropout_W=0.1, dropout_U=0.1,
                                return_sequences=True))(word_emb)
    bilstm_d = Dropout(0.1)(bilstm)
    half_window_size = 5
    paddinglayer = ZeroPadding1D(padding=half_window_size)(word_emb)
    conv = Conv1D(nb_filter=50, filter_length=(2 * half_window_size + 1),
                  border_mode='valid')(paddinglayer)
    conv_d = Dropout(0.1)(conv)
    dense_conv = TimeDistributed(Dense(50))(conv_d)
    rnn_cnn_merge = merge([bilstm_d, dense_conv], mode='concat', concat_axis=2)
    dense = TimeDistributed(Dense(nb_tag))(rnn_cnn_merge)
    crf = ChainCRF()
    crf_output = crf(dense)
    model = Model(input=[word_input], output=[crf_output])
    model.compile(loss=crf.sparse_loss, optimizer=RMSprop(0.001),
                  metrics=['sparse_categorical_accuracy'])
    # model.load_weights('/home/weiwc/pkl/model.weights')
    model.load_weights('model.weights')

    # Map each character to its index (1 = unknown) and pad to a fixed length
    x_sen = []
    word_sen = []
    content_re = o_content.replace(" ", "")
    for line in content_re:
        word_sen.append(line)
        if line in dict_word:
            x_sen.append(dict_word[line])
        else:
            x_sen.append(1)

    X_test_cut = []
    X_test_len = []
    max_sen_len = 100
    if len(x_sen) <= max_sen_len:
        X_test_cut.append(x_sen)
        X_test_len.append(len(x_sen))
    X_test_cut = pad_sequences(X_test_cut, maxlen=max_sen_len, padding='post')
    Y_pred = model.predict(X_test_cut)

    # Collect the predicted tag for every character
    j2 = 0
    i2 = 0
    t = []
    for j1 in range(len(word_sen)):
        w = word_sen[j1]
        tags = Y_pred[i2][j2]
        t_tmp = []
        for i in range(14):
            if tags[i] == 1:
                # t_tmp.append(index_tag[i])
                # t_tmp.append(w)
                # t.append(t_tmp)
                t.append(index_tag[i])
                break
        j2 += 1
        # if j2 == X_test_len[i2]:  # X_test_len = [89, 37, 95, 86, 90, 100, 90, 94, 80, 79, 44, 59]
        #     j2 = 0
        #     i2 += 1

    # Re-split the original (space-separated) content and assign one label per token
    wl = re.split("[ ]{1,100}", o_content)
    tt = []
    start = 0
    end = 0
    for i in wl:
        end += len(i)
        tt.append(t[start:end])
        start += len(i)

    tt2 = []
    for i in range(len(tt)):
        flag = False
        for j in tt[i]:
            if j.startswith('B'):
                flag = True
                tt2.append("".join(wl[i]) + "|" + j.split("-")[1])
                break
        if not flag:
            for j in tt[i]:
                if j.startswith('I'):
                    flag = True
                    tt2.append("".join(wl[i]) + "|" + j.split("-")[1])
                    break
        if not flag:
            for j in tt[i]:
                tt2.append("".join(wl[i]) + "|" + j)
                break
    return " ".join(tt2)

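# Hedged usage sketch: get_X() expects a whitespace-separated string whose characters are
# looked up in dict_word. The sample text and the expected output shown below are
# illustrative assumptions, not actual model output.
# tagged = get_X(u"某某 公司 在 北京 成立")
# print(tagged)   # e.g. "某某|PER 公司|ORG 在|O 北京|LOC 成立|O" (token|label pairs)
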
def main():
    # parse user input
    parser = argparse.ArgumentParser()

    # file related args
    parser.add_argument("-m", "--model-dir", default="./models/",
                        help="directory to save the best models")
    parser.add_argument("-t", "--train-set", default="./data/EG.txt-train.txt",
                        help="training data file")
    parser.add_argument("-v", "--dev-set", default="./data/EG.txt-dev.txt",
                        help="development data file")
    parser.add_argument("-s", "--test-set", default="./data/EG.txt-test.txt",
                        help="test data file")
    parser.add_argument("-i", "--input", default="./data/EG.txt-test.sample-eng.txt",
                        help="a sample input segmented file")
    parser.add_argument("-o", "--output", default="", help="POS output")

    # network related
    # input
    parser.add_argument("-e", "--emb-size", default=300, type=int,
                        help="dimension of embedding")  # emb matrix col size
    parser.add_argument("-w", "--window-size", default=10, type=int,
                        help="context window size")
    parser.add_argument("-d", "--vocab-emb", default="./data/segmented-vectors",
                        help="vocabulary pre-trained embeddings")
    parser.add_argument("-r", "--final_layer", default="lstm",
                        help="Final optimization layer 'crf' or 'lstm'")

    # learning related
    parser.add_argument("-a", "--learning-algorithm", default="adam",
                        help="optimization algorithm (adam, sgd, adagrad, rmsprop, adadelta)")
    parser.add_argument("-b", "--batch-size", default=128, type=int, help="batch size")
    parser.add_argument("-n", "--epochs", default=100, type=int, help="nb of epochs")

    # others
    parser.add_argument("-V", "--verbose-level", default=1, type=int,
                        help="verbosity level (0 < 1 < 2)")
    parser.add_argument("-g", "--showGraph", default=False,
                        help="show precision and accuracy graphs")
    parser.add_argument("-l", "--train-model", default=False,
                        type=lambda x: (str(x).lower() == 'true'),
                        help="Train the model, default False")
    args = parser.parse_args()

    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # 5 to the left, 5 to the right
    windowSize = args.window_size
    print("Pos with Keras, only token, window size %d" % (windowSize))
    print("Train the model: %s" % (args.train_model))

    # Read in the vocab
    # print("Read in the vocab")
    vocabPath = args.vocab_emb
    word2Idx = {}    # Maps a word to the index in the embeddings matrix
    idx2word = {}
    embeddings = []  # Embeddings matrix
    with open(vocabPath, 'r') as fIn:
        idx = 0
        for line in fIn:
            split = line.strip().split(' ')
            embeddings.append(np.array([float(num) for num in split[1:]]))
            word2Idx[split[0]] = idx
            idx += 1
    idx2word = {v: k for k, v in word2Idx.items()}

    embeddings = np.asarray(embeddings, dtype='float32')
    embedding_size = embeddings.shape[1]

    # Create a mapping for our labels
    labels_list = getLabels(args.train_set)
    labels_list = set(labels_list + getLabels(args.dev_set))
    label2Idx = dict((l, i) for i, l in enumerate(labels_list))
    idx2Label = {v: k for k, v in label2Idx.items()}

    if args.train_model == False:
        word2Idx = load_pickled_file(args.model_dir + '/word2Idx')
        label2Idx = load_pickled_file(args.model_dir + '/label2Idx')
        idx2Label = {v: k for k, v in label2Idx.items()}
    elif not os.path.isfile(args.model_dir + '/list2idx.pkl'):
        save_pickled_file(word2Idx, args.model_dir + '/word2Idx')
        save_pickled_file(label2Idx, args.model_dir + '/label2Idx')
    print("Idx2Label:", idx2Label)

    if args.train_model == True:
        # Read in data
        print("Read in data and create matrices")
        train_sentences = readFile(args.train_set)
        dev_sentences = readFile(args.dev_set)
        test_sentences = readFile(args.test_set)
    else:
        test_sentences = readTestFile(args.input)

    test_src = []
    test_trg = []
    for sentence in test_sentences:
        for word in sentence:
            if args.train_model == True:
                test_src.append(word[0])
                test_trg.append(word[1])
            else:
                test_src.append(word.split('\t')[0])

    if args.train_model == True:
        # Create numpy arrays
        X_train, y_train = createNumpyArray(train_sentences, windowSize, word2Idx, label2Idx)
        X_dev, y_dev = createNumpyArray(dev_sentences, windowSize, word2Idx, label2Idx)
        X_test, y_test = createNumpyArray(test_sentences, windowSize, word2Idx, label2Idx)
    else:
        X_test = createTestArray(test_sentences, windowSize, word2Idx, label2Idx)
    # print(test_src)

    # Create the network
    n_in = 2 * windowSize + 1
    n_out = len(label2Idx)
    batch_size = args.batch_size
    epochs = args.epochs

    # If CRF, change the tensors to shape (?, ?, ?)
    if args.final_layer == 'crf':
        maxlen = n_in
        if args.train_model == True:
            X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
            y_train = sequence.pad_sequences(y_train, maxlen=maxlen, padding='post')
            y_train = np.expand_dims(y_train, -1)
            X_dev = sequence.pad_sequences(X_dev, maxlen=maxlen, padding='post')
            y_dev = sequence.pad_sequences(y_dev, maxlen=maxlen, padding='post')
            y_dev = np.expand_dims(y_dev, -1)
            X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')
            y_test = sequence.pad_sequences(y_test, maxlen=maxlen, padding='post')
            y_test = np.expand_dims(y_test, -1)
        else:
            X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

    print('number of classes:', n_out)
    print("Embeddings shape", embeddings.shape)
    print("input dim", embeddings.shape[0], embeddings.shape[1])

    if args.final_layer == 'crf':
        model = Sequential()
        model.add(Embedding(output_dim=embeddings.shape[1], input_dim=embeddings.shape[0],
                            input_length=n_in, weights=[embeddings], trainable=False))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(300, return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(TimeDistributed(Dense(n_out)))
        crf = ChainCRF()
        model.add(crf)
        model.compile(loss=crf.sparse_loss, optimizer=RMSprop(0.01),
                      metrics=['sparse_categorical_accuracy'])
    else:
        model = Sequential()
        model.add(Embedding(output_dim=embeddings.shape[1], input_dim=embeddings.shape[0],
                            input_length=n_in, weights=[embeddings], trainable=False))
        model.add(Dropout(0.5))
        # model.add(LSTM(300, return_sequences=False))
        model.add(Bidirectional(LSTM(embedding_size, return_sequences=False)))
        model.add(Dropout(0.5))
        model.add(Dense(output_dim=n_out, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer=args.learning_algorithm,
                      metrics=['accuracy'])
    model.summary()

    if os.path.isfile(args.model_dir + '/keras_weights.hdf5'):
        model.load_weights(args.model_dir + '/keras_weights.hdf5')

    if args.train_model == True:
        early_stopping = EarlyStopping(patience=5, verbose=1)
        checkpointer = ModelCheckpoint(args.model_dir + "/keras_weights.hdf5",
                                       verbose=1, save_best_only=True)
        history = model.fit(X_train, y_train,
                            batch_size=batch_size,
                            # epochs=epochs,
                            nb_epoch=epochs,
                            verbose=1,
                            shuffle=True,
                            callbacks=[early_stopping, checkpointer],
                            validation_data=[X_dev, y_dev])
        model.load_weights(args.model_dir + '/keras_weights.hdf5')

    if args.train_model == True:
        preds_dev = model.predict_classes(X_dev, batch_size=64, verbose=0)
        if args.final_layer == 'crf':
            preds_dev = preds_dev.argmax(-1)

    if args.final_layer == 'crf':
        preds_test = model.predict_classes(X_test, batch_size=512, verbose=0).argmax(-1)
    else:
        preds_test = model.predict_classes(X_test, batch_size=512, verbose=0)
    print("test_src:", len(test_src))
    # print("X_test", len(X_test))
    # print("preds_test", len(preds_test))

    if args.output != '':
        fout = open(args.output, 'w')
    else:
        fout = sys.stdout
    for w, p in zip(test_src, preds_test):
        # print("W:", w, " P:", p)
        fout.write(w + '\t' + (idx2Label[p] if (p < len(idx2Label)) else 'UNKNOWN') + '\n')
    # print(score_test[1])

    if args.train_model == True:
        from sklearn.metrics import confusion_matrix, classification_report

        score_test = model.evaluate(X_test, y_test, batch_size=500)
        print("Test Score:", score_test[1])
        score_dev = model.evaluate(X_dev, y_dev, batch_size=500)
        print("Dev Score:", score_dev[1])

        print('')
        print(classification_report(np.argmax(y_dev, axis=1), preds_dev,
                                    target_names=labels_list))
        if args.showGraph:
            print('')
            print(confusion_matrix(np.argmax(y_dev, axis=1), preds_dev))
            print('')
            print(classification_report(np.argmax(y_test, axis=1), preds_test,
                                        target_names=labels_list))
            print('')
            print(confusion_matrix(np.argmax(y_test, axis=1), preds_test))

            # list all data in history
            print(history.history.keys())
            import matplotlib.pyplot as plt

            # summarize history for accuracy
            plt.plot(history.history['acc'])
            plt.plot(history.history['val_acc'])
            plt.title('model accuracy')
            plt.ylabel('accuracy')
            plt.xlabel('epoch')
            plt.legend(['train', 'test'], loc='upper left')
            plt.show()

            # summarize history for loss
            plt.plot(history.history['loss'])
            plt.plot(history.history['val_loss'])
            plt.title('model loss')
            plt.ylabel('loss')
            plt.xlabel('epoch')
            plt.legend(['train', 'test'], loc='upper right')
            plt.show()

        score, y_true_word, y_pred_word = computeWordLevelAccuracy(test_trg, preds_test,
                                                                   idx2Label)
        print(score)

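# Hedged CLI sketch: illustrative invocations based on the argparse flags defined in main()
# above. The script name and file paths are placeholders, not the original experiments.
#
#   # train a BiLSTM-CRF tagger
#   python pos_tagger.py -l true -r crf -t ./data/EG.txt-train.txt \
#       -v ./data/EG.txt-dev.txt -s ./data/EG.txt-test.txt \
#       -d ./data/segmented-vectors -m ./models/ -b 128 -n 100
#
#   # tag a new file with a trained model
#   python pos_tagger.py -l false -r crf -i ./data/EG.txt-test.sample-eng.txt \
#       -o ./output/predictions.txt -d ./data/segmented-vectors -m ./models/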