예제 #1
0
파일: model_dc.py 프로젝트: Babyzpj/NLP
def predict():
    """Train one ``DCModel`` per K-fold split and keep each fold's best result.

    Steps:
      1. Load the embedding matrices, the vocabularies, and the train/test
         data through the project's ``load_*`` helpers.
      2. Shuffle the three training arrays in place, re-seeding before each
         shuffle.
      3. Remove the files left in ``./Data/result`` by earlier runs.
      4. For every fold of ``KFold(n_splits=config.KFOLD)``, fit a fresh
         model, print its best score, and copy the result file of the best
         epoch to ``./Data/result/best_<fold>``.

    File housekeeping uses ``glob``/``os``/``shutil`` instead of
    ``os.popen``. ``os.popen`` starts the shell command without waiting for
    it and returns a pipe that was never closed, so the cleanup or the copy
    could still be running while later steps touched the same directory.
    Failures are reported with ``print`` and do not abort the run, matching
    the best-effort behaviour of the former shell commands.

    Returns:
        None. Results are written under ``./Data/result`` and scores are
        printed.
    """
    # Local imports: only this function needs them.
    import glob
    import shutil

    result_dir = './Data/result'

    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    # Re-seed before every shuffle so each array is shuffled from the same
    # random state; presumably this keeps the rows of the three arrays
    # aligned (requires equal lengths -- confirm against load_train_data).
    seed = 137
    for array in (sentences, tags, labels):
        np.random.seed(seed)
        np.random.shuffle(array)

    # test data (no gold labels are passed to the model for the test set)
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None

    # clear results of earlier runs; like `rm ./Data/result/*`, only
    # non-hidden entries are matched and sub-directories are left in place
    for stale_path in glob.glob(os.path.join(result_dir, '*')):
        if not (os.path.isfile(stale_path) or os.path.islink(stale_path)):
            continue
        try:
            os.remove(stale_path)
        except OSError as err:
            print('could not remove %s: %s' % (stale_path, err))

    # split the training data into train / dev folds
    kf = KFold(n_splits=config.KFOLD)
    for num, (train_index, dev_index) in enumerate(kf.split(labels)):
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)

        # query the best score once and reuse it for printing and unpacking
        best_score = model.get_best_score()
        print(best_score)
        (p_test, r_test, f_test), nb_epoch = best_score

        # keep the result file of the best epoch under a per-fold name;
        # the file name uses nb_epoch + 1, as in the original command
        best_epoch_file = '%s/epoch_%d.csv' % (result_dir, nb_epoch + 1)
        fold_best_file = '%s/best_%d' % (result_dir, num)
        print('cp %s %s' % (best_epoch_file, fold_best_file))
        try:
            shutil.copy(best_epoch_file, fold_best_file)
        except OSError as err:
            print('could not copy %s: %s' % (best_epoch_file, err))
        print(p_test, r_test, f_test, '\n')

        # clear model before the next fold builds a new one
        model.clear_model()
        del model
from load_data import load_embedding, load_voc, load_train_data, load_test_data
import time
from generator import BatchGenerator
from TFNN.layers.EmbeddingLayer import Embedding
from sklearn.model_selection import KFold
from triggerType_to_trigger import get_trigger
'''
For Chinese word segmentation.
'''

#############################1.load data   ######################################
# Module-level setup: load embeddings, vocabularies and data, then
# precompute the index arrays of a 10-fold split over the training labels.

# Passed to both data loaders below; its meaning is defined in load_data
# (presumably the number of label classes -- TODO confirm).
class_type = 3
# Passed to load_train_data / load_test_data respectively; presumably the
# number of training and test examples to load -- TODO confirm.
training_count = 16796
test_count = 2570
word_weights, tag_weights = load_embedding()  # matrix form
word_voc, tag_voc, label_voc = load_voc()  # dictionary form
sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc,
                                          class_type, training_count)
Xend_sentence, Xend_tag_test, yend_test = load_test_data(
    word_voc, tag_voc, label_voc, class_type, test_count)

# Split the training data into train / dev folds.
# (Original note: the "y" here is the part-of-speech tag.)

kf = KFold(n_splits=10)
# Collect the train and dev index arrays of every fold so they can be
# looked up by fold number afterwards.
train_indices, dev_indices = [], []
for train_index, dev_index in kf.split(labels):
    train_indices.append(train_index)
    dev_indices.append(dev_index)
for num in range(10):
    train_index, dev_index = train_indices[num], dev_indices[num]
    sentences_train, sentences_dev = sentences[train_index], sentences[