Example #1
import os
import pickle
import tensorflow as tf  # TF 1.x API (tf.Session, global_variables_initializer)
# BatchGenerator and Model are defined elsewhere in this project;
# filename1 is the directory containing the pickled dataset.


def train():
    with open(os.path.join(filename1, "train1.pkl"), 'rb') as inp:
        word2id = pickle.load(inp)
        id2word = pickle.load(inp)
        tag2id = pickle.load(inp)
        id2tag = pickle.load(inp)
        x_train = pickle.load(inp)
        y_train = pickle.load(inp)
        x_test = pickle.load(inp)
        y_test = pickle.load(inp)
        x_valid = pickle.load(inp)
        y_valid = pickle.load(inp)

    data_train = BatchGenerator(x_train, y_train, shuffle=True)
    data_valid = BatchGenerator(x_valid, y_valid, shuffle=False)
    data_test = BatchGenerator(x_test, y_test, shuffle=False)
    epochs = 31
    batch_size = 32

    config = {}
    config["lr"] = 0.001
    config["embedding_dim"] = 100
    config["sen_len"] = len(x_train[0])
    config["batch_size"] = batch_size
    config["embedding_size"] = len(word2id) + 1
    config["tag_size"] = len(tag2id)
    config["pretrained"] = False

    embedding_pre = []
    # Train the model.
    print("begin to train...")
    model = Model(config, embedding_pre, dropout_keep=0.5)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # Note: this calls a separately defined training-loop helper that
        # shadows this function's name in the original source.
        train(model, sess, saver, epochs, batch_size, data_train, data_valid,
              id2word, id2tag)
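
Every snippet on this page relies on the same `BatchGenerator` contract: a constructor taking parallel `x`/`y` arrays plus a `shuffle` flag, a `next_batch(batch_size)` method, and `num_examples`, `x`, and `y` attributes. Below is a minimal sketch of a class matching that interface, assuming plain NumPy arrays; the project's real implementation is not shown on this page and may differ.

import numpy as np


class BatchGenerator:
    """Sketch of the interface used throughout these examples:
    .x, .y, .num_examples, and next_batch(batch_size)."""

    def __init__(self, x, y, shuffle=False):
        self.x = np.asarray(x)
        self.y = np.asarray(y)
        self.num_examples = self.x.shape[0]
        self._shuffle = shuffle
        self._cursor = 0
        if shuffle:
            self._permute()

    def _permute(self):
        # Reorder x and y with the same random permutation.
        perm = np.random.permutation(self.num_examples)
        self.x, self.y = self.x[perm], self.y[perm]

    def next_batch(self, batch_size):
        # Wrap around (and optionally reshuffle) at the end of an epoch.
        if self._cursor + batch_size > self.num_examples:
            if self._shuffle:
                self._permute()
            self._cursor = 0
        start = self._cursor
        self._cursor += batch_size
        return self.x[start:self._cursor], self.y[start:self._cursor]
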
Example #2
import pickle
# BatchGenerator is defined elsewhere in this project.

with open('../data/renmindata.pkl', 'rb') as inp:
    word2id = pickle.load(inp)
    id2word = pickle.load(inp)
    tag2id = pickle.load(inp)
    id2tag = pickle.load(inp)
    x_train = pickle.load(inp)
    y_train = pickle.load(inp)
    x_test = pickle.load(inp)
    y_test = pickle.load(inp)
    x_valid = pickle.load(inp)
    y_valid = pickle.load(inp)
print "train len:", len(x_train)
print "test len:", len(x_test)
print "word2id len", len(word2id)
print 'Creating the data generator ...'
data_train = BatchGenerator(x_train, y_train, shuffle=True)
data_valid = BatchGenerator(x_valid, y_valid, shuffle=False)
data_test = BatchGenerator(x_test, y_test, shuffle=False)
print('Finished creating the data generator.')

epochs = 31
batch_size = 32

config = {}
config["lr"] = 0.001
config["embedding_dim"] = 100
config["sen_len"] = len(x_train[0])
config["batch_size"] = batch_size
config["embedding_size"] = len(word2id) + 1
config["tag_size"] = len(tag2id)
config["pretrained"] = False
Example #3
def max_index(s):
    """Return the index of the largest element of s."""
    index_max = 0
    for ii in range(len(s)):
        if s[ii] > s[index_max]:
            index_max = ii
    return index_max


if __name__ == '__main__':
    # DataLoad, BatchGenerator, and Model are defined elsewhere in this project.
    data = DataLoad()
    length_words, length_tags = data.words
    max_len = data.max_len
    x_train, y_train = data.train
    x_test, y_test = data.test
    id2tag, id2word = data.id2tag, data.id2word
    id2 = (id2word, id2tag)
    # Build the batch generators over the input data.
    print('Creating the data generator ...')
    data_train = BatchGenerator(x_train, y_train, shuffle=True)
    print(data_train.num_examples)
    a, b = data_train.next_batch(10)
    data_test = BatchGenerator(x_test, y_test, shuffle=False)
    print('Finished creating the data generator.')

    # Build the model and run it on a sample clinical sentence:
    # "Middle-aged male, 48; chief complaint: abdominal pain and
    # bloating for 3 days."
    test = u'中年男性,48岁,主因:腹痛、腹胀3天。'
    ids = data.test2ids(test)
    model = Model(id2, length_words, length_tags, data_train, data_test)
    result = []
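
As an aside, the `max_index` helper above duplicates what `numpy.argmax` already provides; with NumPy in scope the two agree:

import numpy as np

scores = [0.1, 0.7, 0.2]
assert max_index(scores) == int(np.argmax(scores))  # both return 1
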
Example #4
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
# BiLSTM_CRF, BatchGenerator, the eval() metric helper, and the dataset
# objects (x_train, y_train, x_test, y_test, word2id, tag2id, id2tag,
# EMBEDDING_DIM, HIDDEN_DIM, EPOCHS) come from elsewhere in this project.


def main(args):
    if args.train:
        # Train mode: set up the data generators.
        batch_size = 32
        batch_nums = (int(len(x_train) / batch_size),
                      int(len(x_test) / batch_size))
        data_train = BatchGenerator(x_train, y_train, shuffle=True)
        data_test = BatchGenerator(x_test, y_test, shuffle=False)
        # model
        model = BiLSTM_CRF(
            len(word2id) + 1, tag2id, EMBEDDING_DIM, HIDDEN_DIM, batch_size)
        # optimizer
        optimizer = optim.SGD(model.parameters(), lr=0.005, weight_decay=1e-4)
        for epoch in range(EPOCHS):
            for batch in range(batch_nums[0]):
                batch += 1
                sentence, tags = data_train.next_batch(batch_size)
                model.zero_grad()

                sentence = torch.tensor(sentence, dtype=torch.long)
                tags = torch.tensor(tags, dtype=torch.long)

                loss = model.neg_log_likelihood(sentence, tags)
                loss.mean().backward()

                optimizer.step()
                if batch % 100 == 0:
                    score, predict = model(sentence)
                    print(
                        "epoch: {}, batch {} / bath_nums {}, loss: {}".format(
                            epoch, batch, batch_nums[0], loss.mean()))

                    res = eval(sentence, predict, tags)
                    print("accuracy: {} recall: {} f1: {}".format(
                        res[0], res[1], res[2]))
            for batch in range(batch_nums[1]):
                sentence, tags = data_test.next_batch(batch_size)
                sentence = torch.tensor(sentence, dtype=torch.long)
                score, predict = model(sentence)
                res = eval(sentence, predict, tags)
                if batch % 100 == 0:
                    print("accuracy: {} recall: {} f1: {}".format(
                        res[0], res[1], res[2]))

            path_name = "./model/model" + str(epoch) + ".pkl"
            print(path_name)
            torch.save(model.state_dict(), path_name)
            print("model has been saved")
    elif args.test:
        sents = input("input:")
        max_len = 60

        def X_padding(words):
            # word2id is assumed to be a pandas Series, so indexing it with
            # a list of characters returns all of their ids at once.
            ids = list(word2id[words])
            if len(ids) >= max_len:
                return ids[:max_len]
            ids.extend([0] * (max_len - len(ids)))
            return ids

        sentence = [[i for i in sents]]
        df_data = pd.DataFrame({'words': sentence},
                               index=list(range(len(sentence))))

        df_data['x'] = df_data['words'].apply(X_padding)
        x = np.asarray(list(df_data['x'].values))
        x = torch.tensor(x, dtype=torch.long)
        model = BiLSTM_CRF(
            len(word2id) + 1, tag2id, EMBEDDING_DIM, HIDDEN_DIM, 1)
        model.load_state_dict(
            torch.load("model/model99.pkl", map_location="cpu"))
        score, predict = model(x)
        predict = predict.permute(1, 0)

        print(id2tag)
        for i in range(len(predict)):
            print([
                sentence[i][j] + "/" + id2tag[predict[i][j].item()]
                for j in range(len(sentence[i]))
            ])
    else:
        print("please choose right mode!")
Example #5
import pickle
# BatchGenerator is defined elsewhere in this project.

with open('../data/Bosondata.pkl', 'rb') as inp:
    word2id = pickle.load(inp)
    id2word = pickle.load(inp)
    tag2id = pickle.load(inp)
    id2tag = pickle.load(inp)
    x_train = pickle.load(inp)
    y_train = pickle.load(inp)
    x_test = pickle.load(inp)
    y_test = pickle.load(inp)
    x_valid = pickle.load(inp)
    y_valid = pickle.load(inp)
print "train len:", len(x_train)
print "test len:", len(x_test)
print "valid len", len(x_valid)
print 'Creating the data generator ...'
data_train = BatchGenerator(x_train, y_train, shuffle=True)
data_valid = BatchGenerator(x_valid, y_valid, shuffle=False)
data_test = BatchGenerator(x_test, y_test, shuffle=False)
print('Finished creating the data generator.')

# Bidirectional LSTM + CRF model.
learning_rate = 0.005
training_epochs = 10
input_size = 1
batch_size = 16
embedding_size = 100
display_num = 5  # number of progress reports per epoch
batch_num = int(data_train.y.shape[0] / batch_size)  # batches per training epoch
batch_num_test = int(data_test.y.shape[0] /
                     batch_size)  # batches per test epoch
display_batch = int(batch_num / display_num)  # report once every display_batch batches
Example #6
import sys
import numpy as np
import tensorflow as tf  # TF 1.x API
# Model and BatchGenerator are defined elsewhere in this project; config,
# x_train, y_train, x_test, y_test, word2id, and tag2id are prepared by
# earlier (elided) lines of the original file.

config['SEN_LEN'] = len(x_train[0])
config['EMBEDDING_SIZE'] = len(word2id) + 1
config['TAG_SIZE'] = len(tag2id) + 1
config['BATCH_SIZE'] = 32
if len(sys.argv) == 2 and sys.argv[1] == 'train':
    model = Model()
    model.init(config)
    model.build_net()
    EPOCH = 20
    BATCH_SIZE = config['BATCH_SIZE']
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=1)

    train_data = BatchGenerator(x_train, y_train, shuffle=True)
    test_data = BatchGenerator(x_test, y_test, shuffle=False)

    for epoch in range(EPOCH):
        num_train_batch_per_epoch = int(train_data.x.shape[0] / BATCH_SIZE)
        num_test_batch_per_epoch = int(test_data.x.shape[0] / BATCH_SIZE)
        max_F1 = 0
        for batch in range(num_train_batch_per_epoch):
            batch_x, batch_y = train_data.next_batch(BATCH_SIZE)
            loss_, pred, _ = sess.run(
                [model.loss, model.viterbi_sequence, model.train_op],
                feed_dict={model.tf_x: batch_x, model.tf_y: batch_y,
                           model.keep_prob: 0.5})
            if batch % 200 == 0:
                # Token-level accuracy over the whole padded batch.
                accuracy = np.equal(pred, batch_y).astype(np.int32)
                accuracy_rate = np.mean(accuracy)
                print('epoch:', epoch, '| batch:', batch,
                      '| loss: %.4f' % loss_,
                      '| accuracy_rate: %.4f' % accuracy_rate)
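
The accuracy printed above counts padded positions, which inflates the score whenever sentences are shorter than `SEN_LEN`. A sketch of a masked variant, assuming id 0 marks padding (as in the other examples on this page, where sequences are padded with zeros):

import numpy as np


def masked_accuracy(pred, gold, pad_id=0):
    # Only score positions where the gold tag is not padding.
    mask = np.not_equal(gold, pad_id)
    correct = np.logical_and(np.equal(pred, gold), mask)
    return correct.sum() / max(mask.sum(), 1)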